From: Bruce Richardson Date: Wed, 6 Mar 2019 16:22:38 +0000 (+0000) Subject: eal/linux: rename linuxapp to linux X-Git-Url: http://git.droids-corp.org/?a=commitdiff_plain;h=91d7846ce68d7c637cd7efde751f5276475aa9c8;p=dpdk.git eal/linux: rename linuxapp to linux The term "linuxapp" is a legacy one, but just calling the subdirectory "linux" is just clearer for all concerned. Signed-off-by: Bruce Richardson --- diff --git a/MAINTAINERS b/MAINTAINERS index d1e624c9a3..452b8eb828 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -173,7 +173,7 @@ F: lib/librte_eal/common/*malloc* F: lib/librte_eal/common/eal_common_fbarray.c F: lib/librte_eal/common/eal_common_mem* F: lib/librte_eal/common/eal_hugepages.h -F: lib/librte_eal/linuxapp/eal/eal_mem* +F: lib/librte_eal/linux/eal/eal_mem* F: lib/librte_eal/freebsd/eal/eal_mem* F: doc/guides/prog_guide/env_abstraction_layer.rst F: app/test/test_external_mem.c @@ -244,8 +244,8 @@ F: lib/librte_eal/common/arch/x86/ F: lib/librte_eal/common/include/arch/x86/ Linux EAL (with overlaps) -F: lib/librte_eal/linuxapp/Makefile -F: lib/librte_eal/linuxapp/eal/ +F: lib/librte_eal/linux/Makefile +F: lib/librte_eal/linux/eal/ F: doc/guides/linux_gsg/ Linux UIO @@ -255,7 +255,7 @@ F: drivers/bus/pci/linux/*uio* Linux VFIO M: Anatoly Burakov -F: lib/librte_eal/linuxapp/eal/*vfio* +F: lib/librte_eal/linux/eal/*vfio* F: drivers/bus/pci/linux/*vfio* FreeBSD EAL (with overlaps) diff --git a/devtools/build-tags.sh b/devtools/build-tags.sh index 3a98e9b06e..a10a38fa9b 100755 --- a/devtools/build-tags.sh +++ b/devtools/build-tags.sh @@ -67,7 +67,7 @@ common_sources() linux_sources() { - find_sources "lib/librte_eal/linuxapp" '*.[chS]' + find_sources "lib/librte_eal/linux" '*.[chS]' } bsd_sources() diff --git a/doc/guides/prog_guide/env_abstraction_layer.rst b/doc/guides/prog_guide/env_abstraction_layer.rst index 73436b0eb8..e1d80c02a2 100644 --- a/doc/guides/prog_guide/env_abstraction_layer.rst +++ b/doc/guides/prog_guide/env_abstraction_layer.rst @@ 
-346,7 +346,7 @@ To ease the idle polling with tiny throughput, it's useful to pause the polling The RX interrupt is the first choice to be such kind of wake-up event, but probably won't be the only one. EAL provides the event APIs for this event-driven thread mode. -Taking linuxapp as an example, the implementation relies on epoll. Each thread can monitor an epoll instance +Taking Linux as an example, the implementation relies on epoll. Each thread can monitor an epoll instance in which all the wake-up events' file descriptors are added. The event file descriptors are created and mapped to the interrupt vectors according to the UIO/VFIO spec. From FreeBSD's perspective, kqueue is the alternative way, but not implemented yet. diff --git a/drivers/bus/dpaa/Makefile b/drivers/bus/dpaa/Makefile index 800e5cd20d..248c024eba 100644 --- a/drivers/bus/dpaa/Makefile +++ b/drivers/bus/dpaa/Makefile @@ -17,7 +17,7 @@ CFLAGS += -Wno-cast-qual CFLAGS += -I$(RTE_BUS_DPAA)/ CFLAGS += -I$(RTE_BUS_DPAA)/include CFLAGS += -I$(RTE_BUS_DPAA)/base/qbman -CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linuxapp/eal +CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linux/eal CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common/include # versioning export map diff --git a/drivers/crypto/caam_jr/Makefile b/drivers/crypto/caam_jr/Makefile index 88cdf74108..b078453d7f 100644 --- a/drivers/crypto/caam_jr/Makefile +++ b/drivers/crypto/caam_jr/Makefile @@ -21,7 +21,7 @@ CFLAGS += -I$(RTE_SDK)/drivers/crypto/caam_jr #sharing the hw flib headers from dpaa2_sec pmd CFLAGS += -I$(RTE_SDK)/drivers/crypto/dpaa2_sec/ CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common/include -CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linuxapp/eal +CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linux/eal # versioning export map EXPORT_MAP := rte_pmd_caam_jr_version.map diff --git a/drivers/crypto/dpaa2_sec/Makefile b/drivers/crypto/dpaa2_sec/Makefile index f537f76a6f..63dbe85278 100644 --- a/drivers/crypto/dpaa2_sec/Makefile +++ b/drivers/crypto/dpaa2_sec/Makefile 
@@ -27,7 +27,7 @@ CFLAGS += -I$(RTE_SDK)/drivers/bus/fslmc/qbman/include CFLAGS += -I$(RTE_SDK)/drivers/bus/fslmc/mc CFLAGS += -I$(RTE_SDK)/drivers/bus/fslmc/portal CFLAGS += -I$(RTE_SDK)/drivers/mempool/dpaa2/ -CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linuxapp/eal +CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linux/eal # versioning export map EXPORT_MAP := rte_pmd_dpaa2_sec_version.map diff --git a/drivers/crypto/dpaa_sec/Makefile b/drivers/crypto/dpaa_sec/Makefile index 5ce95c23fd..aa214c032a 100644 --- a/drivers/crypto/dpaa_sec/Makefile +++ b/drivers/crypto/dpaa_sec/Makefile @@ -20,7 +20,7 @@ CFLAGS += -I$(RTE_SDK)/drivers/crypto/dpaa_sec/ #sharing the hw flib headers from dpaa2_sec pmd CFLAGS += -I$(RTE_SDK)/drivers/crypto/dpaa2_sec/ CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common/include -CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linuxapp/eal +CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linux/eal LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring LDLIBS += -lrte_cryptodev diff --git a/drivers/event/dpaa/Makefile b/drivers/event/dpaa/Makefile index 6f93e7f406..9b3d6bac4c 100644 --- a/drivers/event/dpaa/Makefile +++ b/drivers/event/dpaa/Makefile @@ -20,7 +20,7 @@ CFLAGS += -I$(RTE_SDK)/drivers/bus/dpaa CFLAGS += -I$(RTE_SDK)/drivers/bus/dpaa/include/ CFLAGS += -I$(RTE_SDK)/drivers/mempool/dpaa CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common/include -CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linuxapp/eal/include +CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linux/eal/include EXPORT_MAP := rte_pmd_dpaa_event_version.map diff --git a/drivers/event/dpaa2/Makefile b/drivers/event/dpaa2/Makefile index e0134cc460..e245682cf8 100644 --- a/drivers/event/dpaa2/Makefile +++ b/drivers/event/dpaa2/Makefile @@ -17,7 +17,7 @@ CFLAGS += -I$(RTE_SDK)/drivers/bus/fslmc/mc CFLAGS += -I$(RTE_SDK)/drivers/bus/fslmc/portal CFLAGS += -I$(RTE_SDK)/drivers/mempool/dpaa2 CFLAGS += -I$(RTE_SDK)/drivers/event/dpaa2 -CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linuxapp/eal +CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linux/eal LDLIBS 
+= -lrte_eal -lrte_eventdev LDLIBS += -lrte_bus_fslmc -lrte_mempool_dpaa2 -lrte_pmd_dpaa2 LDLIBS += -lrte_bus_vdev diff --git a/drivers/mempool/dpaa2/Makefile b/drivers/mempool/dpaa2/Makefile index 96c0f2b632..5f3e4eae94 100644 --- a/drivers/mempool/dpaa2/Makefile +++ b/drivers/mempool/dpaa2/Makefile @@ -13,7 +13,7 @@ CFLAGS += -O3 CFLAGS += $(WERROR_FLAGS) CFLAGS += -I$(RTE_SDK)/drivers/bus/fslmc CFLAGS += -I$(RTE_SDK)/drivers/bus/fslmc/qbman/include -CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linuxapp/eal +CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linux/eal # versioning export map EXPORT_MAP := rte_mempool_dpaa2_version.map diff --git a/drivers/net/dpaa/Makefile b/drivers/net/dpaa/Makefile index 1c4f7d914c..5b8e7f8da5 100644 --- a/drivers/net/dpaa/Makefile +++ b/drivers/net/dpaa/Makefile @@ -21,7 +21,7 @@ CFLAGS += -I$(RTE_SDK)/drivers/bus/dpaa/base/qbman CFLAGS += -I$(RTE_SDK)/drivers/mempool/dpaa CFLAGS += -I$(RTE_SDK)/drivers/event/dpaa CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common/include -CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linuxapp/eal/include +CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linux/eal/include EXPORT_MAP := rte_pmd_dpaa_version.map diff --git a/drivers/net/dpaa2/Makefile b/drivers/net/dpaa2/Makefile index 8bd269bfab..947fb985a9 100644 --- a/drivers/net/dpaa2/Makefile +++ b/drivers/net/dpaa2/Makefile @@ -19,7 +19,7 @@ CFLAGS += -I$(RTE_SDK)/drivers/bus/fslmc/qbman/include CFLAGS += -I$(RTE_SDK)/drivers/bus/fslmc/mc CFLAGS += -I$(RTE_SDK)/drivers/bus/fslmc/portal CFLAGS += -I$(RTE_SDK)/drivers/mempool/dpaa2 -CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linuxapp/eal +CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linux/eal # versioning export map EXPORT_MAP := rte_pmd_dpaa2_version.map diff --git a/drivers/raw/dpaa2_qdma/Makefile b/drivers/raw/dpaa2_qdma/Makefile index bdd99c9768..5c75f5fa0c 100644 --- a/drivers/raw/dpaa2_qdma/Makefile +++ b/drivers/raw/dpaa2_qdma/Makefile @@ -12,7 +12,7 @@ CFLAGS += -DALLOW_EXPERIMENTAL_API CFLAGS += -O3 CFLAGS += $(WERROR_FLAGS) -CFLAGS += 
-I$(RTE_SDK)/lib/librte_eal/linuxapp/eal +CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linux/eal CFLAGS += -I$(RTE_SDK)/drivers/bus/fslmc CFLAGS += -I$(RTE_SDK)/drivers/bus/fslmc/qbman/include diff --git a/kernel/linux/kni/meson.build b/kernel/linux/kni/meson.build index a09af5aa5f..877ff5c474 100644 --- a/kernel/linux/kni/meson.build +++ b/kernel/linux/kni/meson.build @@ -21,7 +21,7 @@ custom_target('rte_kni', 'src=' + meson.current_source_dir(), 'MODULE_CFLAGS=-include ' + meson.source_root() + '/config/rte_config.h' + ' -I' + meson.source_root() + '/lib/librte_eal/common/include' + - ' -I' + meson.source_root() + '/lib/librte_eal/linuxapp/eal/include' + + ' -I' + meson.source_root() + '/lib/librte_eal/linux/eal/include' + ' -I' + meson.build_root() + ' -I' + meson.current_source_dir() + ' -I' + meson.current_source_dir() + '/ethtool/ixgbe' + diff --git a/lib/librte_eal/Makefile b/lib/librte_eal/Makefile index 39d64bb7aa..c6bd39f023 100644 --- a/lib/librte_eal/Makefile +++ b/lib/librte_eal/Makefile @@ -4,8 +4,8 @@ include $(RTE_SDK)/mk/rte.vars.mk DIRS-y += common -DIRS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += linuxapp -DEPDIRS-linuxapp := common +DIRS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += linux +DEPDIRS-linux := common DIRS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += freebsd DEPDIRS-freebsd := common diff --git a/lib/librte_eal/linux/Makefile b/lib/librte_eal/linux/Makefile new file mode 100644 index 0000000000..a0fffa98e8 --- /dev/null +++ b/lib/librte_eal/linux/Makefile @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2014 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +DIRS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal +DEPDIRS-kni := eal + +CFLAGS += -DALLOW_EXPERIMENTAL_API + +include $(RTE_SDK)/mk/rte.subdir.mk diff --git a/lib/librte_eal/linux/eal/Makefile b/lib/librte_eal/linux/eal/Makefile new file mode 100644 index 0000000000..51deb57974 --- /dev/null +++ b/lib/librte_eal/linux/eal/Makefile @@ -0,0 +1,101 @@ +# SPDX-License-Identifier: 
BSD-3-Clause +# Copyright(c) 2010-2016 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +LIB = librte_eal.a + +ARCH_DIR ?= $(RTE_ARCH) + +EXPORT_MAP := ../../rte_eal_version.map +VPATH += $(RTE_SDK)/lib/librte_eal/common/arch/$(ARCH_DIR) + +LIBABIVER := 9 + +VPATH += $(RTE_SDK)/lib/librte_eal/common + +CFLAGS += -DALLOW_EXPERIMENTAL_API +CFLAGS += -I$(SRCDIR)/include +CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common +CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common/include +CFLAGS += $(WERROR_FLAGS) -O3 + +LDLIBS += -ldl +LDLIBS += -lpthread +LDLIBS += -lgcc_s +LDLIBS += -lrt +LDLIBS += -lrte_kvargs +ifeq ($(CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES),y) +LDLIBS += -lnuma +endif + +# specific to linuxapp exec-env +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) := eal.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_cpuflags.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_hugepage_info.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_memory.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_thread.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_log.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_vfio.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_vfio_mp_sync.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_memalloc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_debug.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_lcore.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_timer.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_interrupts.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_alarm.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_dev.c + +# from common dir +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_lcore.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_timer.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memzone.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_log.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_launch.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memalloc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memory.c 
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_tailqs.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_errno.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_cpuflags.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_hypervisor.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_string_fns.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_hexdump.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_devargs.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_class.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_bus.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_dev.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_options.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_thread.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_proc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_fbarray.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_uuid.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_malloc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += hotplug_mp.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_elem.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_heap.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_mp.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_keepalive.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_option.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_service.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_reciprocal.c + +# from arch dir +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_cpuflags.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_hypervisor.c +SRCS-$(CONFIG_RTE_ARCH_X86) += rte_spinlock.c +SRCS-y += rte_cycles.c + +CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST) + +# workaround for a gcc bug with noreturn attribute +# http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603 +ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y) +CFLAGS_eal_thread.o += -Wno-return-type +endif + +INC := rte_kni_common.h + +SYMLINK-$(CONFIG_RTE_EXEC_ENV_LINUXAPP)-include/exec-env := \ + $(addprefix include/exec-env/,$(INC)) + 
+include $(RTE_SDK)/mk/rte.lib.mk diff --git a/lib/librte_eal/linux/eal/eal.c b/lib/librte_eal/linux/eal/eal.c new file mode 100644 index 0000000000..13f4016841 --- /dev/null +++ b/lib/librte_eal/linux/eal/eal.c @@ -0,0 +1,1336 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation. + * Copyright(c) 2012-2014 6WIND S.A. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(RTE_ARCH_X86) +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_thread.h" +#include "eal_internal_cfg.h" +#include "eal_filesystem.h" +#include "eal_hugepages.h" +#include "eal_options.h" +#include "eal_vfio.h" + +#define MEMSIZE_IF_NO_HUGE_PAGE (64ULL * 1024ULL * 1024ULL) + +#define SOCKET_MEM_STRLEN (RTE_MAX_NUMA_NODES * 10) + +/* Allow the application to print its usage message too if set */ +static rte_usage_hook_t rte_application_usage_hook = NULL; + +/* early configuration structure, when memory config is not mmapped */ +static struct rte_mem_config early_mem_config; + +/* define fd variable here, because file needs to be kept open for the + * duration of the program, as we hold a write lock on it in the primary proc */ +static int mem_cfg_fd = -1; + +static struct flock wr_lock = { + .l_type = F_WRLCK, + .l_whence = SEEK_SET, + .l_start = offsetof(struct rte_mem_config, memsegs), + .l_len = sizeof(early_mem_config.memsegs), +}; + +/* Address of global and public configuration */ +static struct rte_config rte_config = { + .mem_config = &early_mem_config, +}; + +/* internal configuration (per-core) */ +struct lcore_config 
lcore_config[RTE_MAX_LCORE]; + +/* internal configuration */ +struct internal_config internal_config; + +/* used by rte_rdtsc() */ +int rte_cycles_vmware_tsc_map; + +/* platform-specific runtime dir */ +static char runtime_dir[PATH_MAX]; + +static const char *default_runtime_dir = "/var/run"; + +int +eal_create_runtime_dir(void) +{ + const char *directory = default_runtime_dir; + const char *xdg_runtime_dir = getenv("XDG_RUNTIME_DIR"); + const char *fallback = "/tmp"; + char tmp[PATH_MAX]; + int ret; + + if (getuid() != 0) { + /* try XDG path first, fall back to /tmp */ + if (xdg_runtime_dir != NULL) + directory = xdg_runtime_dir; + else + directory = fallback; + } + /* create DPDK subdirectory under runtime dir */ + ret = snprintf(tmp, sizeof(tmp), "%s/dpdk", directory); + if (ret < 0 || ret == sizeof(tmp)) { + RTE_LOG(ERR, EAL, "Error creating DPDK runtime path name\n"); + return -1; + } + + /* create prefix-specific subdirectory under DPDK runtime dir */ + ret = snprintf(runtime_dir, sizeof(runtime_dir), "%s/%s", + tmp, eal_get_hugefile_prefix()); + if (ret < 0 || ret == sizeof(runtime_dir)) { + RTE_LOG(ERR, EAL, "Error creating prefix-specific runtime path name\n"); + return -1; + } + + /* create the path if it doesn't exist. no "mkdir -p" here, so do it + * step by step. 
+ */ + ret = mkdir(tmp, 0700); + if (ret < 0 && errno != EEXIST) { + RTE_LOG(ERR, EAL, "Error creating '%s': %s\n", + tmp, strerror(errno)); + return -1; + } + + ret = mkdir(runtime_dir, 0700); + if (ret < 0 && errno != EEXIST) { + RTE_LOG(ERR, EAL, "Error creating '%s': %s\n", + runtime_dir, strerror(errno)); + return -1; + } + + return 0; +} + +int +eal_clean_runtime_dir(void) +{ + DIR *dir; + struct dirent *dirent; + int dir_fd, fd, lck_result; + static const char * const filters[] = { + "fbarray_*", + "mp_socket_*" + }; + + /* open directory */ + dir = opendir(runtime_dir); + if (!dir) { + RTE_LOG(ERR, EAL, "Unable to open runtime directory %s\n", + runtime_dir); + goto error; + } + dir_fd = dirfd(dir); + + /* lock the directory before doing anything, to avoid races */ + if (flock(dir_fd, LOCK_EX) < 0) { + RTE_LOG(ERR, EAL, "Unable to lock runtime directory %s\n", + runtime_dir); + goto error; + } + + dirent = readdir(dir); + if (!dirent) { + RTE_LOG(ERR, EAL, "Unable to read runtime directory %s\n", + runtime_dir); + goto error; + } + + while (dirent != NULL) { + unsigned int f_idx; + bool skip = true; + + /* skip files that don't match the patterns */ + for (f_idx = 0; f_idx < RTE_DIM(filters); f_idx++) { + const char *filter = filters[f_idx]; + + if (fnmatch(filter, dirent->d_name, 0) == 0) { + skip = false; + break; + } + } + if (skip) { + dirent = readdir(dir); + continue; + } + + /* try and lock the file */ + fd = openat(dir_fd, dirent->d_name, O_RDONLY); + + /* skip to next file */ + if (fd == -1) { + dirent = readdir(dir); + continue; + } + + /* non-blocking lock */ + lck_result = flock(fd, LOCK_EX | LOCK_NB); + + /* if lock succeeds, remove the file */ + if (lck_result != -1) + unlinkat(dir_fd, dirent->d_name, 0); + close(fd); + dirent = readdir(dir); + } + + /* closedir closes dir_fd and drops the lock */ + closedir(dir); + return 0; + +error: + if (dir) + closedir(dir); + + RTE_LOG(ERR, EAL, "Error while clearing runtime dir: %s\n", + 
strerror(errno)); + + return -1; +} + +const char * +rte_eal_get_runtime_dir(void) +{ + return runtime_dir; +} + +/* Return user provided mbuf pool ops name */ +const char * +rte_eal_mbuf_user_pool_ops(void) +{ + return internal_config.user_mbuf_pool_ops_name; +} + +/* Return a pointer to the configuration structure */ +struct rte_config * +rte_eal_get_configuration(void) +{ + return &rte_config; +} + +enum rte_iova_mode +rte_eal_iova_mode(void) +{ + return rte_eal_get_configuration()->iova_mode; +} + +/* parse a sysfs (or other) file containing one integer value */ +int +eal_parse_sysfs_value(const char *filename, unsigned long *val) +{ + FILE *f; + char buf[BUFSIZ]; + char *end = NULL; + + if ((f = fopen(filename, "r")) == NULL) { + RTE_LOG(ERR, EAL, "%s(): cannot open sysfs value %s\n", + __func__, filename); + return -1; + } + + if (fgets(buf, sizeof(buf), f) == NULL) { + RTE_LOG(ERR, EAL, "%s(): cannot read sysfs value %s\n", + __func__, filename); + fclose(f); + return -1; + } + *val = strtoul(buf, &end, 0); + if ((buf[0] == '\0') || (end == NULL) || (*end != '\n')) { + RTE_LOG(ERR, EAL, "%s(): cannot parse sysfs value %s\n", + __func__, filename); + fclose(f); + return -1; + } + fclose(f); + return 0; +} + + +/* create memory configuration in shared/mmap memory. Take out + * a write lock on the memsegs, so we can auto-detect primary/secondary. + * This means we never close the file while running (auto-close on exit). + * We also don't lock the whole file, so that in future we can use read-locks + * on other parts, e.g. memzones, to detect if there are running secondary + * processes. 
*/ +static void +rte_eal_config_create(void) +{ + void *rte_mem_cfg_addr; + int retval; + + const char *pathname = eal_runtime_config_path(); + + if (internal_config.no_shconf) + return; + + /* map the config before hugepage address so that we don't waste a page */ + if (internal_config.base_virtaddr != 0) + rte_mem_cfg_addr = (void *) + RTE_ALIGN_FLOOR(internal_config.base_virtaddr - + sizeof(struct rte_mem_config), sysconf(_SC_PAGE_SIZE)); + else + rte_mem_cfg_addr = NULL; + + if (mem_cfg_fd < 0){ + mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0660); + if (mem_cfg_fd < 0) + rte_panic("Cannot open '%s' for rte_mem_config\n", pathname); + } + + retval = ftruncate(mem_cfg_fd, sizeof(*rte_config.mem_config)); + if (retval < 0){ + close(mem_cfg_fd); + rte_panic("Cannot resize '%s' for rte_mem_config\n", pathname); + } + + retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock); + if (retval < 0){ + close(mem_cfg_fd); + rte_exit(EXIT_FAILURE, "Cannot create lock on '%s'. Is another primary " + "process running?\n", pathname); + } + + rte_mem_cfg_addr = mmap(rte_mem_cfg_addr, sizeof(*rte_config.mem_config), + PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0); + + if (rte_mem_cfg_addr == MAP_FAILED){ + rte_panic("Cannot mmap memory for rte_config\n"); + } + memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config)); + rte_config.mem_config = rte_mem_cfg_addr; + + /* store address of the config in the config itself so that secondary + * processes could later map the config into this exact location */ + rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr; + + rte_config.mem_config->dma_maskbits = 0; + +} + +/* attach to an existing shared memory config */ +static void +rte_eal_config_attach(void) +{ + struct rte_mem_config *mem_config; + + const char *pathname = eal_runtime_config_path(); + + if (internal_config.no_shconf) + return; + + if (mem_cfg_fd < 0){ + mem_cfg_fd = open(pathname, O_RDWR); + if (mem_cfg_fd < 0) + rte_panic("Cannot open '%s' for 
rte_mem_config\n", pathname); + } + + /* map it as read-only first */ + mem_config = (struct rte_mem_config *) mmap(NULL, sizeof(*mem_config), + PROT_READ, MAP_SHARED, mem_cfg_fd, 0); + if (mem_config == MAP_FAILED) + rte_panic("Cannot mmap memory for rte_config! error %i (%s)\n", + errno, strerror(errno)); + + rte_config.mem_config = mem_config; +} + +/* reattach the shared config at exact memory location primary process has it */ +static void +rte_eal_config_reattach(void) +{ + struct rte_mem_config *mem_config; + void *rte_mem_cfg_addr; + + if (internal_config.no_shconf) + return; + + /* save the address primary process has mapped shared config to */ + rte_mem_cfg_addr = (void *) (uintptr_t) rte_config.mem_config->mem_cfg_addr; + + /* unmap original config */ + munmap(rte_config.mem_config, sizeof(struct rte_mem_config)); + + /* remap the config at proper address */ + mem_config = (struct rte_mem_config *) mmap(rte_mem_cfg_addr, + sizeof(*mem_config), PROT_READ | PROT_WRITE, MAP_SHARED, + mem_cfg_fd, 0); + if (mem_config == MAP_FAILED || mem_config != rte_mem_cfg_addr) { + if (mem_config != MAP_FAILED) + /* errno is stale, don't use */ + rte_panic("Cannot mmap memory for rte_config at [%p], got [%p]" + " - please use '--base-virtaddr' option\n", + rte_mem_cfg_addr, mem_config); + else + rte_panic("Cannot mmap memory for rte_config! error %i (%s)\n", + errno, strerror(errno)); + } + close(mem_cfg_fd); + + rte_config.mem_config = mem_config; +} + +/* Detect if we are a primary or a secondary process */ +enum rte_proc_type_t +eal_proc_type_detect(void) +{ + enum rte_proc_type_t ptype = RTE_PROC_PRIMARY; + const char *pathname = eal_runtime_config_path(); + + /* if there no shared config, there can be no secondary processes */ + if (!internal_config.no_shconf) { + /* if we can open the file but not get a write-lock we are a + * secondary process. 
NOTE: if we get a file handle back, we + * keep that open and don't close it to prevent a race condition + * between multiple opens. + */ + if (((mem_cfg_fd = open(pathname, O_RDWR)) >= 0) && + (fcntl(mem_cfg_fd, F_SETLK, &wr_lock) < 0)) + ptype = RTE_PROC_SECONDARY; + } + + RTE_LOG(INFO, EAL, "Auto-detected process type: %s\n", + ptype == RTE_PROC_PRIMARY ? "PRIMARY" : "SECONDARY"); + + return ptype; +} + +/* copies data from internal config to shared config */ +static void +eal_update_mem_config(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + mcfg->legacy_mem = internal_config.legacy_mem; + mcfg->single_file_segments = internal_config.single_file_segments; +} + +/* copies data from shared config to internal config */ +static void +eal_update_internal_config(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + internal_config.legacy_mem = mcfg->legacy_mem; + internal_config.single_file_segments = mcfg->single_file_segments; +} + +/* Sets up rte_config structure with the pointer to shared memory config.*/ +static void +rte_config_init(void) +{ + rte_config.process_type = internal_config.process_type; + + switch (rte_config.process_type){ + case RTE_PROC_PRIMARY: + rte_eal_config_create(); + eal_update_mem_config(); + break; + case RTE_PROC_SECONDARY: + rte_eal_config_attach(); + rte_eal_mcfg_wait_complete(rte_config.mem_config); + rte_eal_config_reattach(); + eal_update_internal_config(); + break; + case RTE_PROC_AUTO: + case RTE_PROC_INVALID: + rte_panic("Invalid process type\n"); + } +} + +/* Unlocks hugepage directories that were locked by eal_hugepage_info_init */ +static void +eal_hugedirs_unlock(void) +{ + int i; + + for (i = 0; i < MAX_HUGEPAGE_SIZES; i++) + { + /* skip uninitialized */ + if (internal_config.hugepage_info[i].lock_descriptor < 0) + continue; + /* unlock hugepage file */ + flock(internal_config.hugepage_info[i].lock_descriptor, LOCK_UN); + 
close(internal_config.hugepage_info[i].lock_descriptor); + /* reset the field */ + internal_config.hugepage_info[i].lock_descriptor = -1; + } +} + +/* display usage */ +static void +eal_usage(const char *prgname) +{ + printf("\nUsage: %s ", prgname); + eal_common_usage(); + printf("EAL Linux options:\n" + " --"OPT_SOCKET_MEM" Memory to allocate on sockets (comma separated values)\n" + " --"OPT_SOCKET_LIMIT" Limit memory allocation on sockets (comma separated values)\n" + " --"OPT_HUGE_DIR" Directory where hugetlbfs is mounted\n" + " --"OPT_FILE_PREFIX" Prefix for hugepage filenames\n" + " --"OPT_BASE_VIRTADDR" Base virtual address\n" + " --"OPT_CREATE_UIO_DEV" Create /dev/uioX (usually done by hotplug)\n" + " --"OPT_VFIO_INTR" Interrupt mode for VFIO (legacy|msi|msix)\n" + " --"OPT_LEGACY_MEM" Legacy memory mode (no dynamic allocation, contiguous segments)\n" + " --"OPT_SINGLE_FILE_SEGMENTS" Put all hugepage memory in single files\n" + " --"OPT_MATCH_ALLOCATIONS" Free hugepages exactly as allocated\n" + "\n"); + /* Allow the application to print its usage message too if hook is set */ + if ( rte_application_usage_hook ) { + printf("===== Application Usage =====\n\n"); + rte_application_usage_hook(prgname); + } +} + +/* Set a per-application usage message */ +rte_usage_hook_t +rte_set_application_usage_hook( rte_usage_hook_t usage_func ) +{ + rte_usage_hook_t old_func; + + /* Will be NULL on the first call to denote the last usage routine. 
*/ + old_func = rte_application_usage_hook; + rte_application_usage_hook = usage_func; + + return old_func; +} + +static int +eal_parse_socket_arg(char *strval, volatile uint64_t *socket_arg) +{ + char * arg[RTE_MAX_NUMA_NODES]; + char *end; + int arg_num, i, len; + uint64_t total_mem = 0; + + len = strnlen(strval, SOCKET_MEM_STRLEN); + if (len == SOCKET_MEM_STRLEN) { + RTE_LOG(ERR, EAL, "--socket-mem is too long\n"); + return -1; + } + + /* all other error cases will be caught later */ + if (!isdigit(strval[len-1])) + return -1; + + /* split the optarg into separate socket values */ + arg_num = rte_strsplit(strval, len, + arg, RTE_MAX_NUMA_NODES, ','); + + /* if split failed, or 0 arguments */ + if (arg_num <= 0) + return -1; + + /* parse each defined socket option */ + errno = 0; + for (i = 0; i < arg_num; i++) { + uint64_t val; + end = NULL; + val = strtoull(arg[i], &end, 10); + + /* check for invalid input */ + if ((errno != 0) || + (arg[i][0] == '\0') || (end == NULL) || (*end != '\0')) + return -1; + val <<= 20; + total_mem += val; + socket_arg[i] = val; + } + + return 0; +} + +static int +eal_parse_base_virtaddr(const char *arg) +{ + char *end; + uint64_t addr; + + errno = 0; + addr = strtoull(arg, &end, 16); + + /* check for errors */ + if ((errno != 0) || (arg[0] == '\0') || end == NULL || (*end != '\0')) + return -1; + + /* make sure we don't exceed 32-bit boundary on 32-bit target */ +#ifndef RTE_ARCH_64 + if (addr >= UINTPTR_MAX) + return -1; +#endif + + /* align the addr on 16M boundary, 16MB is the minimum huge page + * size on IBM Power architecture. If the addr is aligned to 16MB, + * it can align to 2MB for x86. 
So this alignment can also be used + * on x86 */ + internal_config.base_virtaddr = + RTE_PTR_ALIGN_CEIL((uintptr_t)addr, (size_t)RTE_PGSIZE_16M); + + return 0; +} + +static int +eal_parse_vfio_intr(const char *mode) +{ + unsigned i; + static struct { + const char *name; + enum rte_intr_mode value; + } map[] = { + { "legacy", RTE_INTR_MODE_LEGACY }, + { "msi", RTE_INTR_MODE_MSI }, + { "msix", RTE_INTR_MODE_MSIX }, + }; + + for (i = 0; i < RTE_DIM(map); i++) { + if (!strcmp(mode, map[i].name)) { + internal_config.vfio_intr_mode = map[i].value; + return 0; + } + } + return -1; +} + +/* Parse the arguments for --log-level only */ +static void +eal_log_level_parse(int argc, char **argv) +{ + int opt; + char **argvopt; + int option_index; + const int old_optind = optind; + const int old_optopt = optopt; + char * const old_optarg = optarg; + + argvopt = argv; + optind = 1; + + while ((opt = getopt_long(argc, argvopt, eal_short_options, + eal_long_options, &option_index)) != EOF) { + + int ret; + + /* getopt is not happy, stop right now */ + if (opt == '?') + break; + + ret = (opt == OPT_LOG_LEVEL_NUM) ? 
+ eal_parse_common_option(opt, optarg, &internal_config) : 0; + + /* common parser is not happy */ + if (ret < 0) + break; + } + + /* restore getopt lib */ + optind = old_optind; + optopt = old_optopt; + optarg = old_optarg; +} + +/* Parse the argument given in the command line of the application */ +static int +eal_parse_args(int argc, char **argv) +{ + int opt, ret; + char **argvopt; + int option_index; + char *prgname = argv[0]; + const int old_optind = optind; + const int old_optopt = optopt; + char * const old_optarg = optarg; + + argvopt = argv; + optind = 1; + opterr = 0; + + while ((opt = getopt_long(argc, argvopt, eal_short_options, + eal_long_options, &option_index)) != EOF) { + + /* + * getopt didn't recognise the option, lets parse the + * registered options to see if the flag is valid + */ + if (opt == '?') { + ret = rte_option_parse(argv[optind-1]); + if (ret == 0) + continue; + + eal_usage(prgname); + ret = -1; + goto out; + } + + ret = eal_parse_common_option(opt, optarg, &internal_config); + /* common parser is not happy */ + if (ret < 0) { + eal_usage(prgname); + ret = -1; + goto out; + } + /* common parser handled this option */ + if (ret == 0) + continue; + + switch (opt) { + case 'h': + eal_usage(prgname); + exit(EXIT_SUCCESS); + + case OPT_HUGE_DIR_NUM: + { + char *hdir = strdup(optarg); + if (hdir == NULL) + RTE_LOG(ERR, EAL, "Could not store hugepage directory\n"); + else { + /* free old hugepage dir */ + if (internal_config.hugepage_dir != NULL) + free(internal_config.hugepage_dir); + internal_config.hugepage_dir = hdir; + } + break; + } + case OPT_FILE_PREFIX_NUM: + { + char *prefix = strdup(optarg); + if (prefix == NULL) + RTE_LOG(ERR, EAL, "Could not store file prefix\n"); + else { + /* free old prefix */ + if (internal_config.hugefile_prefix != NULL) + free(internal_config.hugefile_prefix); + internal_config.hugefile_prefix = prefix; + } + break; + } + case OPT_SOCKET_MEM_NUM: + if (eal_parse_socket_arg(optarg, + 
internal_config.socket_mem) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_SOCKET_MEM "\n"); + eal_usage(prgname); + ret = -1; + goto out; + } + internal_config.force_sockets = 1; + break; + + case OPT_SOCKET_LIMIT_NUM: + if (eal_parse_socket_arg(optarg, + internal_config.socket_limit) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_SOCKET_LIMIT "\n"); + eal_usage(prgname); + ret = -1; + goto out; + } + internal_config.force_socket_limits = 1; + break; + + case OPT_BASE_VIRTADDR_NUM: + if (eal_parse_base_virtaddr(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid parameter for --" + OPT_BASE_VIRTADDR "\n"); + eal_usage(prgname); + ret = -1; + goto out; + } + break; + + case OPT_VFIO_INTR_NUM: + if (eal_parse_vfio_intr(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_VFIO_INTR "\n"); + eal_usage(prgname); + ret = -1; + goto out; + } + break; + + case OPT_CREATE_UIO_DEV_NUM: + internal_config.create_uio_dev = 1; + break; + + case OPT_MBUF_POOL_OPS_NAME_NUM: + { + char *ops_name = strdup(optarg); + if (ops_name == NULL) + RTE_LOG(ERR, EAL, "Could not store mbuf pool ops name\n"); + else { + /* free old ops name */ + if (internal_config.user_mbuf_pool_ops_name != + NULL) + free(internal_config.user_mbuf_pool_ops_name); + + internal_config.user_mbuf_pool_ops_name = + ops_name; + } + break; + } + case OPT_MATCH_ALLOCATIONS_NUM: + internal_config.match_allocations = 1; + break; + + default: + if (opt < OPT_LONG_MIN_NUM && isprint(opt)) { + RTE_LOG(ERR, EAL, "Option %c is not supported " + "on Linux\n", opt); + } else if (opt >= OPT_LONG_MIN_NUM && + opt < OPT_LONG_MAX_NUM) { + RTE_LOG(ERR, EAL, "Option %s is not supported " + "on Linux\n", + eal_long_options[option_index].name); + } else { + RTE_LOG(ERR, EAL, "Option %d is not supported " + "on Linux\n", opt); + } + eal_usage(prgname); + ret = -1; + goto out; + } + } + + /* create runtime data directory */ + if (internal_config.no_shconf == 0 && + eal_create_runtime_dir() < 0) { + 
RTE_LOG(ERR, EAL, "Cannot create runtime directory\n"); + ret = -1; + goto out; + } + + if (eal_adjust_config(&internal_config) != 0) { + ret = -1; + goto out; + } + + /* sanity checks */ + if (eal_check_common_options(&internal_config) != 0) { + eal_usage(prgname); + ret = -1; + goto out; + } + + if (optind >= 0) + argv[optind-1] = prgname; + ret = optind-1; + +out: + /* restore getopt lib */ + optind = old_optind; + optopt = old_optopt; + optarg = old_optarg; + + return ret; +} + +static int +check_socket(const struct rte_memseg_list *msl, void *arg) +{ + int *socket_id = arg; + + if (msl->external) + return 0; + + return *socket_id == msl->socket_id; +} + +static void +eal_check_mem_on_local_socket(void) +{ + int socket_id; + + socket_id = rte_lcore_to_socket_id(rte_config.master_lcore); + + if (rte_memseg_list_walk(check_socket, &socket_id) == 0) + RTE_LOG(WARNING, EAL, "WARNING: Master core has no memory on local socket!\n"); +} + +static int +sync_func(__attribute__((unused)) void *arg) +{ + return 0; +} + +inline static void +rte_eal_mcfg_complete(void) +{ + /* ALL shared mem_config related INIT DONE */ + if (rte_config.process_type == RTE_PROC_PRIMARY) + rte_config.mem_config->magic = RTE_MAGIC; + + internal_config.init_complete = 1; +} + +/* + * Request iopl privilege for all RPL, returns 0 on success + * iopl() call is mostly for the i386 architecture. For other architectures, + * return -1 to indicate IO privilege can't be changed in this way. + */ +int +rte_eal_iopl_init(void) +{ +#if defined(RTE_ARCH_X86) + if (iopl(3) != 0) + return -1; +#endif + return 0; +} + +#ifdef VFIO_PRESENT +static int rte_eal_vfio_setup(void) +{ + if (rte_vfio_enable("vfio")) + return -1; + + return 0; +} +#endif + +static void rte_eal_init_alert(const char *msg) +{ + fprintf(stderr, "EAL: FATAL: %s\n", msg); + RTE_LOG(ERR, EAL, "%s\n", msg); +} + +/* Launch threads, called at application init(). 
*/ +int +rte_eal_init(int argc, char **argv) +{ + int i, fctret, ret; + pthread_t thread_id; + static rte_atomic32_t run_once = RTE_ATOMIC32_INIT(0); + const char *p; + static char logid[PATH_MAX]; + char cpuset[RTE_CPU_AFFINITY_STR_LEN]; + char thread_name[RTE_MAX_THREAD_NAME_LEN]; + + /* checks if the machine is adequate */ + if (!rte_cpu_is_supported()) { + rte_eal_init_alert("unsupported cpu type."); + rte_errno = ENOTSUP; + return -1; + } + + if (!rte_atomic32_test_and_set(&run_once)) { + rte_eal_init_alert("already called initialization."); + rte_errno = EALREADY; + return -1; + } + + p = strrchr(argv[0], '/'); + strlcpy(logid, p ? p + 1 : argv[0], sizeof(logid)); + thread_id = pthread_self(); + + eal_reset_internal_config(&internal_config); + + /* set log level as early as possible */ + eal_log_level_parse(argc, argv); + + if (rte_eal_cpu_init() < 0) { + rte_eal_init_alert("Cannot detect lcores."); + rte_errno = ENOTSUP; + return -1; + } + + fctret = eal_parse_args(argc, argv); + if (fctret < 0) { + rte_eal_init_alert("Invalid 'command line' arguments."); + rte_errno = EINVAL; + rte_atomic32_clear(&run_once); + return -1; + } + + if (eal_plugins_init() < 0) { + rte_eal_init_alert("Cannot init plugins"); + rte_errno = EINVAL; + rte_atomic32_clear(&run_once); + return -1; + } + + if (eal_option_device_parse()) { + rte_errno = ENODEV; + rte_atomic32_clear(&run_once); + return -1; + } + + rte_config_init(); + + if (rte_eal_intr_init() < 0) { + rte_eal_init_alert("Cannot init interrupt-handling thread"); + return -1; + } + + /* Put mp channel init before bus scan so that we can init the vdev + * bus through mp channel in the secondary process before the bus scan. 
+ */ + if (rte_mp_channel_init() < 0) { + rte_eal_init_alert("failed to init mp channel"); + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + rte_errno = EFAULT; + return -1; + } + } + + /* register multi-process action callbacks for hotplug */ + if (rte_mp_dev_hotplug_init() < 0) { + rte_eal_init_alert("failed to register mp callback for hotplug"); + return -1; + } + + if (rte_bus_scan()) { + rte_eal_init_alert("Cannot scan the buses for devices"); + rte_errno = ENODEV; + rte_atomic32_clear(&run_once); + return -1; + } + + /* if no EAL option "--iova-mode=", use bus IOVA scheme */ + if (internal_config.iova_mode == RTE_IOVA_DC) { + /* autodetect the IOVA mapping mode (default is RTE_IOVA_PA) */ + rte_eal_get_configuration()->iova_mode = + rte_bus_get_iommu_class(); + + /* Workaround for KNI which requires physical address to work */ + if (rte_eal_get_configuration()->iova_mode == RTE_IOVA_VA && + rte_eal_check_module("rte_kni") == 1) { + rte_eal_get_configuration()->iova_mode = RTE_IOVA_PA; + RTE_LOG(WARNING, EAL, + "Some devices want IOVA as VA but PA will be used because.. " + "KNI module inserted\n"); + } + } else { + rte_eal_get_configuration()->iova_mode = + internal_config.iova_mode; + } + + if (internal_config.no_hugetlbfs == 0) { + /* rte_config isn't initialized yet */ + ret = internal_config.process_type == RTE_PROC_PRIMARY ? 
+ eal_hugepage_info_init() : + eal_hugepage_info_read(); + if (ret < 0) { + rte_eal_init_alert("Cannot get hugepage information."); + rte_errno = EACCES; + rte_atomic32_clear(&run_once); + return -1; + } + } + + if (internal_config.memory == 0 && internal_config.force_sockets == 0) { + if (internal_config.no_hugetlbfs) + internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE; + } + + if (internal_config.vmware_tsc_map == 1) { +#ifdef RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT + rte_cycles_vmware_tsc_map = 1; + RTE_LOG (DEBUG, EAL, "Using VMWARE TSC MAP, " + "you must have monitor_control.pseudo_perfctr = TRUE\n"); +#else + RTE_LOG (WARNING, EAL, "Ignoring --vmware-tsc-map because " + "RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT is not set\n"); +#endif + } + + rte_srand(rte_rdtsc()); + + if (rte_eal_log_init(logid, internal_config.syslog_facility) < 0) { + rte_eal_init_alert("Cannot init logging."); + rte_errno = ENOMEM; + rte_atomic32_clear(&run_once); + return -1; + } + +#ifdef VFIO_PRESENT + if (rte_eal_vfio_setup() < 0) { + rte_eal_init_alert("Cannot init VFIO"); + rte_errno = EAGAIN; + rte_atomic32_clear(&run_once); + return -1; + } +#endif + /* in secondary processes, memory init may allocate additional fbarrays + * not present in primary processes, so to avoid any potential issues, + * initialize memzones first. 
+ */ + if (rte_eal_memzone_init() < 0) { + rte_eal_init_alert("Cannot init memzone"); + rte_errno = ENODEV; + return -1; + } + + if (rte_eal_memory_init() < 0) { + rte_eal_init_alert("Cannot init memory"); + rte_errno = ENOMEM; + return -1; + } + + /* the directories are locked during eal_hugepage_info_init */ + eal_hugedirs_unlock(); + + if (rte_eal_malloc_heap_init() < 0) { + rte_eal_init_alert("Cannot init malloc heap"); + rte_errno = ENODEV; + return -1; + } + + if (rte_eal_tailqs_init() < 0) { + rte_eal_init_alert("Cannot init tail queues for objects"); + rte_errno = EFAULT; + return -1; + } + + if (rte_eal_alarm_init() < 0) { + rte_eal_init_alert("Cannot init interrupt-handling thread"); + /* rte_eal_alarm_init sets rte_errno on failure. */ + return -1; + } + + if (rte_eal_timer_init() < 0) { + rte_eal_init_alert("Cannot init HPET or TSC timers"); + rte_errno = ENOTSUP; + return -1; + } + + eal_check_mem_on_local_socket(); + + eal_thread_init_master(rte_config.master_lcore); + + ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset)); + + RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%zx;cpuset=[%s%s])\n", + rte_config.master_lcore, (uintptr_t)thread_id, cpuset, + ret == 0 ? "" : "..."); + + RTE_LCORE_FOREACH_SLAVE(i) { + + /* + * create communication pipes between master thread + * and children + */ + if (pipe(lcore_config[i].pipe_master2slave) < 0) + rte_panic("Cannot create pipe\n"); + if (pipe(lcore_config[i].pipe_slave2master) < 0) + rte_panic("Cannot create pipe\n"); + + lcore_config[i].state = WAIT; + + /* create a thread for each lcore */ + ret = pthread_create(&lcore_config[i].thread_id, NULL, + eal_thread_loop, NULL); + if (ret != 0) + rte_panic("Cannot create thread\n"); + + /* Set thread_name for aid in debugging. 
*/ + snprintf(thread_name, sizeof(thread_name), + "lcore-slave-%d", i); + ret = rte_thread_setname(lcore_config[i].thread_id, + thread_name); + if (ret != 0) + RTE_LOG(DEBUG, EAL, + "Cannot set name for lcore thread\n"); + } + + /* + * Launch a dummy function on all slave lcores, so that master lcore + * knows they are all ready when this function returns. + */ + rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER); + rte_eal_mp_wait_lcore(); + + /* initialize services so vdevs register service during bus_probe. */ + ret = rte_service_init(); + if (ret) { + rte_eal_init_alert("rte_service_init() failed"); + rte_errno = ENOEXEC; + return -1; + } + + /* Probe all the buses and devices/drivers on them */ + if (rte_bus_probe()) { + rte_eal_init_alert("Cannot probe devices"); + rte_errno = ENOTSUP; + return -1; + } + +#ifdef VFIO_PRESENT + /* Register mp action after probe() so that we got enough info */ + if (rte_vfio_is_enabled("vfio") && vfio_mp_sync_setup() < 0) + return -1; +#endif + + /* initialize default service/lcore mappings and start running. Ignore + * -ENOTSUP, as it indicates no service coremask passed to EAL. + */ + ret = rte_service_start_with_defaults(); + if (ret < 0 && ret != -ENOTSUP) { + rte_errno = ENOEXEC; + return -1; + } + + /* + * Clean up unused files in runtime directory. We do this at the end of + * init and not at the beginning because we want to clean stuff up + * whether we are primary or secondary process, but we cannot remove + * primary process' files because secondary should be able to run even + * if primary process is dead. + * + * In no_shconf mode, no runtime directory is created in the first + * place, so no cleanup needed. 
+ */ + if (!internal_config.no_shconf && eal_clean_runtime_dir() < 0) { + rte_eal_init_alert("Cannot clear runtime directory\n"); + return -1; + } + + rte_eal_mcfg_complete(); + + /* Call each registered callback, if enabled */ + rte_option_init(); + + return fctret; +} + +static int +mark_freeable(const struct rte_memseg_list *msl, const struct rte_memseg *ms, + void *arg __rte_unused) +{ + /* ms is const, so find this memseg */ + struct rte_memseg *found; + + if (msl->external) + return 0; + + found = rte_mem_virt2memseg(ms->addr, msl); + + found->flags &= ~RTE_MEMSEG_FLAG_DO_NOT_FREE; + + return 0; +} + +int __rte_experimental +rte_eal_cleanup(void) +{ + /* if we're in a primary process, we need to mark hugepages as freeable + * so that finalization can release them back to the system. + */ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) + rte_memseg_walk(mark_freeable, NULL); + rte_service_finalize(); + rte_mp_channel_cleanup(); + eal_cleanup_config(&internal_config); + return 0; +} + +/* get core role */ +enum rte_lcore_role_t +rte_eal_lcore_role(unsigned lcore_id) +{ + return rte_config.lcore_role[lcore_id]; +} + +enum rte_proc_type_t +rte_eal_process_type(void) +{ + return rte_config.process_type; +} + +int rte_eal_has_hugepages(void) +{ + return ! internal_config.no_hugetlbfs; +} + +int rte_eal_has_pci(void) +{ + return !internal_config.no_pci; +} + +int rte_eal_create_uio_dev(void) +{ + return internal_config.create_uio_dev; +} + +enum rte_intr_mode +rte_eal_vfio_intr_mode(void) +{ + return internal_config.vfio_intr_mode; +} + +int +rte_eal_check_module(const char *module_name) +{ + char sysfs_mod_name[PATH_MAX]; + struct stat st; + int n; + + if (NULL == module_name) + return -1; + + /* Check if there is sysfs mounted */ + if (stat("/sys/module", &st) != 0) { + RTE_LOG(DEBUG, EAL, "sysfs is not mounted! 
error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + + /* A module might be built-in, therefore try sysfs */ + n = snprintf(sysfs_mod_name, PATH_MAX, "/sys/module/%s", module_name); + if (n < 0 || n > PATH_MAX) { + RTE_LOG(DEBUG, EAL, "Could not format module path\n"); + return -1; + } + + if (stat(sysfs_mod_name, &st) != 0) { + RTE_LOG(DEBUG, EAL, "Module %s not found! error %i (%s)\n", + sysfs_mod_name, errno, strerror(errno)); + return 0; + } + + /* Module has been found */ + return 1; +} diff --git a/lib/librte_eal/linux/eal/eal_alarm.c b/lib/librte_eal/linux/eal/eal_alarm.c new file mode 100644 index 0000000000..840ede7806 --- /dev/null +++ b/lib/librte_eal/linux/eal/eal_alarm.c @@ -0,0 +1,243 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef TFD_NONBLOCK +#include +#define TFD_NONBLOCK O_NONBLOCK +#endif + +#define NS_PER_US 1000 +#define US_PER_MS 1000 +#define MS_PER_S 1000 +#ifndef US_PER_S +#define US_PER_S (US_PER_MS * MS_PER_S) +#endif + +#ifdef CLOCK_MONOTONIC_RAW /* Defined in glibc bits/time.h */ +#define CLOCK_TYPE_ID CLOCK_MONOTONIC_RAW +#else +#define CLOCK_TYPE_ID CLOCK_MONOTONIC +#endif + +struct alarm_entry { + LIST_ENTRY(alarm_entry) next; + struct timeval time; + rte_eal_alarm_callback cb_fn; + void *cb_arg; + volatile uint8_t executing; + volatile pthread_t executing_id; +}; + +static LIST_HEAD(alarm_list, alarm_entry) alarm_list = LIST_HEAD_INITIALIZER(); +static rte_spinlock_t alarm_list_lk = RTE_SPINLOCK_INITIALIZER; + +static struct rte_intr_handle intr_handle = {.fd = -1 }; +static int handler_registered = 0; +static void eal_alarm_callback(void *arg); + +int +rte_eal_alarm_init(void) +{ + intr_handle.type = RTE_INTR_HANDLE_ALARM; + /* create a timerfd file descriptor */ + 
intr_handle.fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK); + if (intr_handle.fd == -1) + goto error; + + return 0; + +error: + rte_errno = errno; + return -1; +} + +static void +eal_alarm_callback(void *arg __rte_unused) +{ + struct timespec now; + struct alarm_entry *ap; + + rte_spinlock_lock(&alarm_list_lk); + while ((ap = LIST_FIRST(&alarm_list)) !=NULL && + clock_gettime(CLOCK_TYPE_ID, &now) == 0 && + (ap->time.tv_sec < now.tv_sec || (ap->time.tv_sec == now.tv_sec && + (ap->time.tv_usec * NS_PER_US) <= now.tv_nsec))) { + ap->executing = 1; + ap->executing_id = pthread_self(); + rte_spinlock_unlock(&alarm_list_lk); + + ap->cb_fn(ap->cb_arg); + + rte_spinlock_lock(&alarm_list_lk); + + LIST_REMOVE(ap, next); + free(ap); + } + + if (!LIST_EMPTY(&alarm_list)) { + struct itimerspec atime = { .it_interval = { 0, 0 } }; + + ap = LIST_FIRST(&alarm_list); + atime.it_value.tv_sec = ap->time.tv_sec; + atime.it_value.tv_nsec = ap->time.tv_usec * NS_PER_US; + /* perform borrow for subtraction if necessary */ + if (now.tv_nsec > (ap->time.tv_usec * NS_PER_US)) + atime.it_value.tv_sec--, atime.it_value.tv_nsec += US_PER_S * NS_PER_US; + + atime.it_value.tv_sec -= now.tv_sec; + atime.it_value.tv_nsec -= now.tv_nsec; + timerfd_settime(intr_handle.fd, 0, &atime, NULL); + } + rte_spinlock_unlock(&alarm_list_lk); +} + +int +rte_eal_alarm_set(uint64_t us, rte_eal_alarm_callback cb_fn, void *cb_arg) +{ + struct timespec now; + int ret = 0; + struct alarm_entry *ap, *new_alarm; + + /* Check parameters, including that us won't cause a uint64_t overflow */ + if (us < 1 || us > (UINT64_MAX - US_PER_S) || cb_fn == NULL) + return -EINVAL; + + new_alarm = calloc(1, sizeof(*new_alarm)); + if (new_alarm == NULL) + return -ENOMEM; + + /* use current time to calculate absolute time of alarm */ + clock_gettime(CLOCK_TYPE_ID, &now); + + new_alarm->cb_fn = cb_fn; + new_alarm->cb_arg = cb_arg; + new_alarm->time.tv_usec = ((now.tv_nsec / NS_PER_US) + us) % US_PER_S; + new_alarm->time.tv_sec = 
now.tv_sec + (((now.tv_nsec / NS_PER_US) + us) / US_PER_S); + + rte_spinlock_lock(&alarm_list_lk); + if (!handler_registered) { + ret |= rte_intr_callback_register(&intr_handle, + eal_alarm_callback, NULL); + handler_registered = (ret == 0) ? 1 : 0; + } + + if (LIST_EMPTY(&alarm_list)) + LIST_INSERT_HEAD(&alarm_list, new_alarm, next); + else { + LIST_FOREACH(ap, &alarm_list, next) { + if (ap->time.tv_sec > new_alarm->time.tv_sec || + (ap->time.tv_sec == new_alarm->time.tv_sec && + ap->time.tv_usec > new_alarm->time.tv_usec)){ + LIST_INSERT_BEFORE(ap, new_alarm, next); + break; + } + if (LIST_NEXT(ap, next) == NULL) { + LIST_INSERT_AFTER(ap, new_alarm, next); + break; + } + } + } + + if (LIST_FIRST(&alarm_list) == new_alarm) { + struct itimerspec alarm_time = { + .it_interval = {0, 0}, + .it_value = { + .tv_sec = us / US_PER_S, + .tv_nsec = (us % US_PER_S) * NS_PER_US, + }, + }; + ret |= timerfd_settime(intr_handle.fd, 0, &alarm_time, NULL); + } + rte_spinlock_unlock(&alarm_list_lk); + + return ret; +} + +int +rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn, void *cb_arg) +{ + struct alarm_entry *ap, *ap_prev; + int count = 0; + int err = 0; + int executing; + + if (!cb_fn) { + rte_errno = EINVAL; + return -1; + } + + do { + executing = 0; + rte_spinlock_lock(&alarm_list_lk); + /* remove any matches at the start of the list */ + while ((ap = LIST_FIRST(&alarm_list)) != NULL && + cb_fn == ap->cb_fn && + (cb_arg == (void *)-1 || cb_arg == ap->cb_arg)) { + + if (ap->executing == 0) { + LIST_REMOVE(ap, next); + free(ap); + count++; + } else { + /* If calling from other context, mark that alarm is executing + * so loop can spin till it finish. 
Otherwise we are trying to + * cancel our self - mark it by EINPROGRESS */ + if (pthread_equal(ap->executing_id, pthread_self()) == 0) + executing++; + else + err = EINPROGRESS; + + break; + } + } + ap_prev = ap; + + /* now go through list, removing entries not at start */ + LIST_FOREACH(ap, &alarm_list, next) { + /* this won't be true first time through */ + if (cb_fn == ap->cb_fn && + (cb_arg == (void *)-1 || cb_arg == ap->cb_arg)) { + + if (ap->executing == 0) { + LIST_REMOVE(ap, next); + free(ap); + count++; + ap = ap_prev; + } else if (pthread_equal(ap->executing_id, pthread_self()) == 0) + executing++; + else + err = EINPROGRESS; + } + ap_prev = ap; + } + rte_spinlock_unlock(&alarm_list_lk); + } while (executing != 0); + + if (count == 0 && err == 0) + rte_errno = ENOENT; + else if (err) + rte_errno = err; + + return count; +} diff --git a/lib/librte_eal/linux/eal/eal_cpuflags.c b/lib/librte_eal/linux/eal/eal_cpuflags.c new file mode 100644 index 0000000000..d38296e1e5 --- /dev/null +++ b/lib/librte_eal/linux/eal/eal_cpuflags.c @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 Red Hat, Inc. + */ + +#include +#include +#include +#include +#include +#include + +#if defined(__GLIBC__) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 16) +#include +#define HAS_AUXV 1 +#endif +#endif + +#include + +#ifndef HAS_AUXV +static unsigned long +getauxval(unsigned long type __rte_unused) +{ + errno = ENOTSUP; + return 0; +} +#endif + +#ifdef RTE_ARCH_64 +typedef Elf64_auxv_t Internal_Elfx_auxv_t; +#else +typedef Elf32_auxv_t Internal_Elfx_auxv_t; +#endif + +/** + * Provides a method for retrieving values from the auxiliary vector and + * possibly running a string comparison. + * + * @return Always returns a result. When the result is 0, check errno + * to see if an error occurred during processing. 
+ */ +static unsigned long +_rte_cpu_getauxval(unsigned long type, const char *str) +{ + unsigned long val; + + errno = 0; + val = getauxval(type); + + if (!val && (errno == ENOTSUP || errno == ENOENT)) { + int auxv_fd = open("/proc/self/auxv", O_RDONLY); + Internal_Elfx_auxv_t auxv; + + if (auxv_fd == -1) + return 0; + + errno = ENOENT; + while (read(auxv_fd, &auxv, sizeof(auxv)) == sizeof(auxv)) { + if (auxv.a_type == type) { + errno = 0; + val = auxv.a_un.a_val; + if (str) + val = strcmp((const char *)val, str); + break; + } + } + close(auxv_fd); + } + + return val; +} + +unsigned long +rte_cpu_getauxval(unsigned long type) +{ + return _rte_cpu_getauxval(type, NULL); +} + +int +rte_cpu_strcmp_auxval(unsigned long type, const char *str) +{ + return _rte_cpu_getauxval(type, str); +} diff --git a/lib/librte_eal/linux/eal/eal_debug.c b/lib/librte_eal/linux/eal/eal_debug.c new file mode 100644 index 0000000000..5d92500bf5 --- /dev/null +++ b/lib/librte_eal/linux/eal/eal_debug.c @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifdef RTE_BACKTRACE +#include +#endif +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define BACKTRACE_SIZE 256 + +/* dump the stack of the calling core */ +void rte_dump_stack(void) +{ +#ifdef RTE_BACKTRACE + void *func[BACKTRACE_SIZE]; + char **symb = NULL; + int size; + + size = backtrace(func, BACKTRACE_SIZE); + symb = backtrace_symbols(func, size); + + if (symb == NULL) + return; + + while (size > 0) { + rte_log(RTE_LOG_ERR, RTE_LOGTYPE_EAL, + "%d: [%s]\n", size, symb[size - 1]); + size --; + } + + free(symb); +#endif /* RTE_BACKTRACE */ +} + +/* not implemented in this environment */ +void rte_dump_registers(void) +{ + return; +} + +/* call abort(), it will generate a coredump if enabled */ +void __rte_panic(const char *funcname, const char *format, ...) 
+{ + va_list ap; + + rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, "PANIC in %s():\n", funcname); + va_start(ap, format); + rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap); + va_end(ap); + rte_dump_stack(); + rte_dump_registers(); + abort(); +} + +/* + * Like rte_panic this terminates the application. However, no traceback is + * provided and no core-dump is generated. + */ +void +rte_exit(int exit_code, const char *format, ...) +{ + va_list ap; + + if (exit_code != 0) + RTE_LOG(CRIT, EAL, "Error - exiting with code: %d\n" + " Cause: ", exit_code); + + va_start(ap, format); + rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap); + va_end(ap); + +#ifndef RTE_EAL_ALWAYS_PANIC_ON_ERROR + if (rte_eal_cleanup() != 0) + RTE_LOG(CRIT, EAL, + "EAL could not release all resources\n"); + exit(exit_code); +#else + rte_dump_stack(); + rte_dump_registers(); + abort(); +#endif +} diff --git a/lib/librte_eal/linux/eal/eal_dev.c b/lib/librte_eal/linux/eal/eal_dev.c new file mode 100644 index 0000000000..2830c8687d --- /dev/null +++ b/lib/librte_eal/linux/eal/eal_dev.c @@ -0,0 +1,396 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" + +static struct rte_intr_handle intr_handle = {.fd = -1 }; +static bool monitor_started; +static bool hotplug_handle; + +#define EAL_UEV_MSG_LEN 4096 +#define EAL_UEV_MSG_ELEM_LEN 128 + +/* + * spinlock for device hot-unplug failure handling. If it try to access bus or + * device, such as handle sigbus on bus or handle memory failure for device + * just need to use this lock. It could protect the bus and the device to avoid + * race condition. 
+ */
+static rte_spinlock_t failure_handle_lock = RTE_SPINLOCK_INITIALIZER;
+
+static struct sigaction sigbus_action_old;
+
+static int sigbus_need_recover;
+
+static void dev_uev_handler(__rte_unused void *param);
+
+/* identify the system layer which reports this event. */
+enum eal_dev_event_subsystem {
+	EAL_DEV_EVENT_SUBSYSTEM_PCI, /* PCI bus device event */
+	EAL_DEV_EVENT_SUBSYSTEM_UIO, /* UIO driver device event */
+	EAL_DEV_EVENT_SUBSYSTEM_VFIO, /* VFIO driver device event */
+	EAL_DEV_EVENT_SUBSYSTEM_MAX
+};
+
+static void
+sigbus_action_recover(void)
+{
+	if (sigbus_need_recover) {
+		sigaction(SIGBUS, &sigbus_action_old, NULL);
+		sigbus_need_recover = 0;
+	}
+}
+
+/*
+ * SIGBUS handler used for device hot-unplug.
+ *
+ * The fault address is first offered to rte_bus_sigbus_handler() while
+ * holding failure_handle_lock:
+ *  - result -1: the process exits with a hot-unplug failure message;
+ *  - result 1: the signal is forwarded to the action saved in
+ *    sigbus_action_old, or the process exits if that action is not a
+ *    callable handler;
+ *  - any other result: nothing more is done.
+ */
+static void sigbus_handler(int signum, siginfo_t *info,
+				void *ctx __rte_unused)
+{
+	int ret;
+
+	RTE_LOG(DEBUG, EAL, "Thread[%d] catch SIGBUS, fault address:%p\n",
+		(int)pthread_self(), info->si_addr);
+
+	rte_spinlock_lock(&failure_handle_lock);
+	ret = rte_bus_sigbus_handler(info->si_addr);
+	rte_spinlock_unlock(&failure_handle_lock);
+	if (ret == -1) {
+		rte_exit(EXIT_FAILURE,
+			 "Failed to handle SIGBUS for hot-unplug, "
+			 "(rte_errno: %s)!", strerror(rte_errno));
+	} else if (ret == 1) {
+		/*
+		 * SA_SIGINFO is a single bit that may be combined with other
+		 * flags (SA_RESTART, SA_ONSTACK, ...), so test the bit rather
+		 * than comparing the whole sa_flags word. SIG_DFL and SIG_IGN
+		 * are dispositions, not functions, and must never be called.
+		 */
+		if ((sigbus_action_old.sa_flags & SA_SIGINFO)
+				&& sigbus_action_old.sa_sigaction) {
+			(*(sigbus_action_old.sa_sigaction))(signum,
+							      info, ctx);
+		} else if (!(sigbus_action_old.sa_flags & SA_SIGINFO)
+				&& sigbus_action_old.sa_handler != SIG_DFL
+				&& sigbus_action_old.sa_handler != SIG_IGN
+				&& sigbus_action_old.sa_handler) {
+			(*(sigbus_action_old.sa_handler))(signum);
+		} else {
+			rte_exit(EXIT_FAILURE,
+				 "Failed to handle generic SIGBUS!");
+		}
+	}
+
+	RTE_LOG(DEBUG, EAL, "Success to handle SIGBUS for hot-unplug!\n");
+}
+
+static int cmp_dev_name(const struct rte_device *dev,
+	const void *_name)
+{
+	const char *name = _name;
+
+	return strcmp(dev->name, name);
+}
+
+static int
+dev_uev_socket_fd_create(void)
+{
+	struct sockaddr_nl addr;
+	int ret;
+
+	intr_handle.fd = socket(PF_NETLINK, SOCK_RAW | SOCK_CLOEXEC |
+			SOCK_NONBLOCK,
+			NETLINK_KOBJECT_UEVENT);
+	if (intr_handle.fd < 0) {
+
RTE_LOG(ERR, EAL, "create uevent fd failed.\n");
+		return -1;
+	}
+
+	memset(&addr, 0, sizeof(addr));
+	addr.nl_family = AF_NETLINK;
+	addr.nl_pid = 0;
+	addr.nl_groups = 0xffffffff;
+
+	ret = bind(intr_handle.fd, (struct sockaddr *) &addr, sizeof(addr));
+	if (ret < 0) {
+		RTE_LOG(ERR, EAL, "Failed to bind uevent socket.\n");
+		goto err;
+	}
+
+	return 0;
+err:
+	close(intr_handle.fd);
+	intr_handle.fd = -1;
+	return ret;
+}
+
+/*
+ * Parse one uevent message.
+ *
+ * buf is scanned as a sequence of NUL-separated fields. The fields
+ * "ACTION=", "SUBSYSTEM=" and "PCI_SLOT_NAME=" are recorded; a field
+ * starting with "libudev" makes the whole message be rejected.
+ *
+ * @param buf     message bytes
+ * @param event   filled in on success
+ * @param length  number of bytes of buf that may be examined
+ * @return 0 when subsystem is uio/pci/vfio and action is add/remove;
+ *         -1 otherwise.
+ *
+ * On success event->devname is set to a strdup() copy of the slot name if
+ * a PCI_SLOT_NAME field was present (it may be NULL if strdup() fails);
+ * the caller owns that copy. On failure event->devname is left untouched.
+ *
+ * NOTE(review): field contents are read with strncmp()/strlcpy(), which
+ * stop only at a NUL byte. The visible caller zero-fills its buffer but
+ * lets recv() fill all of it, so a full-size message would not be
+ * terminated - confirm whether recv() should be limited to size - 1.
+ */
+static int
+dev_uev_parse(const char *buf, struct rte_dev_event *event, int length)
+{
+	char action[EAL_UEV_MSG_ELEM_LEN];
+	char subsystem[EAL_UEV_MSG_ELEM_LEN];
+	char pci_slot_name[EAL_UEV_MSG_ELEM_LEN];
+	bool have_slot_name = false;
+	int i = 0;
+
+	memset(action, 0, EAL_UEV_MSG_ELEM_LEN);
+	memset(subsystem, 0, EAL_UEV_MSG_ELEM_LEN);
+	memset(pci_slot_name, 0, EAL_UEV_MSG_ELEM_LEN);
+
+	while (i < length) {
+		/* skip the NUL separator(s) in front of the next field */
+		for (; i < length; i++) {
+			if (*buf)
+				break;
+			buf++;
+		}
+		/* only padding was left; buf now points at buf[length] */
+		if (i >= length)
+			break;
+		/**
+		 * check device uevent from kernel side, no need to check
+		 * uevent from udev.
+		 */
+		if (!strncmp(buf, "libudev", 7))
+			return -1;
+		if (!strncmp(buf, "ACTION=", 7)) {
+			buf += 7;
+			i += 7;
+			strlcpy(action, buf, sizeof(action));
+		} else if (!strncmp(buf, "SUBSYSTEM=", 10)) {
+			buf += 10;
+			i += 10;
+			strlcpy(subsystem, buf, sizeof(subsystem));
+		} else if (!strncmp(buf, "PCI_SLOT_NAME=", 14)) {
+			buf += 14;
+			i += 14;
+			strlcpy(pci_slot_name, buf, sizeof(pci_slot_name));
+			have_slot_name = true;
+		}
+		/* skip the remainder of this field */
+		for (; i < length; i++) {
+			if (*buf == '\0')
+				break;
+			buf++;
+		}
+	}
+
+	/* parse the subsystem layer */
+	if (!strncmp(subsystem, "uio", 3))
+		event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_UIO;
+	else if (!strncmp(subsystem, "pci", 3))
+		event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_PCI;
+	else if (!strncmp(subsystem, "vfio", 4))
+		event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_VFIO;
+	else
+		return -1;
+
+	/* parse the action type */
+	if (!strncmp(action, "add", 3))
+		event->type = RTE_DEV_EVENT_ADD;
+	else if (!strncmp(action, "remove", 6))
+		event->type = RTE_DEV_EVENT_REMOVE;
+	else
+		return -1;
+
+	/*
+	 * Allocate the name only once the event has been accepted, so that
+	 * rejected messages and repeated PCI_SLOT_NAME fields cannot leak
+	 * a copy.
+	 */
+	if (have_slot_name)
+		event->devname = strdup(pci_slot_name);
+	return 0;
+}
+
+static void
+dev_delayed_unregister(void *param)
+{
+	rte_intr_callback_unregister(&intr_handle, dev_uev_handler, param);
+	close(intr_handle.fd);
+	intr_handle.fd = -1;
+}
+
+static void
+dev_uev_handler(__rte_unused void *param)
+{
+	struct rte_dev_event uevent;
+	int ret;
+	char buf[EAL_UEV_MSG_LEN];
+	struct rte_bus *bus;
+	struct rte_device *dev;
+	const char *busname = "";
+
+	memset(&uevent, 0, sizeof(struct rte_dev_event));
+	memset(buf, 0, EAL_UEV_MSG_LEN);
+
+	ret = recv(intr_handle.fd, buf, EAL_UEV_MSG_LEN, MSG_DONTWAIT);
+	if (ret < 0 && errno == EAGAIN)
+		return;
+	else if (ret <= 0) {
+		/* connection is closed or broken, can not up again. */
+		RTE_LOG(ERR, EAL, "uevent socket connection is broken.\n");
+		rte_eal_alarm_set(1, dev_delayed_unregister, NULL);
+		return;
+	}
+
+	ret = dev_uev_parse(buf, &uevent, EAL_UEV_MSG_LEN);
+	if (ret < 0) {
+		RTE_LOG(DEBUG, EAL, "It is not an valid event "
+			"that need to be handle.\n");
+		return;
+	}
+
+	RTE_LOG(DEBUG, EAL, "receive uevent(name:%s, type:%d, subsystem:%d)\n",
+		uevent.devname, uevent.type, uevent.subsystem);
+
+	switch (uevent.subsystem) {
+	case EAL_DEV_EVENT_SUBSYSTEM_PCI:
+	case EAL_DEV_EVENT_SUBSYSTEM_UIO:
+		busname = "pci";
+		break;
+	default:
+		break;
+	}
+
+	if (uevent.devname) {
+		if (uevent.type == RTE_DEV_EVENT_REMOVE && hotplug_handle) {
+			rte_spinlock_lock(&failure_handle_lock);
+			bus = rte_bus_find_by_name(busname);
+			if (bus == NULL) {
+				RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n",
+					busname);
+				goto failure_handle_err;
+			}
+
+			dev = bus->find_device(NULL, cmp_dev_name,
+					       uevent.devname);
+			if (dev == NULL) {
+				RTE_LOG(ERR, EAL, "Cannot find device (%s) on "
+					"bus (%s)\n", uevent.devname, busname);
+				goto failure_handle_err;
+			}
+
+			ret = bus->hot_unplug_handler(dev);
+			if (ret) {
+				RTE_LOG(ERR, EAL, "Can not handle hot-unplug "
+					"for device (%s)\n", dev->name);
+			}
+			rte_spinlock_unlock(&failure_handle_lock);
+		}
+
rte_dev_event_callback_process(uevent.devname, uevent.type);
+	}
+
+	return;
+
+failure_handle_err:
+	rte_spinlock_unlock(&failure_handle_lock);
+}
+
+/*
+ * Start monitoring device uevents.
+ *
+ * Creates the uevent socket and registers dev_uev_handler() for it as an
+ * interrupt callback. Does nothing if monitoring is already started.
+ *
+ * @return 0 on success or if already started; -1 if the socket cannot be
+ *         created or the callback cannot be registered. On failure no
+ *         socket is left open and intr_handle.fd is -1.
+ */
+int __rte_experimental
+rte_dev_event_monitor_start(void)
+{
+	int ret;
+
+	if (monitor_started)
+		return 0;
+
+	ret = dev_uev_socket_fd_create();
+	if (ret) {
+		RTE_LOG(ERR, EAL, "error create device event fd.\n");
+		return -1;
+	}
+
+	intr_handle.type = RTE_INTR_HANDLE_DEV_EVENT;
+	ret = rte_intr_callback_register(&intr_handle, dev_uev_handler, NULL);
+
+	if (ret) {
+		RTE_LOG(ERR, EAL, "fail to register uevent callback.\n");
+		/*
+		 * Release the socket created above: monitor_started stays
+		 * false, so a retry would open a new one and this descriptor
+		 * would otherwise be lost.
+		 */
+		close(intr_handle.fd);
+		intr_handle.fd = -1;
+		return -1;
+	}
+
+	monitor_started = true;
+
+	return 0;
+}
+
+int __rte_experimental
+rte_dev_event_monitor_stop(void)
+{
+	int ret;
+
+	if (!monitor_started)
+		return 0;
+
+	ret = rte_intr_callback_unregister(&intr_handle, dev_uev_handler,
+					   (void *)-1);
+	if (ret < 0) {
+		RTE_LOG(ERR, EAL, "fail to unregister uevent callback.\n");
+		return ret;
+	}
+
+	close(intr_handle.fd);
+	intr_handle.fd = -1;
+	monitor_started = false;
+
+	return 0;
+}
+
+int
+dev_sigbus_handler_register(void)
+{
+	sigset_t mask;
+	struct sigaction action;
+
+	rte_errno = 0;
+
+	if (sigbus_need_recover)
+		return 0;
+
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGBUS);
+	action.sa_flags = SA_SIGINFO;
+	action.sa_mask = mask;
+	action.sa_sigaction = sigbus_handler;
+	sigbus_need_recover = !sigaction(SIGBUS, &action, &sigbus_action_old);
+
+	return rte_errno;
+}
+
+int
+dev_sigbus_handler_unregister(void)
+{
+	rte_errno = 0;
+
+	sigbus_action_recover();
+
+	return rte_errno;
+}
+
+int __rte_experimental
+rte_dev_hotplug_handle_enable(void)
+{
+	int ret = 0;
+
+	ret = dev_sigbus_handler_register();
+	if (ret < 0)
+		RTE_LOG(ERR, EAL,
+			"fail to register sigbus handler for devices.\n");
+
+	hotplug_handle = true;
+
+	return ret;
+}
+
+int __rte_experimental
+rte_dev_hotplug_handle_disable(void)
+{
+	int ret = 0;
+
+	ret = dev_sigbus_handler_unregister();
+	if (ret < 0)
+		RTE_LOG(ERR, EAL,
+			"fail to unregister sigbus handler 
for devices.\n"); + + hotplug_handle = false; + + return ret; +} diff --git a/lib/librte_eal/linux/eal/eal_hugepage_info.c b/lib/librte_eal/linux/eal/eal_hugepage_info.c new file mode 100644 index 0000000000..0eab1cf719 --- /dev/null +++ b/lib/librte_eal/linux/eal/eal_hugepage_info.c @@ -0,0 +1,526 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include /* for hugetlb-related flags */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "rte_string_fns.h" +#include "eal_internal_cfg.h" +#include "eal_hugepages.h" +#include "eal_filesystem.h" + +static const char sys_dir_path[] = "/sys/kernel/mm/hugepages"; +static const char sys_pages_numa_dir_path[] = "/sys/devices/system/node"; +
+/* Uses mmap to create a shared memory area for storage of data; used in this
+ * file to store the hugepage file map on disk. Opens filename with the given
+ * open flags, sizes it to mem_size and maps it shared read/write. Returns the
+ * mapping, or NULL on failure (MAP_FAILED from mmap is converted to NULL). */
+static void *
+map_shared_memory(const char *filename, const size_t mem_size, int flags)
+{
+	void *retval;
+	int fd = open(filename, flags, 0666);
+	if (fd < 0)
+		return NULL;
+	if (ftruncate(fd, mem_size) < 0) {
+		close(fd);
+		return NULL;
+	}
+	retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE,
+			MAP_SHARED, fd, 0);
+	close(fd);
+	return retval == MAP_FAILED ? NULL : retval;
+}
+ +static void * +open_shared_memory(const char *filename, const size_t mem_size) +{ + return map_shared_memory(filename, mem_size, O_RDWR); +} + +static void * +create_shared_memory(const char *filename, const size_t mem_size) +{ + return map_shared_memory(filename, mem_size, O_RDWR | O_CREAT); +} + +/* this function is only called from eal_hugepage_info_init which itself + * is only called from a primary process */ +static uint32_t +get_num_hugepages(const char *subdir) +{ + char path[PATH_MAX]; + long unsigned resv_pages, num_pages = 0; + const char
*nr_hp_file = "free_hugepages"; + const char *nr_rsvd_file = "resv_hugepages"; + + /* first, check how many reserved pages kernel reports */ + snprintf(path, sizeof(path), "%s/%s/%s", + sys_dir_path, subdir, nr_rsvd_file); + if (eal_parse_sysfs_value(path, &resv_pages) < 0) + return 0; + + snprintf(path, sizeof(path), "%s/%s/%s", + sys_dir_path, subdir, nr_hp_file); + if (eal_parse_sysfs_value(path, &num_pages) < 0) + return 0; + + if (num_pages == 0) + RTE_LOG(WARNING, EAL, "No free hugepages reported in %s\n", + subdir); + + /* adjust num_pages */ + if (num_pages >= resv_pages) + num_pages -= resv_pages; + else if (resv_pages) + num_pages = 0; + + /* we want to return a uint32_t and more than this looks suspicious + * anyway ... */ + if (num_pages > UINT32_MAX) + num_pages = UINT32_MAX; + + return num_pages; +} + +static uint32_t +get_num_hugepages_on_node(const char *subdir, unsigned int socket) +{ + char path[PATH_MAX], socketpath[PATH_MAX]; + DIR *socketdir; + unsigned long num_pages = 0; + const char *nr_hp_file = "free_hugepages"; + + snprintf(socketpath, sizeof(socketpath), "%s/node%u/hugepages", + sys_pages_numa_dir_path, socket); + + socketdir = opendir(socketpath); + if (socketdir) { + /* Keep calm and carry on */ + closedir(socketdir); + } else { + /* Can't find socket dir, so ignore it */ + return 0; + } + + snprintf(path, sizeof(path), "%s/%s/%s", + socketpath, subdir, nr_hp_file); + if (eal_parse_sysfs_value(path, &num_pages) < 0) + return 0; + + if (num_pages == 0) + RTE_LOG(WARNING, EAL, "No free hugepages reported in %s\n", + subdir); + + /* + * we want to return a uint32_t and more than this looks suspicious + * anyway ... 
+ */ + if (num_pages > UINT32_MAX) + num_pages = UINT32_MAX; + + return num_pages; +} + +static uint64_t +get_default_hp_size(void) +{ + const char proc_meminfo[] = "/proc/meminfo"; + const char str_hugepagesz[] = "Hugepagesize:"; + unsigned hugepagesz_len = sizeof(str_hugepagesz) - 1; + char buffer[256]; + unsigned long long size = 0; + + FILE *fd = fopen(proc_meminfo, "r"); + if (fd == NULL) + rte_panic("Cannot open %s\n", proc_meminfo); + while(fgets(buffer, sizeof(buffer), fd)){ + if (strncmp(buffer, str_hugepagesz, hugepagesz_len) == 0){ + size = rte_str_to_size(&buffer[hugepagesz_len]); + break; + } + } + fclose(fd); + if (size == 0) + rte_panic("Cannot get default hugepage size from %s\n", proc_meminfo); + return size; +} + +static int +get_hugepage_dir(uint64_t hugepage_sz, char *hugedir, int len) +{ + enum proc_mount_fieldnames { + DEVICE = 0, + MOUNTPT, + FSTYPE, + OPTIONS, + _FIELDNAME_MAX + }; + static uint64_t default_size = 0; + const char proc_mounts[] = "/proc/mounts"; + const char hugetlbfs_str[] = "hugetlbfs"; + const size_t htlbfs_str_len = sizeof(hugetlbfs_str) - 1; + const char pagesize_opt[] = "pagesize="; + const size_t pagesize_opt_len = sizeof(pagesize_opt) - 1; + const char split_tok = ' '; + char *splitstr[_FIELDNAME_MAX]; + char buf[BUFSIZ]; + int retval = -1; + + FILE *fd = fopen(proc_mounts, "r"); + if (fd == NULL) + rte_panic("Cannot open %s\n", proc_mounts); + + if (default_size == 0) + default_size = get_default_hp_size(); + + while (fgets(buf, sizeof(buf), fd)){ + if (rte_strsplit(buf, sizeof(buf), splitstr, _FIELDNAME_MAX, + split_tok) != _FIELDNAME_MAX) { + RTE_LOG(ERR, EAL, "Error parsing %s\n", proc_mounts); + break; /* return NULL */ + } + + /* we have a specified --huge-dir option, only examine that dir */ + if (internal_config.hugepage_dir != NULL && + strcmp(splitstr[MOUNTPT], internal_config.hugepage_dir) != 0) + continue; + + if (strncmp(splitstr[FSTYPE], hugetlbfs_str, htlbfs_str_len) == 0){ + const char *pagesz_str = 
strstr(splitstr[OPTIONS], pagesize_opt); + + /* if no explicit page size, the default page size is compared */ + if (pagesz_str == NULL){ + if (hugepage_sz == default_size){ + strlcpy(hugedir, splitstr[MOUNTPT], len); + retval = 0; + break; + } + } + /* there is an explicit page size, so check it */ + else { + uint64_t pagesz = rte_str_to_size(&pagesz_str[pagesize_opt_len]); + if (pagesz == hugepage_sz) { + strlcpy(hugedir, splitstr[MOUNTPT], len); + retval = 0; + break; + } + } + } /* end if strncmp hugetlbfs */ + } /* end while fgets */ + + fclose(fd); + return retval; +} + +/* + * Clear the hugepage directory of whatever hugepage files + * there are. Checks if the file is locked (i.e. + * if it's in use by another DPDK process). + */ +static int +clear_hugedir(const char * hugedir) +{ + DIR *dir; + struct dirent *dirent; + int dir_fd, fd, lck_result; + const char filter[] = "*map_*"; /* matches hugepage files */ + + /* open directory */ + dir = opendir(hugedir); + if (!dir) { + RTE_LOG(ERR, EAL, "Unable to open hugepage directory %s\n", + hugedir); + goto error; + } + dir_fd = dirfd(dir); + + dirent = readdir(dir); + if (!dirent) { + RTE_LOG(ERR, EAL, "Unable to read hugepage directory %s\n", + hugedir); + goto error; + } + + while(dirent != NULL){ + /* skip files that don't match the hugepage pattern */ + if (fnmatch(filter, dirent->d_name, 0) > 0) { + dirent = readdir(dir); + continue; + } + + /* try and lock the file */ + fd = openat(dir_fd, dirent->d_name, O_RDONLY); + + /* skip to next file */ + if (fd == -1) { + dirent = readdir(dir); + continue; + } + + /* non-blocking lock */ + lck_result = flock(fd, LOCK_EX | LOCK_NB); + + /* if lock succeeds, remove the file */ + if (lck_result != -1) + unlinkat(dir_fd, dirent->d_name, 0); + close (fd); + dirent = readdir(dir); + } + + closedir(dir); + return 0; + +error: + if (dir) + closedir(dir); + + RTE_LOG(ERR, EAL, "Error while clearing hugepage dir: %s\n", + strerror(errno)); + + return -1; +} + +static int 
+compare_hpi(const void *a, const void *b)
+{
+	const struct hugepage_info *hpi_a = a;
+	const struct hugepage_info *hpi_b = b;
+	/* largest first; subtracting would truncate the 64-bit result to int */
+	return (hpi_b->hugepage_sz > hpi_a->hugepage_sz) -
+		(hpi_b->hugepage_sz < hpi_a->hugepage_sz);
+}
+ +static void +calc_num_pages(struct hugepage_info *hpi, struct dirent *dirent) +{ + uint64_t total_pages = 0; + unsigned int i; + + /* + * first, try to put all hugepages into relevant sockets, but + * if first attempts fails, fall back to collecting all pages + * in one socket and sorting them later + */ + total_pages = 0; + /* we also don't want to do this for legacy init */ + if (!internal_config.legacy_mem) + for (i = 0; i < rte_socket_count(); i++) { + int socket = rte_socket_id_by_idx(i); + unsigned int num_pages = + get_num_hugepages_on_node( + dirent->d_name, socket); + hpi->num_pages[socket] = num_pages; + total_pages += num_pages; + } + /* + * we failed to sort memory from the get go, so fall + * back to old way + */ + if (total_pages == 0) { + hpi->num_pages[0] = get_num_hugepages(dirent->d_name); + +#ifndef RTE_ARCH_64 + /* for 32-bit systems, limit number of hugepages to + * 1GB per page size */ + hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0], + RTE_PGSIZE_1G / hpi->hugepage_sz); +#endif + } +} + +static int +hugepage_info_init(void) +{ const char dirent_start_text[] = "hugepages-"; + const size_t dirent_start_len = sizeof(dirent_start_text) - 1; + unsigned int i, num_sizes = 0; + DIR *dir; + struct dirent *dirent; + + dir = opendir(sys_dir_path); + if (dir == NULL) { + RTE_LOG(ERR, EAL, + "Cannot open directory %s to read system hugepage info\n", + sys_dir_path); + return -1; + } + + for (dirent = readdir(dir); dirent != NULL; dirent = readdir(dir)) { + struct hugepage_info *hpi; + + if (strncmp(dirent->d_name, dirent_start_text, + dirent_start_len) != 0) + continue; + + if (num_sizes >= MAX_HUGEPAGE_SIZES) + break; + + hpi = &internal_config.hugepage_info[num_sizes]; + hpi->hugepage_sz = + rte_str_to_size(&dirent->d_name[dirent_start_len]); + + /* first, check if we
have a mountpoint */ + if (get_hugepage_dir(hpi->hugepage_sz, + hpi->hugedir, sizeof(hpi->hugedir)) < 0) { + uint32_t num_pages; + + num_pages = get_num_hugepages(dirent->d_name); + if (num_pages > 0) + RTE_LOG(NOTICE, EAL, + "%" PRIu32 " hugepages of size " + "%" PRIu64 " reserved, but no mounted " + "hugetlbfs found for that size\n", + num_pages, hpi->hugepage_sz); + /* if we have kernel support for reserving hugepages + * through mmap, and we're in in-memory mode, treat this + * page size as valid. we cannot be in legacy mode at + * this point because we've checked this earlier in the + * init process. + */ +#ifdef MAP_HUGE_SHIFT + if (internal_config.in_memory) { + RTE_LOG(DEBUG, EAL, "In-memory mode enabled, " + "hugepages of size %" PRIu64 " bytes " + "will be allocated anonymously\n", + hpi->hugepage_sz); + calc_num_pages(hpi, dirent); + num_sizes++; + } +#endif + continue; + } + + /* try to obtain a writelock */ + hpi->lock_descriptor = open(hpi->hugedir, O_RDONLY); + + /* if blocking lock failed */ + if (flock(hpi->lock_descriptor, LOCK_EX) == -1) { + RTE_LOG(CRIT, EAL, + "Failed to lock hugepage directory!\n"); + break; + } + /* clear out the hugepages dir from unused pages */ + if (clear_hugedir(hpi->hugedir) == -1) + break; + + calc_num_pages(hpi, dirent); + + num_sizes++; + } + closedir(dir); + + /* something went wrong, and we broke from the for loop above */ + if (dirent != NULL) + return -1; + + internal_config.num_hugepage_sizes = num_sizes; + + /* sort the page directory entries by size, largest to smallest */ + qsort(&internal_config.hugepage_info[0], num_sizes, + sizeof(internal_config.hugepage_info[0]), compare_hpi); + + /* now we have all info, check we have at least one valid size */ + for (i = 0; i < num_sizes; i++) { + /* pages may no longer all be on socket 0, so check all */ + unsigned int j, num_pages = 0; + struct hugepage_info *hpi = &internal_config.hugepage_info[i]; + + for (j = 0; j < RTE_MAX_NUMA_NODES; j++) + num_pages += 
hpi->num_pages[j]; + if (num_pages > 0) + return 0; + } + + /* no valid hugepage mounts available, return error */ + return -1; +} + +/* + * when we initialize the hugepage info, everything goes + * to socket 0 by default. it will later get sorted by memory + * initialization procedure. + */ +int +eal_hugepage_info_init(void) +{ + struct hugepage_info *hpi, *tmp_hpi; + unsigned int i; + + if (hugepage_info_init() < 0) + return -1; + + /* for no shared files mode, we're done */ + if (internal_config.no_shconf) + return 0; + + hpi = &internal_config.hugepage_info[0]; + + tmp_hpi = create_shared_memory(eal_hugepage_info_path(), + sizeof(internal_config.hugepage_info)); + if (tmp_hpi == NULL) { + RTE_LOG(ERR, EAL, "Failed to create shared memory!\n"); + return -1; + } + + memcpy(tmp_hpi, hpi, sizeof(internal_config.hugepage_info)); + + /* we've copied file descriptors along with everything else, but they + * will be invalid in secondary process, so overwrite them + */ + for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) { + struct hugepage_info *tmp = &tmp_hpi[i]; + tmp->lock_descriptor = -1; + } + + if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) { + RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n"); + return -1; + } + return 0; +} + +int eal_hugepage_info_read(void) +{ + struct hugepage_info *hpi = &internal_config.hugepage_info[0]; + struct hugepage_info *tmp_hpi; + + tmp_hpi = open_shared_memory(eal_hugepage_info_path(), + sizeof(internal_config.hugepage_info)); + if (tmp_hpi == NULL) { + RTE_LOG(ERR, EAL, "Failed to open shared memory!\n"); + return -1; + } + + memcpy(hpi, tmp_hpi, sizeof(internal_config.hugepage_info)); + + if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) { + RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n"); + return -1; + } + return 0; +} diff --git a/lib/librte_eal/linux/eal/eal_interrupts.c b/lib/librte_eal/linux/eal/eal_interrupts.c new file mode 100644 index 0000000000..cbac451e11 --- 
/dev/null +++ b/lib/librte_eal/linux/eal/eal_interrupts.c @@ -0,0 +1,1326 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_vfio.h" +#include "eal_thread.h" + +#define EAL_INTR_EPOLL_WAIT_FOREVER (-1) +#define NB_OTHER_INTR 1 + +static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */ + +/** + * union for pipe fds. + */ +union intr_pipefds{ + struct { + int pipefd[2]; + }; + struct { + int readfd; + int writefd; + }; +}; + +/** + * union buffer for reading on different devices + */ +union rte_intr_read_buffer { + int uio_intr_count; /* for uio device */ +#ifdef VFIO_PRESENT + uint64_t vfio_intr_count; /* for vfio device */ +#endif + uint64_t timerfd_num; /* for timerfd */ + char charbuf[16]; /* for others */ +}; + +TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback); +TAILQ_HEAD(rte_intr_source_list, rte_intr_source); + +struct rte_intr_callback { + TAILQ_ENTRY(rte_intr_callback) next; + rte_intr_callback_fn cb_fn; /**< callback address */ + void *cb_arg; /**< parameter for callback */ +}; + +struct rte_intr_source { + TAILQ_ENTRY(rte_intr_source) next; + struct rte_intr_handle intr_handle; /**< interrupt handle */ + struct rte_intr_cb_list callbacks; /**< user callbacks */ + uint32_t active; +}; + +/* global spinlock for interrupt data operation */ +static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER; + +/* union buffer for pipe read/write */ +static union intr_pipefds intr_pipe; + +/* interrupt sources list */ +static struct rte_intr_source_list intr_sources; + +/* interrupt handling thread */ +static pthread_t intr_thread; + +/* VFIO interrupts */ 
+#ifdef VFIO_PRESENT + +#define IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + sizeof(int)) +/* irq set buffer length for queue interrupts and LSC interrupt */ +#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \ + sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1)) + +/* enable legacy (INTx) interrupts */ +static int +vfio_enable_intx(const struct rte_intr_handle *intr_handle) { + struct vfio_irq_set *irq_set; + char irq_set_buf[IRQ_SET_BUF_LEN]; + int len, ret; + int *fd_ptr; + + len = sizeof(irq_set_buf); + + /* enable INTx */ + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 1; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; + irq_set->start = 0; + fd_ptr = (int *) &irq_set->data; + *fd_ptr = intr_handle->fd; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error enabling INTx interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + + /* unmask INTx after enabling */ + memset(irq_set, 0, len); + len = sizeof(struct vfio_irq_set); + irq_set->argsz = len; + irq_set->count = 1; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK; + irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + return 0; +} + +/* disable legacy (INTx) interrupts */ +static int +vfio_disable_intx(const struct rte_intr_handle *intr_handle) { + struct vfio_irq_set *irq_set; + char irq_set_buf[IRQ_SET_BUF_LEN]; + int len, ret; + + len = sizeof(struct vfio_irq_set); + + /* mask interrupts before disabling */ + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 1; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK; + irq_set->index = 
VFIO_PCI_INTX_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error masking INTx interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + + /* disable INTx*/ + memset(irq_set, 0, len); + irq_set->argsz = len; + irq_set->count = 0; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, + "Error disabling INTx interrupts for fd %d\n", intr_handle->fd); + return -1; + } + return 0; +} + +/* enable MSI interrupts */ +static int +vfio_enable_msi(const struct rte_intr_handle *intr_handle) { + int len, ret; + char irq_set_buf[IRQ_SET_BUF_LEN]; + struct vfio_irq_set *irq_set; + int *fd_ptr; + + len = sizeof(irq_set_buf); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 1; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSI_IRQ_INDEX; + irq_set->start = 0; + fd_ptr = (int *) &irq_set->data; + *fd_ptr = intr_handle->fd; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error enabling MSI interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + return 0; +} + +/* disable MSI interrupts */ +static int +vfio_disable_msi(const struct rte_intr_handle *intr_handle) { + struct vfio_irq_set *irq_set; + char irq_set_buf[IRQ_SET_BUF_LEN]; + int len, ret; + + len = sizeof(struct vfio_irq_set); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 0; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSI_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) + RTE_LOG(ERR, EAL, + 
"Error disabling MSI interrupts for fd %d\n", intr_handle->fd); + + return ret; +} + +/* enable MSI-X interrupts */ +static int +vfio_enable_msix(const struct rte_intr_handle *intr_handle) { + int len, ret; + char irq_set_buf[MSIX_IRQ_SET_BUF_LEN]; + struct vfio_irq_set *irq_set; + int *fd_ptr; + + len = sizeof(irq_set_buf); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + /* 0 < irq_set->count < RTE_MAX_RXTX_INTR_VEC_ID + 1 */ + irq_set->count = intr_handle->max_intr ? + (intr_handle->max_intr > RTE_MAX_RXTX_INTR_VEC_ID + 1 ? + RTE_MAX_RXTX_INTR_VEC_ID + 1 : intr_handle->max_intr) : 1; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; + irq_set->start = 0; + fd_ptr = (int *) &irq_set->data; + /* INTR vector offset 0 reserve for non-efds mapping */ + fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = intr_handle->fd; + memcpy(&fd_ptr[RTE_INTR_VEC_RXTX_OFFSET], intr_handle->efds, + sizeof(*intr_handle->efds) * intr_handle->nb_efd); + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error enabling MSI-X interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + + return 0; +} + +/* disable MSI-X interrupts */ +static int +vfio_disable_msix(const struct rte_intr_handle *intr_handle) { + struct vfio_irq_set *irq_set; + char irq_set_buf[MSIX_IRQ_SET_BUF_LEN]; + int len, ret; + + len = sizeof(struct vfio_irq_set); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 0; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) + RTE_LOG(ERR, EAL, + "Error disabling MSI-X interrupts for fd %d\n", intr_handle->fd); + + return ret; +} + +#ifdef HAVE_VFIO_DEV_REQ_INTERFACE +/* enable req notifier */ +static int 
+vfio_enable_req(const struct rte_intr_handle *intr_handle) +{ + int len, ret; + char irq_set_buf[IRQ_SET_BUF_LEN]; + struct vfio_irq_set *irq_set; + int *fd_ptr; + + len = sizeof(irq_set_buf); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 1; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | + VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_REQ_IRQ_INDEX; + irq_set->start = 0; + fd_ptr = (int *) &irq_set->data; + *fd_ptr = intr_handle->fd; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error enabling req interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + + return 0; +} + +/* disable req notifier */ +static int +vfio_disable_req(const struct rte_intr_handle *intr_handle) +{ + struct vfio_irq_set *irq_set; + char irq_set_buf[IRQ_SET_BUF_LEN]; + int len, ret; + + len = sizeof(struct vfio_irq_set); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 0; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_REQ_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) + RTE_LOG(ERR, EAL, "Error disabling req interrupts for fd %d\n", + intr_handle->fd); + + return ret; +} +#endif +#endif + +static int +uio_intx_intr_disable(const struct rte_intr_handle *intr_handle) +{ + unsigned char command_high; + + /* use UIO config file descriptor for uio_pci_generic */ + if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) { + RTE_LOG(ERR, EAL, + "Error reading interrupts status for fd %d\n", + intr_handle->uio_cfg_fd); + return -1; + } + /* disable interrupts */ + command_high |= 0x4; + if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) { + RTE_LOG(ERR, EAL, + "Error disabling interrupts for fd %d\n", + intr_handle->uio_cfg_fd); + return -1; + } + + return 0; +} + +static int 
+uio_intx_intr_enable(const struct rte_intr_handle *intr_handle) +{ + unsigned char command_high; + + /* use UIO config file descriptor for uio_pci_generic */ + if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) { + RTE_LOG(ERR, EAL, + "Error reading interrupts status for fd %d\n", + intr_handle->uio_cfg_fd); + return -1; + } + /* enable interrupts */ + command_high &= ~0x4; + if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) { + RTE_LOG(ERR, EAL, + "Error enabling interrupts for fd %d\n", + intr_handle->uio_cfg_fd); + return -1; + } + + return 0; +} + +static int +uio_intr_disable(const struct rte_intr_handle *intr_handle) +{ + const int value = 0; + + if (write(intr_handle->fd, &value, sizeof(value)) < 0) { + RTE_LOG(ERR, EAL, + "Error disabling interrupts for fd %d (%s)\n", + intr_handle->fd, strerror(errno)); + return -1; + } + return 0; +} + +static int +uio_intr_enable(const struct rte_intr_handle *intr_handle) +{ + const int value = 1; + + if (write(intr_handle->fd, &value, sizeof(value)) < 0) { + RTE_LOG(ERR, EAL, + "Error enabling interrupts for fd %d (%s)\n", + intr_handle->fd, strerror(errno)); + return -1; + } + return 0; +} + +int +rte_intr_callback_register(const struct rte_intr_handle *intr_handle, + rte_intr_callback_fn cb, void *cb_arg) +{ + int ret, wake_thread; + struct rte_intr_source *src; + struct rte_intr_callback *callback; + + wake_thread = 0; + + /* first do parameter checking */ + if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) { + RTE_LOG(ERR, EAL, + "Registering with invalid input parameter\n"); + return -EINVAL; + } + + /* allocate a new interrupt callback entity */ + callback = calloc(1, sizeof(*callback)); + if (callback == NULL) { + RTE_LOG(ERR, EAL, "Can not allocate memory\n"); + return -ENOMEM; + } + callback->cb_fn = cb; + callback->cb_arg = cb_arg; + + rte_spinlock_lock(&intr_lock); + + /* check if there is at least one callback registered for the fd */ + TAILQ_FOREACH(src, &intr_sources, 
next) { + if (src->intr_handle.fd == intr_handle->fd) { + /* we had no interrupts for this */ + if (TAILQ_EMPTY(&src->callbacks)) + wake_thread = 1; + + TAILQ_INSERT_TAIL(&(src->callbacks), callback, next); + ret = 0; + break; + } + } + + /* no existing callbacks for this - add new source */ + if (src == NULL) { + src = calloc(1, sizeof(*src)); + if (src == NULL) { + RTE_LOG(ERR, EAL, "Can not allocate memory\n"); + free(callback); + ret = -ENOMEM; + } else { + src->intr_handle = *intr_handle; + TAILQ_INIT(&src->callbacks); + TAILQ_INSERT_TAIL(&(src->callbacks), callback, next); + TAILQ_INSERT_TAIL(&intr_sources, src, next); + wake_thread = 1; + ret = 0; + } + } + + rte_spinlock_unlock(&intr_lock); + + /** + * check if need to notify the pipe fd waited by epoll_wait to + * rebuild the wait list. + */ + if (wake_thread) + if (write(intr_pipe.writefd, "1", 1) < 0) + return -EPIPE; + + return ret; +} + +int +rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle, + rte_intr_callback_fn cb_fn, void *cb_arg) +{ + int ret; + struct rte_intr_source *src; + struct rte_intr_callback *cb, *next; + + /* do parameter checking first */ + if (intr_handle == NULL || intr_handle->fd < 0) { + RTE_LOG(ERR, EAL, + "Unregistering with invalid input parameter\n"); + return -EINVAL; + } + + rte_spinlock_lock(&intr_lock); + + /* check if the insterrupt source for the fd is existent */ + TAILQ_FOREACH(src, &intr_sources, next) + if (src->intr_handle.fd == intr_handle->fd) + break; + + /* No interrupt source registered for the fd */ + if (src == NULL) { + ret = -ENOENT; + + /* interrupt source has some active callbacks right now. */ + } else if (src->active != 0) { + ret = -EAGAIN; + + /* ok to remove. */ + } else { + ret = 0; + + /*walk through the callbacks and remove all that match. 
*/ + for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) { + + next = TAILQ_NEXT(cb, next); + + if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 || + cb->cb_arg == cb_arg)) { + TAILQ_REMOVE(&src->callbacks, cb, next); + free(cb); + ret++; + } + } + + /* all callbacks for that source are removed. */ + if (TAILQ_EMPTY(&src->callbacks)) { + TAILQ_REMOVE(&intr_sources, src, next); + free(src); + } + } + + rte_spinlock_unlock(&intr_lock); + + /* notify the pipe fd waited by epoll_wait to rebuild the wait list */ + if (ret >= 0 && write(intr_pipe.writefd, "1", 1) < 0) { + ret = -EPIPE; + } + + return ret; +} + +int +rte_intr_enable(const struct rte_intr_handle *intr_handle) +{ + if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV) + return 0; + + if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) + return -1; + + switch (intr_handle->type){ + /* write to the uio fd to enable the interrupt */ + case RTE_INTR_HANDLE_UIO: + if (uio_intr_enable(intr_handle)) + return -1; + break; + case RTE_INTR_HANDLE_UIO_INTX: + if (uio_intx_intr_enable(intr_handle)) + return -1; + break; + /* not used at this moment */ + case RTE_INTR_HANDLE_ALARM: + return -1; +#ifdef VFIO_PRESENT + case RTE_INTR_HANDLE_VFIO_MSIX: + if (vfio_enable_msix(intr_handle)) + return -1; + break; + case RTE_INTR_HANDLE_VFIO_MSI: + if (vfio_enable_msi(intr_handle)) + return -1; + break; + case RTE_INTR_HANDLE_VFIO_LEGACY: + if (vfio_enable_intx(intr_handle)) + return -1; + break; +#ifdef HAVE_VFIO_DEV_REQ_INTERFACE + case RTE_INTR_HANDLE_VFIO_REQ: + if (vfio_enable_req(intr_handle)) + return -1; + break; +#endif +#endif + /* not used at this moment */ + case RTE_INTR_HANDLE_DEV_EVENT: + return -1; + /* unknown handle type */ + default: + RTE_LOG(ERR, EAL, + "Unknown handle type of fd %d\n", + intr_handle->fd); + return -1; + } + + return 0; +} + +int +rte_intr_disable(const struct rte_intr_handle *intr_handle) +{ + if (intr_handle && intr_handle->type == 
RTE_INTR_HANDLE_VDEV) + return 0; + + if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) + return -1; + + switch (intr_handle->type){ + /* write to the uio fd to disable the interrupt */ + case RTE_INTR_HANDLE_UIO: + if (uio_intr_disable(intr_handle)) + return -1; + break; + case RTE_INTR_HANDLE_UIO_INTX: + if (uio_intx_intr_disable(intr_handle)) + return -1; + break; + /* not used at this moment */ + case RTE_INTR_HANDLE_ALARM: + return -1; +#ifdef VFIO_PRESENT + case RTE_INTR_HANDLE_VFIO_MSIX: + if (vfio_disable_msix(intr_handle)) + return -1; + break; + case RTE_INTR_HANDLE_VFIO_MSI: + if (vfio_disable_msi(intr_handle)) + return -1; + break; + case RTE_INTR_HANDLE_VFIO_LEGACY: + if (vfio_disable_intx(intr_handle)) + return -1; + break; +#ifdef HAVE_VFIO_DEV_REQ_INTERFACE + case RTE_INTR_HANDLE_VFIO_REQ: + if (vfio_disable_req(intr_handle)) + return -1; + break; +#endif +#endif + /* not used at this moment */ + case RTE_INTR_HANDLE_DEV_EVENT: + return -1; + /* unknown handle type */ + default: + RTE_LOG(ERR, EAL, + "Unknown handle type of fd %d\n", + intr_handle->fd); + return -1; + } + + return 0; +} + +static int +eal_intr_process_interrupts(struct epoll_event *events, int nfds) +{ + bool call = false; + int n, bytes_read; + struct rte_intr_source *src; + struct rte_intr_callback *cb, *next; + union rte_intr_read_buffer buf; + struct rte_intr_callback active_cb; + + for (n = 0; n < nfds; n++) { + + /** + * if the pipe fd is ready to read, return out to + * rebuild the wait list. + */ + if (events[n].data.fd == intr_pipe.readfd){ + int r = read(intr_pipe.readfd, buf.charbuf, + sizeof(buf.charbuf)); + RTE_SET_USED(r); + return -1; + } + rte_spinlock_lock(&intr_lock); + TAILQ_FOREACH(src, &intr_sources, next) + if (src->intr_handle.fd == + events[n].data.fd) + break; + if (src == NULL){ + rte_spinlock_unlock(&intr_lock); + continue; + } + + /* mark this interrupt source as active and release the lock. 
*/ + src->active = 1; + rte_spinlock_unlock(&intr_lock); + + /* set the length to be read dor different handle type */ + switch (src->intr_handle.type) { + case RTE_INTR_HANDLE_UIO: + case RTE_INTR_HANDLE_UIO_INTX: + bytes_read = sizeof(buf.uio_intr_count); + break; + case RTE_INTR_HANDLE_ALARM: + bytes_read = sizeof(buf.timerfd_num); + break; +#ifdef VFIO_PRESENT + case RTE_INTR_HANDLE_VFIO_MSIX: + case RTE_INTR_HANDLE_VFIO_MSI: + case RTE_INTR_HANDLE_VFIO_LEGACY: + bytes_read = sizeof(buf.vfio_intr_count); + break; +#ifdef HAVE_VFIO_DEV_REQ_INTERFACE + case RTE_INTR_HANDLE_VFIO_REQ: + bytes_read = 0; + call = true; + break; +#endif +#endif + case RTE_INTR_HANDLE_VDEV: + case RTE_INTR_HANDLE_EXT: + bytes_read = 0; + call = true; + break; + case RTE_INTR_HANDLE_DEV_EVENT: + bytes_read = 0; + call = true; + break; + default: + bytes_read = 1; + break; + } + + if (bytes_read > 0) { + /** + * read out to clear the ready-to-be-read flag + * for epoll_wait. + */ + bytes_read = read(events[n].data.fd, &buf, bytes_read); + if (bytes_read < 0) { + if (errno == EINTR || errno == EWOULDBLOCK) + continue; + + RTE_LOG(ERR, EAL, "Error reading from file " + "descriptor %d: %s\n", + events[n].data.fd, + strerror(errno)); + /* + * The device is unplugged or buggy, remove + * it as an interrupt source and return to + * force the wait list to be rebuilt. + */ + rte_spinlock_lock(&intr_lock); + TAILQ_REMOVE(&intr_sources, src, next); + rte_spinlock_unlock(&intr_lock); + + for (cb = TAILQ_FIRST(&src->callbacks); cb; + cb = next) { + next = TAILQ_NEXT(cb, next); + TAILQ_REMOVE(&src->callbacks, cb, next); + free(cb); + } + free(src); + return -1; + } else if (bytes_read == 0) + RTE_LOG(ERR, EAL, "Read nothing from file " + "descriptor %d\n", events[n].data.fd); + else + call = true; + } + + /* grab a lock, again to call callbacks and update status. */ + rte_spinlock_lock(&intr_lock); + + if (call) { + + /* Finally, call all callbacks. 
*/ + TAILQ_FOREACH(cb, &src->callbacks, next) { + + /* make a copy and unlock. */ + active_cb = *cb; + rte_spinlock_unlock(&intr_lock); + + /* call the actual callback */ + active_cb.cb_fn(active_cb.cb_arg); + + /*get the lock back. */ + rte_spinlock_lock(&intr_lock); + } + } + + /* we done with that interrupt source, release it. */ + src->active = 0; + rte_spinlock_unlock(&intr_lock); + } + + return 0; +} + +/** + * It handles all the interrupts. + * + * @param pfd + * epoll file descriptor. + * @param totalfds + * The number of file descriptors added in epoll. + * + * @return + * void + */ +static void +eal_intr_handle_interrupts(int pfd, unsigned totalfds) +{ + struct epoll_event events[totalfds]; + int nfds = 0; + + for(;;) { + nfds = epoll_wait(pfd, events, totalfds, + EAL_INTR_EPOLL_WAIT_FOREVER); + /* epoll_wait fail */ + if (nfds < 0) { + if (errno == EINTR) + continue; + RTE_LOG(ERR, EAL, + "epoll_wait returns with fail\n"); + return; + } + /* epoll_wait timeout, will never happens here */ + else if (nfds == 0) + continue; + /* epoll_wait has at least one fd ready to read */ + if (eal_intr_process_interrupts(events, nfds) < 0) + return; + } +} + +/** + * It builds/rebuilds up the epoll file descriptor with all the + * file descriptors being waited on. Then handles the interrupts. + * + * @param arg + * pointer. 
(unused) + * + * @return + * never return; + */ +static __attribute__((noreturn)) void * +eal_intr_thread_main(__rte_unused void *arg) +{ + struct epoll_event ev; + + /* host thread, never break out */ + for (;;) { + /* build up the epoll fd with all descriptors we are to + * wait on then pass it to the handle_interrupts function + */ + static struct epoll_event pipe_event = { + .events = EPOLLIN | EPOLLPRI, + }; + struct rte_intr_source *src; + unsigned numfds = 0; + + /* create epoll fd */ + int pfd = epoll_create(1); + if (pfd < 0) + rte_panic("Cannot create epoll instance\n"); + + pipe_event.data.fd = intr_pipe.readfd; + /** + * add pipe fd into wait list, this pipe is used to + * rebuild the wait list. + */ + if (epoll_ctl(pfd, EPOLL_CTL_ADD, intr_pipe.readfd, + &pipe_event) < 0) { + rte_panic("Error adding fd to %d epoll_ctl, %s\n", + intr_pipe.readfd, strerror(errno)); + } + numfds++; + + rte_spinlock_lock(&intr_lock); + + TAILQ_FOREACH(src, &intr_sources, next) { + if (src->callbacks.tqh_first == NULL) + continue; /* skip those with no callbacks */ + ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP; + ev.data.fd = src->intr_handle.fd; + + /** + * add all the uio device file descriptor + * into wait list. + */ + if (epoll_ctl(pfd, EPOLL_CTL_ADD, + src->intr_handle.fd, &ev) < 0){ + rte_panic("Error adding fd %d epoll_ctl, %s\n", + src->intr_handle.fd, strerror(errno)); + } + else + numfds++; + } + rte_spinlock_unlock(&intr_lock); + /* serve the interrupt */ + eal_intr_handle_interrupts(pfd, numfds); + + /** + * when we return, we need to rebuild the + * list of fds to monitor. + */ + close(pfd); + } +} + +int +rte_eal_intr_init(void) +{ + int ret = 0; + + /* init the global interrupt source head */ + TAILQ_INIT(&intr_sources); + + /** + * create a pipe which will be waited by epoll and notified to + * rebuild the wait list of epoll. 
+ */ + if (pipe(intr_pipe.pipefd) < 0) { + rte_errno = errno; + return -1; + } + + /* create the host thread to wait/handle the interrupt */ + ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL, + eal_intr_thread_main, NULL); + if (ret != 0) { + rte_errno = -ret; + RTE_LOG(ERR, EAL, + "Failed to create thread for interrupt handling\n"); + } + + return ret; +} + +static void +eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle) +{ + union rte_intr_read_buffer buf; + int bytes_read = 0; + int nbytes; + + switch (intr_handle->type) { + case RTE_INTR_HANDLE_UIO: + case RTE_INTR_HANDLE_UIO_INTX: + bytes_read = sizeof(buf.uio_intr_count); + break; +#ifdef VFIO_PRESENT + case RTE_INTR_HANDLE_VFIO_MSIX: + case RTE_INTR_HANDLE_VFIO_MSI: + case RTE_INTR_HANDLE_VFIO_LEGACY: + bytes_read = sizeof(buf.vfio_intr_count); + break; +#endif + case RTE_INTR_HANDLE_VDEV: + bytes_read = intr_handle->efd_counter_size; + /* For vdev, number of bytes to read is set by driver */ + break; + case RTE_INTR_HANDLE_EXT: + return; + default: + bytes_read = 1; + RTE_LOG(INFO, EAL, "unexpected intr type\n"); + break; + } + + /** + * read out to clear the ready-to-be-read flag + * for epoll_wait. 
+ */ + if (bytes_read == 0) + return; + do { + nbytes = read(fd, &buf, bytes_read); + if (nbytes < 0) { + if (errno == EINTR || errno == EWOULDBLOCK || + errno == EAGAIN) + continue; + RTE_LOG(ERR, EAL, + "Error reading from fd %d: %s\n", + fd, strerror(errno)); + } else if (nbytes == 0) + RTE_LOG(ERR, EAL, "Read nothing from fd %d\n", fd); + return; + } while (1); +} + +static int +eal_epoll_process_event(struct epoll_event *evs, unsigned int n, + struct rte_epoll_event *events) +{ + unsigned int i, count = 0; + struct rte_epoll_event *rev; + + for (i = 0; i < n; i++) { + rev = evs[i].data.ptr; + if (!rev || !rte_atomic32_cmpset(&rev->status, RTE_EPOLL_VALID, + RTE_EPOLL_EXEC)) + continue; + + events[count].status = RTE_EPOLL_VALID; + events[count].fd = rev->fd; + events[count].epfd = rev->epfd; + events[count].epdata.event = rev->epdata.event; + events[count].epdata.data = rev->epdata.data; + if (rev->epdata.cb_fun) + rev->epdata.cb_fun(rev->fd, + rev->epdata.cb_arg); + + rte_compiler_barrier(); + rev->status = RTE_EPOLL_VALID; + count++; + } + return count; +} + +static inline int +eal_init_tls_epfd(void) +{ + int pfd = epoll_create(255); + + if (pfd < 0) { + RTE_LOG(ERR, EAL, + "Cannot create epoll instance\n"); + return -1; + } + return pfd; +} + +int +rte_intr_tls_epfd(void) +{ + if (RTE_PER_LCORE(_epfd) == -1) + RTE_PER_LCORE(_epfd) = eal_init_tls_epfd(); + + return RTE_PER_LCORE(_epfd); +} + +int +rte_epoll_wait(int epfd, struct rte_epoll_event *events, + int maxevents, int timeout) +{ + struct epoll_event evs[maxevents]; + int rc; + + if (!events) { + RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n"); + return -1; + } + + /* using per thread epoll fd */ + if (epfd == RTE_EPOLL_PER_THREAD) + epfd = rte_intr_tls_epfd(); + + while (1) { + rc = epoll_wait(epfd, evs, maxevents, timeout); + if (likely(rc > 0)) { + /* epoll_wait has at least one fd ready to read */ + rc = eal_epoll_process_event(evs, rc, events); + break; + } else if (rc < 0) { + if (errno == 
EINTR) + continue; + /* epoll_wait fail */ + RTE_LOG(ERR, EAL, "epoll_wait returns with fail %s\n", + strerror(errno)); + rc = -1; + break; + } else { + /* rc == 0, epoll_wait timed out */ + break; + } + } + + return rc; +} + +static inline void +eal_epoll_data_safe_free(struct rte_epoll_event *ev) +{ + while (!rte_atomic32_cmpset(&ev->status, RTE_EPOLL_VALID, + RTE_EPOLL_INVALID)) + while (ev->status != RTE_EPOLL_VALID) + rte_pause(); + memset(&ev->epdata, 0, sizeof(ev->epdata)); + ev->fd = -1; + ev->epfd = -1; +} + +int +rte_epoll_ctl(int epfd, int op, int fd, + struct rte_epoll_event *event) +{ + struct epoll_event ev; + + if (!event) { + RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n"); + return -1; + } + + /* using per thread epoll fd */ + if (epfd == RTE_EPOLL_PER_THREAD) + epfd = rte_intr_tls_epfd(); + + if (op == EPOLL_CTL_ADD) { + event->status = RTE_EPOLL_VALID; + event->fd = fd; /* ignore fd in event */ + event->epfd = epfd; + ev.data.ptr = (void *)event; + } + + ev.events = event->epdata.event; + if (epoll_ctl(epfd, op, fd, &ev) < 0) { + RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n", + op, fd, strerror(errno)); + if (op == EPOLL_CTL_ADD) + /* rollback status when CTL_ADD fail */ + event->status = RTE_EPOLL_INVALID; + return -1; + } + + if (op == EPOLL_CTL_DEL && event->status != RTE_EPOLL_INVALID) + eal_epoll_data_safe_free(event); + + return 0; +} + +int +rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd, + int op, unsigned int vec, void *data) +{ + struct rte_epoll_event *rev; + struct rte_epoll_data *epdata; + int epfd_op; + unsigned int efd_idx; + int rc = 0; + + efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ? 
+ (vec - RTE_INTR_VEC_RXTX_OFFSET) : vec; + + if (!intr_handle || intr_handle->nb_efd == 0 || + efd_idx >= intr_handle->nb_efd) { + RTE_LOG(ERR, EAL, "Wrong intr vector number.\n"); + return -EPERM; + } + + switch (op) { + case RTE_INTR_EVENT_ADD: + epfd_op = EPOLL_CTL_ADD; + rev = &intr_handle->elist[efd_idx]; + if (rev->status != RTE_EPOLL_INVALID) { + RTE_LOG(INFO, EAL, "Event already been added.\n"); + return -EEXIST; + } + + /* attach to intr vector fd */ + epdata = &rev->epdata; + epdata->event = EPOLLIN | EPOLLPRI | EPOLLET; + epdata->data = data; + epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr; + epdata->cb_arg = (void *)intr_handle; + rc = rte_epoll_ctl(epfd, epfd_op, + intr_handle->efds[efd_idx], rev); + if (!rc) + RTE_LOG(DEBUG, EAL, + "efd %d associated with vec %d added on epfd %d" + "\n", rev->fd, vec, epfd); + else + rc = -EPERM; + break; + case RTE_INTR_EVENT_DEL: + epfd_op = EPOLL_CTL_DEL; + rev = &intr_handle->elist[efd_idx]; + if (rev->status == RTE_EPOLL_INVALID) { + RTE_LOG(INFO, EAL, "Event does not exist.\n"); + return -EPERM; + } + + rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev); + if (rc) + rc = -EPERM; + break; + default: + RTE_LOG(ERR, EAL, "event op type mismatch\n"); + rc = -EPERM; + } + + return rc; +} + +void +rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle) +{ + uint32_t i; + struct rte_epoll_event *rev; + + for (i = 0; i < intr_handle->nb_efd; i++) { + rev = &intr_handle->elist[i]; + if (rev->status == RTE_EPOLL_INVALID) + continue; + if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) { + /* force free if the entry valid */ + eal_epoll_data_safe_free(rev); + rev->status = RTE_EPOLL_INVALID; + } + } +} + +int +rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd) +{ + uint32_t i; + int fd; + uint32_t n = RTE_MIN(nb_efd, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID); + + assert(nb_efd != 0); + + if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX) { + for (i = 0; i < n; i++) { + fd = 
eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); + if (fd < 0) { + RTE_LOG(ERR, EAL, + "can't setup eventfd, error %i (%s)\n", + errno, strerror(errno)); + return -errno; + } + intr_handle->efds[i] = fd; + } + intr_handle->nb_efd = n; + intr_handle->max_intr = NB_OTHER_INTR + n; + } else if (intr_handle->type == RTE_INTR_HANDLE_VDEV) { + /* only check, initialization would be done in vdev driver.*/ + if (intr_handle->efd_counter_size > + sizeof(union rte_intr_read_buffer)) { + RTE_LOG(ERR, EAL, "the efd_counter_size is oversized"); + return -EINVAL; + } + } else { + intr_handle->efds[0] = intr_handle->fd; + intr_handle->nb_efd = RTE_MIN(nb_efd, 1U); + intr_handle->max_intr = NB_OTHER_INTR; + } + + return 0; +} + +void +rte_intr_efd_disable(struct rte_intr_handle *intr_handle) +{ + uint32_t i; + + rte_intr_free_epoll_fd(intr_handle); + if (intr_handle->max_intr > intr_handle->nb_efd) { + for (i = 0; i < intr_handle->nb_efd; i++) + close(intr_handle->efds[i]); + } + intr_handle->nb_efd = 0; + intr_handle->max_intr = 0; +} + +int +rte_intr_dp_is_en(struct rte_intr_handle *intr_handle) +{ + return !(!intr_handle->nb_efd); +} + +int +rte_intr_allow_others(struct rte_intr_handle *intr_handle) +{ + if (!rte_intr_dp_is_en(intr_handle)) + return 1; + else + return !!(intr_handle->max_intr - intr_handle->nb_efd); +} + +int +rte_intr_cap_multiple(struct rte_intr_handle *intr_handle) +{ + if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX) + return 1; + + if (intr_handle->type == RTE_INTR_HANDLE_VDEV) + return 1; + + return 0; +} diff --git a/lib/librte_eal/linux/eal/eal_lcore.c b/lib/librte_eal/linux/eal/eal_lcore.c new file mode 100644 index 0000000000..bc8965844c --- /dev/null +++ b/lib/librte_eal/linux/eal/eal_lcore.c @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include 
"eal_filesystem.h" +#include "eal_thread.h" + +#define SYS_CPU_DIR "/sys/devices/system/cpu/cpu%u" +#define CORE_ID_FILE "topology/core_id" +#define NUMA_NODE_PATH "/sys/devices/system/node" + +/* Check if a cpu is present by the presence of the cpu information for it */ +int +eal_cpu_detected(unsigned lcore_id) +{ + char path[PATH_MAX]; + int len = snprintf(path, sizeof(path), SYS_CPU_DIR + "/"CORE_ID_FILE, lcore_id); + if (len <= 0 || (unsigned)len >= sizeof(path)) + return 0; + if (access(path, F_OK) != 0) + return 0; + + return 1; +} + +/* + * Get CPU socket id (NUMA node) for a logical core. + * + * This searches each nodeX directories in /sys for the symlink for the given + * lcore_id and returns the numa node where the lcore is found. If lcore is not + * found on any numa node, returns zero. + */ +unsigned +eal_cpu_socket_id(unsigned lcore_id) +{ + unsigned socket; + + for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) { + char path[PATH_MAX]; + + snprintf(path, sizeof(path), "%s/node%u/cpu%u", NUMA_NODE_PATH, + socket, lcore_id); + if (access(path, F_OK) == 0) + return socket; + } + return 0; +} + +/* Get the cpu core id value from the /sys/.../cpuX core_id value */ +unsigned +eal_cpu_core_id(unsigned lcore_id) +{ + char path[PATH_MAX]; + unsigned long id; + + int len = snprintf(path, sizeof(path), SYS_CPU_DIR "/%s", lcore_id, CORE_ID_FILE); + if (len <= 0 || (unsigned)len >= sizeof(path)) + goto err; + if (eal_parse_sysfs_value(path, &id) != 0) + goto err; + return (unsigned)id; + +err: + RTE_LOG(ERR, EAL, "Error reading core id value from %s " + "for lcore %u - assuming core 0\n", SYS_CPU_DIR, lcore_id); + return 0; +} diff --git a/lib/librte_eal/linux/eal/eal_log.c b/lib/librte_eal/linux/eal/eal_log.c new file mode 100644 index 0000000000..9d02dddbed --- /dev/null +++ b/lib/librte_eal/linux/eal/eal_log.c @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include 
+#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" + +/* + * default log function + */ +static ssize_t +console_log_write(__attribute__((unused)) void *c, const char *buf, size_t size) +{ + ssize_t ret; + + /* write on stdout */ + ret = fwrite(buf, 1, size, stdout); + fflush(stdout); + + /* Syslog error levels are from 0 to 7, so subtract 1 to convert */ + syslog(rte_log_cur_msg_loglevel() - 1, "%.*s", (int)size, buf); + + return ret; +} + +static cookie_io_functions_t console_log_func = { + .write = console_log_write, +}; + +/* + * set the log to default function, called during eal init process, + * once memzones are available. + */ +int +rte_eal_log_init(const char *id, int facility) +{ + FILE *log_stream; + + log_stream = fopencookie(NULL, "w+", console_log_func); + if (log_stream == NULL) + return -1; + + openlog(id, LOG_NDELAY | LOG_PID, facility); + + eal_log_set_default(log_stream); + + return 0; +} diff --git a/lib/librte_eal/linux/eal/eal_memalloc.c b/lib/librte_eal/linux/eal/eal_memalloc.c new file mode 100644 index 0000000000..b6fb183db4 --- /dev/null +++ b/lib/librte_eal/linux/eal/eal_memalloc.c @@ -0,0 +1,1685 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ + +#define _FILE_OFFSET_BITS 64 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */ +#include +#define MEMFD_SUPPORTED +#endif +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES +#include +#include +#endif +#include +#include /* for hugetlb-related mmap flags */ + +#include +#include +#include +#include +#include +#include +#include + +#include "eal_filesystem.h" +#include "eal_internal_cfg.h" +#include "eal_memalloc.h" +#include "eal_private.h" + +const int 
anonymous_hugepages_supported = +#ifdef MAP_HUGE_SHIFT + 1; +#define RTE_MAP_HUGE_SHIFT MAP_HUGE_SHIFT +#else + 0; +#define RTE_MAP_HUGE_SHIFT 26 +#endif + +/* + * we've already checked memfd support at compile-time, but we also need to + * check if we can create hugepage files with memfd. + * + * also, this is not a constant, because while we may be *compiled* with memfd + * hugetlbfs support, we might not be *running* on a system that supports memfd + * and/or memfd with hugetlbfs, so we need to be able to adjust this flag at + * runtime, and fall back to anonymous memory. + */ +static int memfd_create_supported = +#ifdef MFD_HUGETLB + 1; +#define RTE_MFD_HUGETLB MFD_HUGETLB +#else + 0; +#define RTE_MFD_HUGETLB 4U +#endif + +/* + * not all kernel version support fallocate on hugetlbfs, so fall back to + * ftruncate and disallow deallocation if fallocate is not supported. + */ +static int fallocate_supported = -1; /* unknown */ + +/* + * we have two modes - single file segments, and file-per-page mode. + * + * for single-file segments, we need some kind of mechanism to keep track of + * which hugepages can be freed back to the system, and which cannot. we cannot + * use flock() because they don't allow locking parts of a file, and we cannot + * use fcntl() due to issues with their semantics, so we will have to rely on a + * bunch of lockfiles for each page. so, we will use 'fds' array to keep track + * of per-page lockfiles. we will store the actual segment list fd in the + * 'memseg_list_fd' field. + * + * for file-per-page mode, each page will have its own fd, so 'memseg_list_fd' + * will be invalid (set to -1), and we'll use 'fds' to keep track of page fd's. + * + * we cannot know how many pages a system will have in advance, but we do know + * that they come in lists, and we know lengths of these lists. so, simply store + * a malloc'd array of fd's indexed by list and segment index. 
+ * + * they will be initialized at startup, and filled as we allocate/deallocate + * segments. + */ +static struct { + int *fds; /**< dynamically allocated array of segment lock fd's */ + int memseg_list_fd; /**< memseg list fd */ + int len; /**< total length of the array */ + int count; /**< entries used in an array */ +} fd_list[RTE_MAX_MEMSEG_LISTS]; + +/** local copy of a memory map, used to synchronize memory hotplug in MP */ +static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS]; + +static sigjmp_buf huge_jmpenv; + +static void __rte_unused huge_sigbus_handler(int signo __rte_unused) +{ + siglongjmp(huge_jmpenv, 1); +} + +/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile, + * non-static local variable in the stack frame calling sigsetjmp might be + * clobbered by a call to longjmp. + */ +static int __rte_unused huge_wrap_sigsetjmp(void) +{ + return sigsetjmp(huge_jmpenv, 1); +} + +static struct sigaction huge_action_old; +static int huge_need_recover; + +static void __rte_unused +huge_register_sigbus(void) +{ + sigset_t mask; + struct sigaction action; + + sigemptyset(&mask); + sigaddset(&mask, SIGBUS); + action.sa_flags = 0; + action.sa_mask = mask; + action.sa_handler = huge_sigbus_handler; + + huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old); +} + +static void __rte_unused +huge_recover_sigbus(void) +{ + if (huge_need_recover) { + sigaction(SIGBUS, &huge_action_old, NULL); + huge_need_recover = 0; + } +} + +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES +static bool +check_numa(void) +{ + bool ret = true; + /* Check if kernel supports NUMA. 
*/ + if (numa_available() != 0) { + RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n"); + ret = false; + } + return ret; +} + +static void +prepare_numa(int *oldpolicy, struct bitmask *oldmask, int socket_id) +{ + RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n"); + if (get_mempolicy(oldpolicy, oldmask->maskp, + oldmask->size + 1, 0, 0) < 0) { + RTE_LOG(ERR, EAL, + "Failed to get current mempolicy: %s. " + "Assuming MPOL_DEFAULT.\n", strerror(errno)); + *oldpolicy = MPOL_DEFAULT; + } + RTE_LOG(DEBUG, EAL, + "Setting policy MPOL_PREFERRED for socket %d\n", + socket_id); + numa_set_preferred(socket_id); +} + +static void +restore_numa(int *oldpolicy, struct bitmask *oldmask) +{ + RTE_LOG(DEBUG, EAL, + "Restoring previous memory policy: %d\n", *oldpolicy); + if (*oldpolicy == MPOL_DEFAULT) { + numa_set_localalloc(); + } else if (set_mempolicy(*oldpolicy, oldmask->maskp, + oldmask->size + 1) < 0) { + RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n", + strerror(errno)); + numa_set_localalloc(); + } + numa_free_cpumask(oldmask); +} +#endif + +/* + * uses fstat to report the size of a file on disk + */ +static off_t +get_file_size(int fd) +{ + struct stat st; + if (fstat(fd, &st) < 0) + return 0; + return st.st_size; +} + +static int +pagesz_flags(uint64_t page_sz) +{ + /* as per mmap() manpage, all page sizes are log2 of page size + * shifted by MAP_HUGE_SHIFT + */ + int log2 = rte_log2_u64(page_sz); + return log2 << RTE_MAP_HUGE_SHIFT; +} + +/* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */ +static int lock(int fd, int type) +{ + int ret; + + /* flock may be interrupted */ + do { + ret = flock(fd, type | LOCK_NB); + } while (ret && errno == EINTR); + + if (ret && errno == EWOULDBLOCK) { + /* couldn't lock */ + return 0; + } else if (ret) { + RTE_LOG(ERR, EAL, "%s(): error calling flock(): %s\n", + __func__, strerror(errno)); + return -1; + } + /* lock was successful */ + return 1; +} + +static int get_segment_lock_fd(int list_idx, 
int seg_idx) +{ + char path[PATH_MAX] = {0}; + int fd; + + if (list_idx < 0 || list_idx >= (int)RTE_DIM(fd_list)) + return -1; + if (seg_idx < 0 || seg_idx >= fd_list[list_idx].len) + return -1; + + fd = fd_list[list_idx].fds[seg_idx]; + /* does this lock already exist? */ + if (fd >= 0) + return fd; + + eal_get_hugefile_lock_path(path, sizeof(path), + list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx); + + fd = open(path, O_CREAT | O_RDWR, 0660); + if (fd < 0) { + RTE_LOG(ERR, EAL, "%s(): error creating lockfile '%s': %s\n", + __func__, path, strerror(errno)); + return -1; + } + /* take out a read lock */ + if (lock(fd, LOCK_SH) != 1) { + RTE_LOG(ERR, EAL, "%s(): failed to take out a readlock on '%s': %s\n", + __func__, path, strerror(errno)); + close(fd); + return -1; + } + /* store it for future reference */ + fd_list[list_idx].fds[seg_idx] = fd; + fd_list[list_idx].count++; + return fd; +} + +static int unlock_segment(int list_idx, int seg_idx) +{ + int fd, ret; + + if (list_idx < 0 || list_idx >= (int)RTE_DIM(fd_list)) + return -1; + if (seg_idx < 0 || seg_idx >= fd_list[list_idx].len) + return -1; + + fd = fd_list[list_idx].fds[seg_idx]; + + /* upgrade lock to exclusive to see if we can remove the lockfile */ + ret = lock(fd, LOCK_EX); + if (ret == 1) { + /* we've succeeded in taking exclusive lock, this lockfile may + * be removed. + */ + char path[PATH_MAX] = {0}; + eal_get_hugefile_lock_path(path, sizeof(path), + list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx); + if (unlink(path)) { + RTE_LOG(ERR, EAL, "%s(): error removing lockfile '%s': %s\n", + __func__, path, strerror(errno)); + } + } + /* we don't want to leak the fd, so even if we fail to lock, close fd + * and remove it from list anyway. 
+ */ + close(fd); + fd_list[list_idx].fds[seg_idx] = -1; + fd_list[list_idx].count--; + + if (ret < 0) + return -1; + return 0; +} + +static int +get_seg_memfd(struct hugepage_info *hi __rte_unused, + unsigned int list_idx __rte_unused, + unsigned int seg_idx __rte_unused) +{ +#ifdef MEMFD_SUPPORTED + int fd; + char segname[250]; /* as per manpage, limit is 249 bytes plus null */ + + int flags = RTE_MFD_HUGETLB | pagesz_flags(hi->hugepage_sz); + + if (internal_config.single_file_segments) { + fd = fd_list[list_idx].memseg_list_fd; + + if (fd < 0) { + snprintf(segname, sizeof(segname), "seg_%i", list_idx); + fd = memfd_create(segname, flags); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n", + __func__, strerror(errno)); + return -1; + } + fd_list[list_idx].memseg_list_fd = fd; + } + } else { + fd = fd_list[list_idx].fds[seg_idx]; + + if (fd < 0) { + snprintf(segname, sizeof(segname), "seg_%i-%i", + list_idx, seg_idx); + fd = memfd_create(segname, flags); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n", + __func__, strerror(errno)); + return -1; + } + fd_list[list_idx].fds[seg_idx] = fd; + } + } + return fd; +#endif + return -1; +} + +static int +get_seg_fd(char *path, int buflen, struct hugepage_info *hi, + unsigned int list_idx, unsigned int seg_idx) +{ + int fd; + + /* for in-memory mode, we only make it here when we're sure we support + * memfd, and this is a special case. 
+ */ + if (internal_config.in_memory) + return get_seg_memfd(hi, list_idx, seg_idx); + + if (internal_config.single_file_segments) { + /* create a hugepage file path */ + eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx); + + fd = fd_list[list_idx].memseg_list_fd; + + if (fd < 0) { + fd = open(path, O_CREAT | O_RDWR, 0600); + if (fd < 0) { + RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", + __func__, strerror(errno)); + return -1; + } + /* take out a read lock and keep it indefinitely */ + if (lock(fd, LOCK_SH) < 0) { + RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n", + __func__, strerror(errno)); + close(fd); + return -1; + } + fd_list[list_idx].memseg_list_fd = fd; + } + } else { + /* create a hugepage file path */ + eal_get_hugefile_path(path, buflen, hi->hugedir, + list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx); + + fd = fd_list[list_idx].fds[seg_idx]; + + if (fd < 0) { + fd = open(path, O_CREAT | O_RDWR, 0600); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", + __func__, strerror(errno)); + return -1; + } + /* take out a read lock */ + if (lock(fd, LOCK_SH) < 0) { + RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n", + __func__, strerror(errno)); + close(fd); + return -1; + } + fd_list[list_idx].fds[seg_idx] = fd; + } + } + return fd; +} + +static int +resize_hugefile(int fd, char *path, int list_idx, int seg_idx, + uint64_t fa_offset, uint64_t page_sz, bool grow) +{ + bool again = false; + + /* in-memory mode is a special case, because we don't need to perform + * any locking, and we can be sure that fallocate() is supported. + */ + if (internal_config.in_memory) { + int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_KEEP_SIZE; + int ret; + + /* grow or shrink the file */ + ret = fallocate(fd, flags, fa_offset, page_sz); + + if (ret < 0) { + RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n", + __func__, + strerror(errno)); + return -1; + } + /* increase/decrease total segment count */ + fd_list[list_idx].count += (grow ? 
1 : -1); + if (!grow && fd_list[list_idx].count == 0) { + close(fd_list[list_idx].memseg_list_fd); + fd_list[list_idx].memseg_list_fd = -1; + } + return 0; + } + + do { + if (fallocate_supported == 0) { + /* we cannot deallocate memory if fallocate() is not + * supported, and hugepage file is already locked at + * creation, so no further synchronization needed. + */ + + if (!grow) { + RTE_LOG(DEBUG, EAL, "%s(): fallocate not supported, not freeing page back to the system\n", + __func__); + return -1; + } + uint64_t new_size = fa_offset + page_sz; + uint64_t cur_size = get_file_size(fd); + + /* fallocate isn't supported, fall back to ftruncate */ + if (new_size > cur_size && + ftruncate(fd, new_size) < 0) { + RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n", + __func__, strerror(errno)); + return -1; + } + } else { + int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_KEEP_SIZE; + int ret, lock_fd; + + /* if fallocate() is supported, we need to take out a + * read lock on allocate (to prevent other processes + * from deallocating this page), and take out a write + * lock on deallocate (to ensure nobody else is using + * this page). + * + * read locks on page itself are already taken out at + * file creation, in get_seg_fd(). + * + * we cannot rely on simple use of flock() call, because + * we need to be able to lock a section of the file, + * and we cannot use fcntl() locks, because of numerous + * problems with their semantics, so we will use + * deterministically named lock files for each section + * of the file. + * + * if we're shrinking the file, we want to upgrade our + * lock from shared to exclusive. + * + * lock_fd is an fd for a lockfile, not for the segment + * list. + */ + lock_fd = get_segment_lock_fd(list_idx, seg_idx); + + if (!grow) { + /* we are using this lockfile to determine + * whether this particular page is locked, as we + * are in single file segments mode and thus + * cannot use regular flock() to get this info. 
+ * + * we want to try and take out an exclusive lock + * on the lock file to determine if we're the + * last ones using this page, and if not, we + * won't be shrinking it, and will instead exit + * prematurely. + */ + ret = lock(lock_fd, LOCK_EX); + + /* drop the lock on the lockfile, so that even + * if we couldn't shrink the file ourselves, we + * are signalling to other processes that we're + * no longer using this page. + */ + if (unlock_segment(list_idx, seg_idx)) + RTE_LOG(ERR, EAL, "Could not unlock segment\n"); + + /* additionally, if this was the last lock on + * this segment list, we can safely close the + * page file fd, so that one of the processes + * could then delete the file after shrinking. + */ + if (ret < 1 && fd_list[list_idx].count == 0) { + close(fd); + fd_list[list_idx].memseg_list_fd = -1; + } + + if (ret < 0) { + RTE_LOG(ERR, EAL, "Could not lock segment\n"); + return -1; + } + if (ret == 0) + /* failed to lock, not an error. */ + return 0; + } + + /* grow or shrink the file */ + ret = fallocate(fd, flags, fa_offset, page_sz); + + if (ret < 0) { + if (fallocate_supported == -1 && + errno == ENOTSUP) { + RTE_LOG(ERR, EAL, "%s(): fallocate() not supported, hugepage deallocation will be disabled\n", + __func__); + again = true; + fallocate_supported = 0; + } else { + RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n", + __func__, + strerror(errno)); + return -1; + } + } else { + fallocate_supported = 1; + + /* we've grew/shrunk the file, and we hold an + * exclusive lock now. check if there are no + * more segments active in this segment list, + * and remove the file if there aren't. 
/*
 * Map a single hugepage at the fixed address 'addr' and, on success, fill in
 * the memseg descriptor 'ms' for it.
 *
 * ms       - descriptor to populate (addr, sizes, IOVA, socket) on success
 * addr     - virtual address the page must end up at (mapped with MAP_FIXED)
 * socket_id- NUMA socket the page is expected on; only verified when
 *            RTE_EAL_NUMA_AWARE_HUGEPAGES is defined
 * hi       - hugepage_info entry supplying the page size (hi->hugepage_sz)
 * list_idx - index of the memseg list, used for fd bookkeeping
 * seg_idx  - index of the segment inside that list; also determines the
 *            file offset in single-file-segments mode
 *
 * Returns 0 on success, -1 on any failure. The failure labels unwind in
 * order: 'mapped' unmaps the page, 'unmapped' re-reserves the address range
 * via eal_get_virtual_area(), 'resized' releases the backing file/fd.
 */
static int
alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
		struct hugepage_info *hi, unsigned int list_idx,
		unsigned int seg_idx)
{
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	int cur_socket_id = 0;
#endif
	uint64_t map_offset;
	rte_iova_t iova;
	void *va;
	char path[PATH_MAX];
	int ret = 0;
	int fd;
	size_t alloc_sz;
	int flags;
	void *new_addr;

	alloc_sz = hi->hugepage_sz;

	/* these are checked at init, but code analyzers don't know that */
	if (internal_config.in_memory && !anonymous_hugepages_supported) {
		RTE_LOG(ERR, EAL, "Anonymous hugepages not supported, in-memory mode cannot allocate memory\n");
		return -1;
	}
	if (internal_config.in_memory && !memfd_create_supported &&
			internal_config.single_file_segments) {
		RTE_LOG(ERR, EAL, "Single-file segments are not supported without memfd support\n");
		return -1;
	}

	/* in-memory without memfd is a special case */
	int mmap_flags;

	if (internal_config.in_memory && !memfd_create_supported) {
		const int in_memory_flags = MAP_HUGETLB | MAP_FIXED |
				MAP_PRIVATE | MAP_ANONYMOUS;
		int pagesz_flag;

		pagesz_flag = pagesz_flags(alloc_sz);
		/* anonymous mapping: no backing fd, so the 'resized' cleanup
		 * below returns early for this path
		 */
		fd = -1;
		mmap_flags = in_memory_flags | pagesz_flag;

		/* single-file segments codepath will never be active
		 * here because in-memory mode is incompatible with the
		 * fallback path, and it's stopped at EAL initialization
		 * stage.
		 */
		map_offset = 0;
	} else {
		/* takes out a read lock on segment or segment list */
		fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
		if (fd < 0) {
			RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n");
			return -1;
		}

		if (internal_config.single_file_segments) {
			/* one file per list: this segment lives at its own
			 * page-sized offset inside that file
			 */
			map_offset = seg_idx * alloc_sz;
			ret = resize_hugefile(fd, path, list_idx, seg_idx,
					map_offset, alloc_sz, true);
			if (ret < 0)
				goto resized;
		} else {
			/* one file per page: size the file to a single page */
			map_offset = 0;
			if (ftruncate(fd, alloc_sz) < 0) {
				RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
					__func__, strerror(errno));
				goto resized;
			}
			if (internal_config.hugepage_unlink &&
					!internal_config.in_memory) {
				if (unlink(path)) {
					RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n",
						__func__, strerror(errno));
					goto resized;
				}
			}
		}
		mmap_flags = MAP_SHARED | MAP_POPULATE | MAP_FIXED;
	}

	/*
	 * map the segment, and populate page tables, the kernel fills
	 * this segment with zeros if it's a new page.
	 */
	va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, mmap_flags, fd,
			map_offset);

	if (va == MAP_FAILED) {
		RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__,
			strerror(errno));
		/* mmap failed, but the previous region might have been
		 * unmapped anyway. try to remap it
		 */
		goto unmapped;
	}
	if (va != addr) {
		RTE_LOG(DEBUG, EAL, "%s(): wrong mmap() address\n", __func__);
		munmap(va, alloc_sz);
		goto resized;
	}

	/* In linux, hugetlb limitations, like cgroup, are
	 * enforced at fault time instead of mmap(), even
	 * with the option of MAP_POPULATE. Kernel will send
	 * a SIGBUS signal. To avoid to be killed, save stack
	 * environment here, if SIGBUS happens, we can jump
	 * back here.
	 */
	if (huge_wrap_sigsetjmp()) {
		RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more hugepages of size %uMB\n",
			(unsigned int)(alloc_sz >> 20));
		goto mapped;
	}

	/* we need to trigger a write to the page to enforce page fault and
	 * ensure that page is accessible to us, but we can't overwrite value
	 * that is already there, so read the old value, and write it back.
	 * kernel populates the page with zeroes initially.
	 */
	*(volatile int *)addr = *(volatile int *)addr;

	iova = rte_mem_virt2iova(addr);
	if (iova == RTE_BAD_PHYS_ADDR) {
		RTE_LOG(DEBUG, EAL, "%s(): can't get IOVA addr\n",
			__func__);
		goto mapped;
	}

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
	/* with a NULL 'nodes' argument move_pages() only reports, in
	 * cur_socket_id, the node the page currently resides on
	 */
	move_pages(getpid(), 1, &addr, NULL, &cur_socket_id, 0);

	if (cur_socket_id != socket_id) {
		RTE_LOG(DEBUG, EAL,
				"%s(): allocation happened on wrong socket (wanted %d, got %d)\n",
			__func__, socket_id, cur_socket_id);
		goto mapped;
	}
#endif

	ms->addr = addr;
	ms->hugepage_sz = alloc_sz;
	ms->len = alloc_sz;
	ms->nchannel = rte_memory_get_nchannel();
	ms->nrank = rte_memory_get_nrank();
	ms->iova = iova;
	ms->socket_id = socket_id;

	return 0;

mapped:
	/* the page was mapped but is unusable: drop the mapping first */
	munmap(addr, alloc_sz);
unmapped:
	/* put a reservation back at 'addr' so the address range stays ours */
	flags = MAP_FIXED;
	new_addr = eal_get_virtual_area(addr, &alloc_sz, alloc_sz, 0, flags);
	if (new_addr != addr) {
		if (new_addr != NULL)
			munmap(new_addr, alloc_sz);
		/* we're leaving a hole in our virtual address space. if
		 * somebody else maps this hole now, we could accidentally
		 * override it in the future.
		 */
		RTE_LOG(CRIT, EAL, "Can't mmap holes in our virtual address space\n");
	}
resized:
	/* some codepaths will return negative fd, so exit early */
	if (fd < 0)
		return -1;

	if (internal_config.single_file_segments) {
		/* give this segment's part of the shared file back */
		resize_hugefile(fd, path, list_idx, seg_idx, map_offset,
				alloc_sz, false);
		/* ignore failure, can't make it any worse */
	} else {
		/* only remove file if we can take out a write lock */
		if (internal_config.hugepage_unlink == 0 &&
				internal_config.in_memory == 0 &&
				lock(fd, LOCK_EX) == 1)
			unlink(path);
		close(fd);
		/* forget the per-segment fd that was just closed */
		fd_list[list_idx].fds[seg_idx] = -1;
	}
	return -1;
}
/*
 * Memseg-list walk callback: try to allocate wa->n_segs contiguous segments
 * in 'msl' if its page size and socket match the request in 'arg'
 * (a struct alloc_walk_param).
 *
 * Returns:
 *   0  - this list does not match, or has no run of n_segs free slots
 *   1  - allocation was attempted here; wa->segs_allocated holds how many
 *        segments were actually allocated (may be fewer than requested
 *        when wa->exact is false)
 *  -1  - error: the hugepage directory could not be opened/locked, or an
 *        exact request could not be fully satisfied (everything allocated
 *        so far is rolled back and wa->ms is zeroed)
 */
static int
alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct alloc_walk_param *wa = arg;
	struct rte_memseg_list *cur_msl;
	size_t page_sz;
	int cur_idx, start_idx, j, dir_fd = -1;
	unsigned int msl_idx, need, i;

	if (msl->page_sz != wa->page_sz)
		return 0;
	if (msl->socket_id != wa->socket)
		return 0;

	page_sz = (size_t)msl->page_sz;

	/* msl is const; look the same list up again through mcfg to get a
	 * writable pointer
	 */
	msl_idx = msl - mcfg->memsegs;
	cur_msl = &mcfg->memsegs[msl_idx];

	need = wa->n_segs;

	/* try finding space in memseg list */
	cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0, need);
	if (cur_idx < 0)
		return 0;
	start_idx = cur_idx;

	/* do not allow any page allocations during the time we're allocating,
	 * because file creation and locking operations are not atomic,
	 * and we might be the first or the last ones to use a particular page,
	 * so we need to ensure atomicity of every operation.
	 *
	 * during init, we already hold a write lock, so don't try to take out
	 * another one.
	 */
	if (wa->hi->lock_descriptor == -1 && !internal_config.in_memory) {
		dir_fd = open(wa->hi->hugedir, O_RDONLY);
		if (dir_fd < 0) {
			RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
				__func__, wa->hi->hugedir, strerror(errno));
			return -1;
		}
		/* blocking writelock */
		if (flock(dir_fd, LOCK_EX)) {
			RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n",
				__func__, wa->hi->hugedir, strerror(errno));
			close(dir_fd);
			return -1;
		}
	}

	for (i = 0; i < need; i++, cur_idx++) {
		struct rte_memseg *cur;
		void *map_addr;

		cur = rte_fbarray_get(&cur_msl->memseg_arr, cur_idx);
		/* each slot maps at base_va + slot index * page size */
		map_addr = RTE_PTR_ADD(cur_msl->base_va,
				cur_idx * page_sz);

		if (alloc_seg(cur, map_addr, wa->socket, wa->hi,
				msl_idx, cur_idx)) {
			RTE_LOG(DEBUG, EAL, "attempted to allocate %i segments, but only %i were allocated\n",
				need, i);

			/* if exact number wasn't requested, stop */
			if (!wa->exact)
				goto out;

			/* clean up */
			for (j = start_idx; j < cur_idx; j++) {
				struct rte_memseg *tmp;
				struct rte_fbarray *arr =
						&cur_msl->memseg_arr;

				tmp = rte_fbarray_get(arr, j);
				rte_fbarray_set_free(arr, j);

				/* free_seg may attempt to create a file, which
				 * may fail.
				 */
				if (free_seg(tmp, wa->hi, msl_idx, j))
					RTE_LOG(DEBUG, EAL, "Cannot free page\n");
			}
			/* clear the list */
			if (wa->ms)
				memset(wa->ms, 0, sizeof(*wa->ms) * wa->n_segs);

			/* no explicit LOCK_UN is issued; the directory lock
			 * taken above is dropped by closing its descriptor
			 */
			if (dir_fd >= 0)
				close(dir_fd);
			return -1;
		}
		if (wa->ms)
			wa->ms[i] = cur;

		rte_fbarray_set_used(&cur_msl->memseg_arr, cur_idx);
	}
out:
	/* i is the number of segments successfully allocated so far */
	wa->segs_allocated = i;
	/* bump the list version only if its contents actually changed */
	if (i > 0)
		cur_msl->version++;
	if (dir_fd >= 0)
		close(dir_fd);
	return 1;
}
+ */ + if (wa->hi->lock_descriptor == -1 && !internal_config.in_memory) { + dir_fd = open(wa->hi->hugedir, O_RDONLY); + if (dir_fd < 0) { + RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", + __func__, wa->hi->hugedir, strerror(errno)); + return -1; + } + /* blocking writelock */ + if (flock(dir_fd, LOCK_EX)) { + RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", + __func__, wa->hi->hugedir, strerror(errno)); + close(dir_fd); + return -1; + } + } + + found_msl->version++; + + rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx); + + ret = free_seg(wa->ms, wa->hi, msl_idx, seg_idx); + + if (dir_fd >= 0) + close(dir_fd); + + if (ret < 0) + return -1; + + return 1; +} + +int +eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz, + int socket, bool exact) +{ + int i, ret = -1; +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + bool have_numa = false; + int oldpolicy; + struct bitmask *oldmask; +#endif + struct alloc_walk_param wa; + struct hugepage_info *hi = NULL; + + memset(&wa, 0, sizeof(wa)); + + /* dynamic allocation not supported in legacy mode */ + if (internal_config.legacy_mem) + return -1; + + for (i = 0; i < (int) RTE_DIM(internal_config.hugepage_info); i++) { + if (page_sz == + internal_config.hugepage_info[i].hugepage_sz) { + hi = &internal_config.hugepage_info[i]; + break; + } + } + if (!hi) { + RTE_LOG(ERR, EAL, "%s(): can't find relevant hugepage_info entry\n", + __func__); + return -1; + } + +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + if (check_numa()) { + oldmask = numa_allocate_nodemask(); + prepare_numa(&oldpolicy, oldmask, socket); + have_numa = true; + } +#endif + + wa.exact = exact; + wa.hi = hi; + wa.ms = ms; + wa.n_segs = n_segs; + wa.page_sz = page_sz; + wa.socket = socket; + wa.segs_allocated = 0; + + /* memalloc is locked, so it's safe to use thread-unsafe version */ + ret = rte_memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa); + if (ret == 0) { + RTE_LOG(ERR, EAL, "%s(): couldn't find suitable memseg_list\n", + __func__); + ret 
= -1; + } else if (ret > 0) { + ret = (int)wa.segs_allocated; + } + +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + if (have_numa) + restore_numa(&oldpolicy, oldmask); +#endif + return ret; +} + +struct rte_memseg * +eal_memalloc_alloc_seg(size_t page_sz, int socket) +{ + struct rte_memseg *ms; + if (eal_memalloc_alloc_seg_bulk(&ms, 1, page_sz, socket, true) < 0) + return NULL; + /* return pointer to newly allocated memseg */ + return ms; +} + +int +eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs) +{ + int seg, ret = 0; + + /* dynamic free not supported in legacy mode */ + if (internal_config.legacy_mem) + return -1; + + for (seg = 0; seg < n_segs; seg++) { + struct rte_memseg *cur = ms[seg]; + struct hugepage_info *hi = NULL; + struct free_walk_param wa; + int i, walk_res; + + /* if this page is marked as unfreeable, fail */ + if (cur->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) { + RTE_LOG(DEBUG, EAL, "Page is not allowed to be freed\n"); + ret = -1; + continue; + } + + memset(&wa, 0, sizeof(wa)); + + for (i = 0; i < (int)RTE_DIM(internal_config.hugepage_info); + i++) { + hi = &internal_config.hugepage_info[i]; + if (cur->hugepage_sz == hi->hugepage_sz) + break; + } + if (i == (int)RTE_DIM(internal_config.hugepage_info)) { + RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n"); + ret = -1; + continue; + } + + wa.ms = cur; + wa.hi = hi; + + /* memalloc is locked, so it's safe to use thread-unsafe version + */ + walk_res = rte_memseg_list_walk_thread_unsafe(free_seg_walk, + &wa); + if (walk_res == 1) + continue; + if (walk_res == 0) + RTE_LOG(ERR, EAL, "Couldn't find memseg list\n"); + ret = -1; + } + return ret; +} + +int +eal_memalloc_free_seg(struct rte_memseg *ms) +{ + /* dynamic free not supported in legacy mode */ + if (internal_config.legacy_mem) + return -1; + + return eal_memalloc_free_seg_bulk(&ms, 1); +} + +static int +sync_chunk(struct rte_memseg_list *primary_msl, + struct rte_memseg_list *local_msl, struct hugepage_info *hi, + unsigned int 
msl_idx, bool used, int start, int end) +{ + struct rte_fbarray *l_arr, *p_arr; + int i, ret, chunk_len, diff_len; + + l_arr = &local_msl->memseg_arr; + p_arr = &primary_msl->memseg_arr; + + /* we need to aggregate allocations/deallocations into bigger chunks, + * as we don't want to spam the user with per-page callbacks. + * + * to avoid any potential issues, we also want to trigger + * deallocation callbacks *before* we actually deallocate + * memory, so that the user application could wrap up its use + * before it goes away. + */ + + chunk_len = end - start; + + /* find how many contiguous pages we can map/unmap for this chunk */ + diff_len = used ? + rte_fbarray_find_contig_free(l_arr, start) : + rte_fbarray_find_contig_used(l_arr, start); + + /* has to be at least one page */ + if (diff_len < 1) + return -1; + + diff_len = RTE_MIN(chunk_len, diff_len); + + /* if we are freeing memory, notify the application */ + if (!used) { + struct rte_memseg *ms; + void *start_va; + size_t len, page_sz; + + ms = rte_fbarray_get(l_arr, start); + start_va = ms->addr; + page_sz = (size_t)primary_msl->page_sz; + len = page_sz * diff_len; + + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, + start_va, len); + } + + for (i = 0; i < diff_len; i++) { + struct rte_memseg *p_ms, *l_ms; + int seg_idx = start + i; + + l_ms = rte_fbarray_get(l_arr, seg_idx); + p_ms = rte_fbarray_get(p_arr, seg_idx); + + if (l_ms == NULL || p_ms == NULL) + return -1; + + if (used) { + ret = alloc_seg(l_ms, p_ms->addr, + p_ms->socket_id, hi, + msl_idx, seg_idx); + if (ret < 0) + return -1; + rte_fbarray_set_used(l_arr, seg_idx); + } else { + ret = free_seg(l_ms, hi, msl_idx, seg_idx); + rte_fbarray_set_free(l_arr, seg_idx); + if (ret < 0) + return -1; + } + } + + /* if we just allocated memory, notify the application */ + if (used) { + struct rte_memseg *ms; + void *start_va; + size_t len, page_sz; + + ms = rte_fbarray_get(l_arr, start); + start_va = ms->addr; + page_sz = (size_t)primary_msl->page_sz; + 
len = page_sz * diff_len; + + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC, + start_va, len); + } + + /* calculate how much we can advance until next chunk */ + diff_len = used ? + rte_fbarray_find_contig_used(l_arr, start) : + rte_fbarray_find_contig_free(l_arr, start); + ret = RTE_MIN(chunk_len, diff_len); + + return ret; +} + +static int +sync_status(struct rte_memseg_list *primary_msl, + struct rte_memseg_list *local_msl, struct hugepage_info *hi, + unsigned int msl_idx, bool used) +{ + struct rte_fbarray *l_arr, *p_arr; + int p_idx, l_chunk_len, p_chunk_len, ret; + int start, end; + + /* this is a little bit tricky, but the basic idea is - walk both lists + * and spot any places where there are discrepancies. walking both lists + * and noting discrepancies in a single go is a hard problem, so we do + * it in two passes - first we spot any places where allocated segments + * mismatch (i.e. ensure that everything that's allocated in the primary + * is also allocated in the secondary), and then we do it by looking at + * free segments instead. + * + * we also need to aggregate changes into chunks, as we have to call + * callbacks per allocation, not per page. 
+ */ + l_arr = &local_msl->memseg_arr; + p_arr = &primary_msl->memseg_arr; + + if (used) + p_idx = rte_fbarray_find_next_used(p_arr, 0); + else + p_idx = rte_fbarray_find_next_free(p_arr, 0); + + while (p_idx >= 0) { + int next_chunk_search_idx; + + if (used) { + p_chunk_len = rte_fbarray_find_contig_used(p_arr, + p_idx); + l_chunk_len = rte_fbarray_find_contig_used(l_arr, + p_idx); + } else { + p_chunk_len = rte_fbarray_find_contig_free(p_arr, + p_idx); + l_chunk_len = rte_fbarray_find_contig_free(l_arr, + p_idx); + } + /* best case scenario - no differences (or bigger, which will be + * fixed during next iteration), look for next chunk + */ + if (l_chunk_len >= p_chunk_len) { + next_chunk_search_idx = p_idx + p_chunk_len; + goto next_chunk; + } + + /* if both chunks start at the same point, skip parts we know + * are identical, and sync the rest. each call to sync_chunk + * will only sync contiguous segments, so we need to call this + * until we are sure there are no more differences in this + * chunk. 
+ */ + start = p_idx + l_chunk_len; + end = p_idx + p_chunk_len; + do { + ret = sync_chunk(primary_msl, local_msl, hi, msl_idx, + used, start, end); + start += ret; + } while (start < end && ret >= 0); + /* if ret is negative, something went wrong */ + if (ret < 0) + return -1; + + next_chunk_search_idx = p_idx + p_chunk_len; +next_chunk: + /* skip to end of this chunk */ + if (used) { + p_idx = rte_fbarray_find_next_used(p_arr, + next_chunk_search_idx); + } else { + p_idx = rte_fbarray_find_next_free(p_arr, + next_chunk_search_idx); + } + } + return 0; +} + +static int +sync_existing(struct rte_memseg_list *primary_msl, + struct rte_memseg_list *local_msl, struct hugepage_info *hi, + unsigned int msl_idx) +{ + int ret, dir_fd; + + /* do not allow any page allocations during the time we're allocating, + * because file creation and locking operations are not atomic, + * and we might be the first or the last ones to use a particular page, + * so we need to ensure atomicity of every operation. 
+ */ + dir_fd = open(hi->hugedir, O_RDONLY); + if (dir_fd < 0) { + RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", __func__, + hi->hugedir, strerror(errno)); + return -1; + } + /* blocking writelock */ + if (flock(dir_fd, LOCK_EX)) { + RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", __func__, + hi->hugedir, strerror(errno)); + close(dir_fd); + return -1; + } + + /* ensure all allocated space is the same in both lists */ + ret = sync_status(primary_msl, local_msl, hi, msl_idx, true); + if (ret < 0) + goto fail; + + /* ensure all unallocated space is the same in both lists */ + ret = sync_status(primary_msl, local_msl, hi, msl_idx, false); + if (ret < 0) + goto fail; + + /* update version number */ + local_msl->version = primary_msl->version; + + close(dir_fd); + + return 0; +fail: + close(dir_fd); + return -1; +} + +static int +sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *primary_msl, *local_msl; + struct hugepage_info *hi = NULL; + unsigned int i; + int msl_idx; + + if (msl->external) + return 0; + + msl_idx = msl - mcfg->memsegs; + primary_msl = &mcfg->memsegs[msl_idx]; + local_msl = &local_memsegs[msl_idx]; + + for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) { + uint64_t cur_sz = + internal_config.hugepage_info[i].hugepage_sz; + uint64_t msl_sz = primary_msl->page_sz; + if (msl_sz == cur_sz) { + hi = &internal_config.hugepage_info[i]; + break; + } + } + if (!hi) { + RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n"); + return -1; + } + + /* if versions don't match, synchronize everything */ + if (local_msl->version != primary_msl->version && + sync_existing(primary_msl, local_msl, hi, msl_idx)) + return -1; + return 0; +} + + +int +eal_memalloc_sync_with_primary(void) +{ + /* nothing to be done in primary */ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) + return 0; + + /* memalloc is locked, so it's safe to 
call thread-unsafe version */ + if (rte_memseg_list_walk_thread_unsafe(sync_walk, NULL)) + return -1; + return 0; +} + +static int +secondary_msl_create_walk(const struct rte_memseg_list *msl, + void *arg __rte_unused) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *primary_msl, *local_msl; + char name[PATH_MAX]; + int msl_idx, ret; + + if (msl->external) + return 0; + + msl_idx = msl - mcfg->memsegs; + primary_msl = &mcfg->memsegs[msl_idx]; + local_msl = &local_memsegs[msl_idx]; + + /* create distinct fbarrays for each secondary */ + snprintf(name, RTE_FBARRAY_NAME_LEN, "%s_%i", + primary_msl->memseg_arr.name, getpid()); + + ret = rte_fbarray_init(&local_msl->memseg_arr, name, + primary_msl->memseg_arr.len, + primary_msl->memseg_arr.elt_sz); + if (ret < 0) { + RTE_LOG(ERR, EAL, "Cannot initialize local memory map\n"); + return -1; + } + local_msl->base_va = primary_msl->base_va; + local_msl->len = primary_msl->len; + + return 0; +} + +static int +alloc_list(int list_idx, int len) +{ + int *data; + int i; + + /* ensure we have space to store fd per each possible segment */ + data = malloc(sizeof(int) * len); + if (data == NULL) { + RTE_LOG(ERR, EAL, "Unable to allocate space for file descriptors\n"); + return -1; + } + /* set all fd's as invalid */ + for (i = 0; i < len; i++) + data[i] = -1; + + fd_list[list_idx].fds = data; + fd_list[list_idx].len = len; + fd_list[list_idx].count = 0; + fd_list[list_idx].memseg_list_fd = -1; + + return 0; +} + +static int +fd_list_create_walk(const struct rte_memseg_list *msl, + void *arg __rte_unused) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int len; + int msl_idx; + + if (msl->external) + return 0; + + msl_idx = msl - mcfg->memsegs; + len = msl->memseg_arr.len; + + return alloc_list(msl_idx, len); +} + +int +eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd) +{ + struct rte_mem_config *mcfg = 
rte_eal_get_configuration()->mem_config; + + /* single file segments mode doesn't support individual segment fd's */ + if (internal_config.single_file_segments) + return -ENOTSUP; + + /* if list is not allocated, allocate it */ + if (fd_list[list_idx].len == 0) { + int len = mcfg->memsegs[list_idx].memseg_arr.len; + + if (alloc_list(list_idx, len) < 0) + return -ENOMEM; + } + fd_list[list_idx].fds[seg_idx] = fd; + + return 0; +} + +int +eal_memalloc_set_seg_list_fd(int list_idx, int fd) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + + /* non-single file segment mode doesn't support segment list fd's */ + if (!internal_config.single_file_segments) + return -ENOTSUP; + + /* if list is not allocated, allocate it */ + if (fd_list[list_idx].len == 0) { + int len = mcfg->memsegs[list_idx].memseg_arr.len; + + if (alloc_list(list_idx, len) < 0) + return -ENOMEM; + } + + fd_list[list_idx].memseg_list_fd = fd; + + return 0; +} + +int +eal_memalloc_get_seg_fd(int list_idx, int seg_idx) +{ + int fd; + + if (internal_config.in_memory || internal_config.no_hugetlbfs) { +#ifndef MEMFD_SUPPORTED + /* in in-memory or no-huge mode, we rely on memfd support */ + return -ENOTSUP; +#endif + /* memfd supported, but hugetlbfs memfd may not be */ + if (!internal_config.no_hugetlbfs && !memfd_create_supported) + return -ENOTSUP; + } + + if (internal_config.single_file_segments) { + fd = fd_list[list_idx].memseg_list_fd; + } else if (fd_list[list_idx].len == 0) { + /* list not initialized */ + fd = -1; + } else { + fd = fd_list[list_idx].fds[seg_idx]; + } + if (fd < 0) + return -ENODEV; + return fd; +} + +static int +test_memfd_create(void) +{ +#ifdef MEMFD_SUPPORTED + unsigned int i; + for (i = 0; i < internal_config.num_hugepage_sizes; i++) { + uint64_t pagesz = internal_config.hugepage_info[i].hugepage_sz; + int pagesz_flag = pagesz_flags(pagesz); + int flags; + + flags = pagesz_flag | RTE_MFD_HUGETLB; + int fd = memfd_create("test", flags); + if (fd < 0) { 
+ /* we failed - let memalloc know this isn't working */ + if (errno == EINVAL) { + memfd_create_supported = 0; + return 0; /* not supported */ + } + + /* we got other error - something's wrong */ + return -1; /* error */ + } + close(fd); + return 1; /* supported */ + } +#endif + return 0; /* not supported */ +} + +int +eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + + if (internal_config.in_memory || internal_config.no_hugetlbfs) { +#ifndef MEMFD_SUPPORTED + /* in in-memory or no-huge mode, we rely on memfd support */ + return -ENOTSUP; +#endif + /* memfd supported, but hugetlbfs memfd may not be */ + if (!internal_config.no_hugetlbfs && !memfd_create_supported) + return -ENOTSUP; + } + + /* fd_list not initialized? */ + if (fd_list[list_idx].len == 0) + return -ENODEV; + if (internal_config.single_file_segments) { + size_t pgsz = mcfg->memsegs[list_idx].page_sz; + + /* segment not active? */ + if (fd_list[list_idx].memseg_list_fd < 0) + return -ENOENT; + *offset = pgsz * seg_idx; + } else { + /* segment not active? */ + if (fd_list[list_idx].fds[seg_idx] < 0) + return -ENOENT; + *offset = 0; + } + return 0; +} + +int +eal_memalloc_init(void) +{ + if (rte_eal_process_type() == RTE_PROC_SECONDARY) + if (rte_memseg_list_walk(secondary_msl_create_walk, NULL) < 0) + return -1; + if (rte_eal_process_type() == RTE_PROC_PRIMARY && + internal_config.in_memory) { + int mfd_res = test_memfd_create(); + + if (mfd_res < 0) { + RTE_LOG(ERR, EAL, "Unable to check if memfd is supported\n"); + return -1; + } + if (mfd_res == 1) + RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n"); + else + RTE_LOG(INFO, EAL, "Using memfd is not supported, falling back to anonymous hugepages\n"); + + /* we only support single-file segments mode with in-memory mode + * if we support hugetlbfs with memfd_create. this code will + * test if we do. 
+ */ + if (internal_config.single_file_segments && + mfd_res != 1) { + RTE_LOG(ERR, EAL, "Single-file segments mode cannot be used without memfd support\n"); + return -1; + } + /* this cannot ever happen but better safe than sorry */ + if (!anonymous_hugepages_supported) { + RTE_LOG(ERR, EAL, "Using anonymous memory is not supported\n"); + return -1; + } + } + + /* initialize all of the fd lists */ + if (rte_memseg_list_walk(fd_list_create_walk, NULL)) + return -1; + return 0; +} diff --git a/lib/librte_eal/linux/eal/eal_memory.c b/lib/librte_eal/linux/eal/eal_memory.c new file mode 100644 index 0000000000..1b96b576e0 --- /dev/null +++ b/lib/librte_eal/linux/eal/eal_memory.c @@ -0,0 +1,2439 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright(c) 2013 6WIND S.A. + */ + +#define _FILE_OFFSET_BITS 64 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */ +#include +#define MEMFD_SUPPORTED +#endif +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES +#include +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_memalloc.h" +#include "eal_internal_cfg.h" +#include "eal_filesystem.h" +#include "eal_hugepages.h" + +#define PFN_MASK_SIZE 8 + +/** + * @file + * Huge page mapping under linux + * + * To reserve a big contiguous amount of memory, we use the hugepage + * feature of linux. For that, we need to have hugetlbfs mounted. This + * code will create many files in this directory (one per page) and + * map them in virtual memory. For each page, we will retrieve its + * physical address and remap it in order to have a virtual contiguous + * zone as well as a physical contiguous zone. 
+ */ + +static bool phys_addrs_available = true; + +#define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space" + +static void +test_phys_addrs_available(void) +{ + uint64_t tmp = 0; + phys_addr_t physaddr; + + if (!rte_eal_has_hugepages()) { + RTE_LOG(ERR, EAL, + "Started without hugepages support, physical addresses not available\n"); + phys_addrs_available = false; + return; + } + + physaddr = rte_mem_virt2phy(&tmp); + if (physaddr == RTE_BAD_PHYS_ADDR) { + if (rte_eal_iova_mode() == RTE_IOVA_PA) + RTE_LOG(ERR, EAL, + "Cannot obtain physical addresses: %s. " + "Only vfio will function.\n", + strerror(errno)); + phys_addrs_available = false; + } +} + +/* + * Get physical address of any mapped virtual address in the current process. + */ +phys_addr_t +rte_mem_virt2phy(const void *virtaddr) +{ + int fd, retval; + uint64_t page, physaddr; + unsigned long virt_pfn; + int page_size; + off_t offset; + + /* Cannot parse /proc/self/pagemap, no need to log errors everywhere */ + if (!phys_addrs_available) + return RTE_BAD_IOVA; + + /* standard page size */ + page_size = getpagesize(); + + fd = open("/proc/self/pagemap", O_RDONLY); + if (fd < 0) { + RTE_LOG(ERR, EAL, "%s(): cannot open /proc/self/pagemap: %s\n", + __func__, strerror(errno)); + return RTE_BAD_IOVA; + } + + virt_pfn = (unsigned long)virtaddr / page_size; + offset = sizeof(uint64_t) * virt_pfn; + if (lseek(fd, offset, SEEK_SET) == (off_t) -1) { + RTE_LOG(ERR, EAL, "%s(): seek error in /proc/self/pagemap: %s\n", + __func__, strerror(errno)); + close(fd); + return RTE_BAD_IOVA; + } + + retval = read(fd, &page, PFN_MASK_SIZE); + close(fd); + if (retval < 0) { + RTE_LOG(ERR, EAL, "%s(): cannot read /proc/self/pagemap: %s\n", + __func__, strerror(errno)); + return RTE_BAD_IOVA; + } else if (retval != PFN_MASK_SIZE) { + RTE_LOG(ERR, EAL, "%s(): read %d bytes from /proc/self/pagemap " + "but expected %d:\n", + __func__, retval, PFN_MASK_SIZE); + return RTE_BAD_IOVA; + } + + /* + * the pfn (page frame 
number) are bits 0-54 (see + * pagemap.txt in linux Documentation) + */ + if ((page & 0x7fffffffffffffULL) == 0) + return RTE_BAD_IOVA; + + physaddr = ((page & 0x7fffffffffffffULL) * page_size) + + ((unsigned long)virtaddr % page_size); + + return physaddr; +} + +rte_iova_t +rte_mem_virt2iova(const void *virtaddr) +{ + if (rte_eal_iova_mode() == RTE_IOVA_VA) + return (uintptr_t)virtaddr; + return rte_mem_virt2phy(virtaddr); +} + +/* + * For each hugepage in hugepg_tbl, fill the physaddr value. We find + * it by browsing the /proc/self/pagemap special file. + */ +static int +find_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) +{ + unsigned int i; + phys_addr_t addr; + + for (i = 0; i < hpi->num_pages[0]; i++) { + addr = rte_mem_virt2phy(hugepg_tbl[i].orig_va); + if (addr == RTE_BAD_PHYS_ADDR) + return -1; + hugepg_tbl[i].physaddr = addr; + } + return 0; +} + +/* + * For each hugepage in hugepg_tbl, fill the physaddr value sequentially. + */ +static int +set_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) +{ + unsigned int i; + static phys_addr_t addr; + + for (i = 0; i < hpi->num_pages[0]; i++) { + hugepg_tbl[i].physaddr = addr; + addr += hugepg_tbl[i].size; + } + return 0; +} + +/* + * Check whether address-space layout randomization is enabled in + * the kernel. 
This is important for multi-process as it can prevent + * two processes mapping data to the same virtual address + * Returns: + * 0 - address space randomization disabled + * 1/2 - address space randomization enabled + * negative error code on error + */ +static int +aslr_enabled(void) +{ + char c; + int retval, fd = open(RANDOMIZE_VA_SPACE_FILE, O_RDONLY); + if (fd < 0) + return -errno; + retval = read(fd, &c, 1); + close(fd); + if (retval < 0) + return -errno; + if (retval == 0) + return -EIO; + switch (c) { + case '0' : return 0; + case '1' : return 1; + case '2' : return 2; + default: return -EINVAL; + } +} + +static sigjmp_buf huge_jmpenv; + +static void huge_sigbus_handler(int signo __rte_unused) +{ + siglongjmp(huge_jmpenv, 1); +} + +/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile, + * non-static local variable in the stack frame calling sigsetjmp might be + * clobbered by a call to longjmp. + */ +static int huge_wrap_sigsetjmp(void) +{ + return sigsetjmp(huge_jmpenv, 1); +} + +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES +/* Callback for numa library. */ +void numa_error(char *where) +{ + RTE_LOG(ERR, EAL, "%s failed: %s\n", where, strerror(errno)); +} +#endif + +/* + * Mmap all hugepages of hugepage table: it first open a file in + * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the + * virtual address is stored in hugepg_tbl[i].orig_va, else it is stored + * in hugepg_tbl[i].final_va. The second mapping (when orig is 0) tries to + * map contiguous physical blocks in contiguous virtual blocks. + */ +static unsigned +map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi, + uint64_t *essential_memory __rte_unused) +{ + int fd; + unsigned i; + void *virtaddr; +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + int node_id = -1; + int essential_prev = 0; + int oldpolicy; + struct bitmask *oldmask = NULL; + bool have_numa = true; + unsigned long maxnode = 0; + + /* Check if kernel supports NUMA. 
*/ + if (numa_available() != 0) { + RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n"); + have_numa = false; + } + + if (have_numa) { + RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n"); + oldmask = numa_allocate_nodemask(); + if (get_mempolicy(&oldpolicy, oldmask->maskp, + oldmask->size + 1, 0, 0) < 0) { + RTE_LOG(ERR, EAL, + "Failed to get current mempolicy: %s. " + "Assuming MPOL_DEFAULT.\n", strerror(errno)); + oldpolicy = MPOL_DEFAULT; + } + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) + if (internal_config.socket_mem[i]) + maxnode = i + 1; + } +#endif + + for (i = 0; i < hpi->num_pages[0]; i++) { + struct hugepage_file *hf = &hugepg_tbl[i]; + uint64_t hugepage_sz = hpi->hugepage_sz; + +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + if (maxnode) { + unsigned int j; + + for (j = 0; j < maxnode; j++) + if (essential_memory[j]) + break; + + if (j == maxnode) { + node_id = (node_id + 1) % maxnode; + while (!internal_config.socket_mem[node_id]) { + node_id++; + node_id %= maxnode; + } + essential_prev = 0; + } else { + node_id = j; + essential_prev = essential_memory[j]; + + if (essential_memory[j] < hugepage_sz) + essential_memory[j] = 0; + else + essential_memory[j] -= hugepage_sz; + } + + RTE_LOG(DEBUG, EAL, + "Setting policy MPOL_PREFERRED for socket %d\n", + node_id); + numa_set_preferred(node_id); + } +#endif + + hf->file_id = i; + hf->size = hugepage_sz; + eal_get_hugefile_path(hf->filepath, sizeof(hf->filepath), + hpi->hugedir, hf->file_id); + hf->filepath[sizeof(hf->filepath) - 1] = '\0'; + + /* try to create hugepage file */ + fd = open(hf->filepath, O_CREAT | O_RDWR, 0600); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__, + strerror(errno)); + goto out; + } + + /* map the segment, and populate page tables, + * the kernel fills this segment with zeros. we don't care where + * this gets mapped - we already have contiguous memory areas + * ready for us to map into. 
+ */ + virtaddr = mmap(NULL, hugepage_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, 0); + if (virtaddr == MAP_FAILED) { + RTE_LOG(DEBUG, EAL, "%s(): mmap failed: %s\n", __func__, + strerror(errno)); + close(fd); + goto out; + } + + hf->orig_va = virtaddr; + + /* In linux, hugetlb limitations, like cgroup, are + * enforced at fault time instead of mmap(), even + * with the option of MAP_POPULATE. Kernel will send + * a SIGBUS signal. To avoid to be killed, save stack + * environment here, if SIGBUS happens, we can jump + * back here. + */ + if (huge_wrap_sigsetjmp()) { + RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more " + "hugepages of size %u MB\n", + (unsigned int)(hugepage_sz / 0x100000)); + munmap(virtaddr, hugepage_sz); + close(fd); + unlink(hugepg_tbl[i].filepath); +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + if (maxnode) + essential_memory[node_id] = + essential_prev; +#endif + goto out; + } + *(int *)virtaddr = 0; + + /* set shared lock on the file. */ + if (flock(fd, LOCK_SH) < 0) { + RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n", + __func__, strerror(errno)); + close(fd); + goto out; + } + + close(fd); + } + +out: +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + if (maxnode) { + RTE_LOG(DEBUG, EAL, + "Restoring previous memory policy: %d\n", oldpolicy); + if (oldpolicy == MPOL_DEFAULT) { + numa_set_localalloc(); + } else if (set_mempolicy(oldpolicy, oldmask->maskp, + oldmask->size + 1) < 0) { + RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n", + strerror(errno)); + numa_set_localalloc(); + } + } + if (oldmask != NULL) + numa_free_cpumask(oldmask); +#endif + return i; +} + +/* + * Parse /proc/self/numa_maps to get the NUMA socket ID for each huge + * page. 
+ */ +static int +find_numasocket(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) +{ + int socket_id; + char *end, *nodestr; + unsigned i, hp_count = 0; + uint64_t virt_addr; + char buf[BUFSIZ]; + char hugedir_str[PATH_MAX]; + FILE *f; + + f = fopen("/proc/self/numa_maps", "r"); + if (f == NULL) { + RTE_LOG(NOTICE, EAL, "NUMA support not available" + " consider that all memory is in socket_id 0\n"); + return 0; + } + + snprintf(hugedir_str, sizeof(hugedir_str), + "%s/%s", hpi->hugedir, eal_get_hugefile_prefix()); + + /* parse numa map */ + while (fgets(buf, sizeof(buf), f) != NULL) { + + /* ignore non huge page */ + if (strstr(buf, " huge ") == NULL && + strstr(buf, hugedir_str) == NULL) + continue; + + /* get zone addr */ + virt_addr = strtoull(buf, &end, 16); + if (virt_addr == 0 || end == buf) { + RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__); + goto error; + } + + /* get node id (socket id) */ + nodestr = strstr(buf, " N"); + if (nodestr == NULL) { + RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__); + goto error; + } + nodestr += 2; + end = strstr(nodestr, "="); + if (end == NULL) { + RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__); + goto error; + } + end[0] = '\0'; + end = NULL; + + socket_id = strtoul(nodestr, &end, 0); + if ((nodestr[0] == '\0') || (end == NULL) || (*end != '\0')) { + RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__); + goto error; + } + + /* if we find this page in our mappings, set socket_id */ + for (i = 0; i < hpi->num_pages[0]; i++) { + void *va = (void *)(unsigned long)virt_addr; + if (hugepg_tbl[i].orig_va == va) { + hugepg_tbl[i].socket_id = socket_id; + hp_count++; +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + RTE_LOG(DEBUG, EAL, + "Hugepage %s is on socket %d\n", + hugepg_tbl[i].filepath, socket_id); +#endif + } + } + } + + if (hp_count < hpi->num_pages[0]) + goto error; + + fclose(f); + return 0; + +error: + fclose(f); + return -1; +} + +static int 
+cmp_physaddr(const void *a, const void *b) +{ +#ifndef RTE_ARCH_PPC_64 + const struct hugepage_file *p1 = a; + const struct hugepage_file *p2 = b; +#else + /* PowerPC needs memory sorted in reverse order from x86 */ + const struct hugepage_file *p1 = b; + const struct hugepage_file *p2 = a; +#endif + if (p1->physaddr < p2->physaddr) + return -1; + else if (p1->physaddr > p2->physaddr) + return 1; + else + return 0; +} + +/* + * Uses mmap to create a shared memory area for storage of data + * Used in this file to store the hugepage file map on disk + */ +static void * +create_shared_memory(const char *filename, const size_t mem_size) +{ + void *retval; + int fd; + + /* if no shared files mode is used, create anonymous memory instead */ + if (internal_config.no_shconf) { + retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (retval == MAP_FAILED) + return NULL; + return retval; + } + + fd = open(filename, O_CREAT | O_RDWR, 0666); + if (fd < 0) + return NULL; + if (ftruncate(fd, mem_size) < 0) { + close(fd); + return NULL; + } + retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + close(fd); + if (retval == MAP_FAILED) + return NULL; + return retval; +} + +/* + * this copies *active* hugepages from one hugepage table to another. + * destination is typically the shared memory. 
+ */ +static int +copy_hugepages_to_shared_mem(struct hugepage_file * dst, int dest_size, + const struct hugepage_file * src, int src_size) +{ + int src_pos, dst_pos = 0; + + for (src_pos = 0; src_pos < src_size; src_pos++) { + if (src[src_pos].orig_va != NULL) { + /* error on overflow attempt */ + if (dst_pos == dest_size) + return -1; + memcpy(&dst[dst_pos], &src[src_pos], sizeof(struct hugepage_file)); + dst_pos++; + } + } + return 0; +} + +static int +unlink_hugepage_files(struct hugepage_file *hugepg_tbl, + unsigned num_hp_info) +{ + unsigned socket, size; + int page, nrpages = 0; + + /* get total number of hugepages */ + for (size = 0; size < num_hp_info; size++) + for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) + nrpages += + internal_config.hugepage_info[size].num_pages[socket]; + + for (page = 0; page < nrpages; page++) { + struct hugepage_file *hp = &hugepg_tbl[page]; + + if (hp->orig_va != NULL && unlink(hp->filepath)) { + RTE_LOG(WARNING, EAL, "%s(): Removing %s failed: %s\n", + __func__, hp->filepath, strerror(errno)); + } + } + return 0; +} + +/* + * unmaps hugepages that are not going to be used. since we originally allocate + * ALL hugepages (not just those we need), additional unmapping needs to be done. 
+ */ +static int +unmap_unneeded_hugepages(struct hugepage_file *hugepg_tbl, + struct hugepage_info *hpi, + unsigned num_hp_info) +{ + unsigned socket, size; + int page, nrpages = 0; + + /* get total number of hugepages */ + for (size = 0; size < num_hp_info; size++) + for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) + nrpages += internal_config.hugepage_info[size].num_pages[socket]; + + for (size = 0; size < num_hp_info; size++) { + for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) { + unsigned pages_found = 0; + + /* traverse until we have unmapped all the unused pages */ + for (page = 0; page < nrpages; page++) { + struct hugepage_file *hp = &hugepg_tbl[page]; + + /* find a page that matches the criteria */ + if ((hp->size == hpi[size].hugepage_sz) && + (hp->socket_id == (int) socket)) { + + /* if we skipped enough pages, unmap the rest */ + if (pages_found == hpi[size].num_pages[socket]) { + uint64_t unmap_len; + + unmap_len = hp->size; + + /* get start addr and len of the remaining segment */ + munmap(hp->orig_va, + (size_t)unmap_len); + + hp->orig_va = NULL; + if (unlink(hp->filepath) == -1) { + RTE_LOG(ERR, EAL, "%s(): Removing %s failed: %s\n", + __func__, hp->filepath, strerror(errno)); + return -1; + } + } else { + /* lock the page and skip */ + pages_found++; + } + + } /* match page */ + } /* foreach page */ + } /* foreach socket */ + } /* foreach pagesize */ + + return 0; +} + +static int +remap_segment(struct hugepage_file *hugepages, int seg_start, int seg_end) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *msl; + struct rte_fbarray *arr; + int cur_page, seg_len; + unsigned int msl_idx; + int ms_idx; + uint64_t page_sz; + size_t memseg_len; + int socket_id; + + page_sz = hugepages[seg_start].size; + socket_id = hugepages[seg_start].socket_id; + seg_len = seg_end - seg_start; + + RTE_LOG(DEBUG, EAL, "Attempting to map %" PRIu64 "M on socket %i\n", + (seg_len * page_sz) >> 20ULL, 
socket_id); + + /* find free space in memseg lists */ + for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) { + bool empty; + msl = &mcfg->memsegs[msl_idx]; + arr = &msl->memseg_arr; + + if (msl->page_sz != page_sz) + continue; + if (msl->socket_id != socket_id) + continue; + + /* leave space for a hole if array is not empty */ + empty = arr->count == 0; + ms_idx = rte_fbarray_find_next_n_free(arr, 0, + seg_len + (empty ? 0 : 1)); + + /* memseg list is full? */ + if (ms_idx < 0) + continue; + + /* leave some space between memsegs, they are not IOVA + * contiguous, so they shouldn't be VA contiguous either. + */ + if (!empty) + ms_idx++; + break; + } + if (msl_idx == RTE_MAX_MEMSEG_LISTS) { + RTE_LOG(ERR, EAL, "Could not find space for memseg. Please increase %s and/or %s in configuration.\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_PER_TYPE), + RTE_STR(CONFIG_RTE_MAX_MEM_PER_TYPE)); + return -1; + } + +#ifdef RTE_ARCH_PPC64 + /* for PPC64 we go through the list backwards */ + for (cur_page = seg_end - 1; cur_page >= seg_start; + cur_page--, ms_idx++) { +#else + for (cur_page = seg_start; cur_page < seg_end; cur_page++, ms_idx++) { +#endif + struct hugepage_file *hfile = &hugepages[cur_page]; + struct rte_memseg *ms = rte_fbarray_get(arr, ms_idx); + void *addr; + int fd; + + fd = open(hfile->filepath, O_RDWR); + if (fd < 0) { + RTE_LOG(ERR, EAL, "Could not open '%s': %s\n", + hfile->filepath, strerror(errno)); + return -1; + } + /* set shared lock on the file. 
*/ + if (flock(fd, LOCK_SH) < 0) { + RTE_LOG(DEBUG, EAL, "Could not lock '%s': %s\n", + hfile->filepath, strerror(errno)); + close(fd); + return -1; + } + memseg_len = (size_t)page_sz; + addr = RTE_PTR_ADD(msl->base_va, ms_idx * memseg_len); + + /* we know this address is already mmapped by memseg list, so + * using MAP_FIXED here is safe + */ + addr = mmap(addr, page_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, 0); + if (addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "Couldn't remap '%s': %s\n", + hfile->filepath, strerror(errno)); + close(fd); + return -1; + } + + /* we have a new address, so unmap previous one */ +#ifndef RTE_ARCH_64 + /* in 32-bit legacy mode, we have already unmapped the page */ + if (!internal_config.legacy_mem) + munmap(hfile->orig_va, page_sz); +#else + munmap(hfile->orig_va, page_sz); +#endif + + hfile->orig_va = NULL; + hfile->final_va = addr; + + /* rewrite physical addresses in IOVA as VA mode */ + if (rte_eal_iova_mode() == RTE_IOVA_VA) + hfile->physaddr = (uintptr_t)addr; + + /* set up memseg data */ + ms->addr = addr; + ms->hugepage_sz = page_sz; + ms->len = memseg_len; + ms->iova = hfile->physaddr; + ms->socket_id = hfile->socket_id; + ms->nchannel = rte_memory_get_nchannel(); + ms->nrank = rte_memory_get_nrank(); + + rte_fbarray_set_used(arr, ms_idx); + + /* store segment fd internally */ + if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0) + RTE_LOG(ERR, EAL, "Could not store segment fd: %s\n", + rte_strerror(rte_errno)); + } + RTE_LOG(DEBUG, EAL, "Allocated %" PRIu64 "M on socket %i\n", + (seg_len * page_sz) >> 20, socket_id); + return 0; +} + +static uint64_t +get_mem_amount(uint64_t page_sz, uint64_t max_mem) +{ + uint64_t area_sz, max_pages; + + /* limit to RTE_MAX_MEMSEG_PER_LIST pages or RTE_MAX_MEM_MB_PER_LIST */ + max_pages = RTE_MAX_MEMSEG_PER_LIST; + max_mem = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20, max_mem); + + area_sz = RTE_MIN(page_sz * max_pages, max_mem); + + /* make sure the list 
isn't smaller than the page size */ + area_sz = RTE_MAX(area_sz, page_sz); + + return RTE_ALIGN(area_sz, page_sz); +} + +static int +free_memseg_list(struct rte_memseg_list *msl) +{ + if (rte_fbarray_destroy(&msl->memseg_arr)) { + RTE_LOG(ERR, EAL, "Cannot destroy memseg list\n"); + return -1; + } + memset(msl, 0, sizeof(*msl)); + return 0; +} + +#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i" +static int +alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz, + int n_segs, int socket_id, int type_msl_idx) +{ + char name[RTE_FBARRAY_NAME_LEN]; + + snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id, + type_msl_idx); + if (rte_fbarray_init(&msl->memseg_arr, name, n_segs, + sizeof(struct rte_memseg))) { + RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n", + rte_strerror(rte_errno)); + return -1; + } + + msl->page_sz = page_sz; + msl->socket_id = socket_id; + msl->base_va = NULL; + + RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n", + (size_t)page_sz >> 10, socket_id); + + return 0; +} + +static int +alloc_va_space(struct rte_memseg_list *msl) +{ + uint64_t page_sz; + size_t mem_sz; + void *addr; + int flags = 0; + + page_sz = msl->page_sz; + mem_sz = page_sz * msl->memseg_arr.len; + + addr = eal_get_virtual_area(msl->base_va, &mem_sz, page_sz, 0, flags); + if (addr == NULL) { + if (rte_errno == EADDRNOTAVAIL) + RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - please use '--base-virtaddr' option\n", + (unsigned long long)mem_sz, msl->base_va); + else + RTE_LOG(ERR, EAL, "Cannot reserve memory\n"); + return -1; + } + msl->base_va = addr; + msl->len = mem_sz; + + return 0; +} + +/* + * Our VA space is not preallocated yet, so preallocate it here. We need to know + * how many segments there are in order to map all pages into one address space, + * and leave appropriate holes between segments so that rte_malloc does not + * concatenate them into one big segment. 
+ * + * we also need to unmap original pages to free up address space. + */ +static int __rte_unused +prealloc_segments(struct hugepage_file *hugepages, int n_pages) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int cur_page, seg_start_page, end_seg, new_memseg; + unsigned int hpi_idx, socket, i; + int n_contig_segs, n_segs; + int msl_idx; + + /* before we preallocate segments, we need to free up our VA space. + * we're not removing files, and we already have information about + * PA-contiguousness, so it is safe to unmap everything. + */ + for (cur_page = 0; cur_page < n_pages; cur_page++) { + struct hugepage_file *hpi = &hugepages[cur_page]; + munmap(hpi->orig_va, hpi->size); + hpi->orig_va = NULL; + } + + /* we cannot know how many page sizes and sockets we have discovered, so + * loop over all of them + */ + for (hpi_idx = 0; hpi_idx < internal_config.num_hugepage_sizes; + hpi_idx++) { + uint64_t page_sz = + internal_config.hugepage_info[hpi_idx].hugepage_sz; + + for (i = 0; i < rte_socket_count(); i++) { + struct rte_memseg_list *msl; + + socket = rte_socket_id_by_idx(i); + n_contig_segs = 0; + n_segs = 0; + seg_start_page = -1; + + for (cur_page = 0; cur_page < n_pages; cur_page++) { + struct hugepage_file *prev, *cur; + int prev_seg_start_page = -1; + + cur = &hugepages[cur_page]; + prev = cur_page == 0 ? NULL : + &hugepages[cur_page - 1]; + + new_memseg = 0; + end_seg = 0; + + if (cur->size == 0) + end_seg = 1; + else if (cur->socket_id != (int) socket) + end_seg = 1; + else if (cur->size != page_sz) + end_seg = 1; + else if (cur_page == 0) + new_memseg = 1; +#ifdef RTE_ARCH_PPC_64 + /* On PPC64 architecture, the mmap always start + * from higher address to lower address. Here, + * physical addresses are in descending order. 
+ */ + else if ((prev->physaddr - cur->physaddr) != + cur->size) + new_memseg = 1; +#else + else if ((cur->physaddr - prev->physaddr) != + cur->size) + new_memseg = 1; +#endif + if (new_memseg) { + /* if we're already inside a segment, + * new segment means end of current one + */ + if (seg_start_page != -1) { + end_seg = 1; + prev_seg_start_page = + seg_start_page; + } + seg_start_page = cur_page; + } + + if (end_seg) { + if (prev_seg_start_page != -1) { + /* we've found a new segment */ + n_contig_segs++; + n_segs += cur_page - + prev_seg_start_page; + } else if (seg_start_page != -1) { + /* we didn't find new segment, + * but did end current one + */ + n_contig_segs++; + n_segs += cur_page - + seg_start_page; + seg_start_page = -1; + continue; + } else { + /* we're skipping this page */ + continue; + } + } + /* segment continues */ + } + /* check if we missed last segment */ + if (seg_start_page != -1) { + n_contig_segs++; + n_segs += cur_page - seg_start_page; + } + + /* if no segments were found, do not preallocate */ + if (n_segs == 0) + continue; + + /* we now have total number of pages that we will + * allocate for this segment list. add separator pages + * to the total count, and preallocate VA space. 
+ */ + n_segs += n_contig_segs - 1; + + /* now, preallocate VA space for these segments */ + + /* first, find suitable memseg list for this */ + for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; + msl_idx++) { + msl = &mcfg->memsegs[msl_idx]; + + if (msl->base_va != NULL) + continue; + break; + } + if (msl_idx == RTE_MAX_MEMSEG_LISTS) { + RTE_LOG(ERR, EAL, "Not enough space in memseg lists, please increase %s\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); + return -1; + } + + /* now, allocate fbarray itself */ + if (alloc_memseg_list(msl, page_sz, n_segs, socket, + msl_idx) < 0) + return -1; + + /* finally, allocate VA space */ + if (alloc_va_space(msl) < 0) + return -1; + } + } + return 0; +} + +/* + * We cannot reallocate memseg lists on the fly because PPC64 stores pages + * backwards, therefore we have to process the entire memseg first before + * remapping it into memseg list VA space. + */ +static int +remap_needed_hugepages(struct hugepage_file *hugepages, int n_pages) +{ + int cur_page, seg_start_page, new_memseg, ret; + + seg_start_page = 0; + for (cur_page = 0; cur_page < n_pages; cur_page++) { + struct hugepage_file *prev, *cur; + + new_memseg = 0; + + cur = &hugepages[cur_page]; + prev = cur_page == 0 ? NULL : &hugepages[cur_page - 1]; + + /* if size is zero, no more pages left */ + if (cur->size == 0) + break; + + if (cur_page == 0) + new_memseg = 1; + else if (cur->socket_id != prev->socket_id) + new_memseg = 1; + else if (cur->size != prev->size) + new_memseg = 1; +#ifdef RTE_ARCH_PPC_64 + /* On PPC64 architecture, the mmap always start from higher + * address to lower address. Here, physical addresses are in + * descending order. 
+ */ + else if ((prev->physaddr - cur->physaddr) != cur->size) + new_memseg = 1; +#else + else if ((cur->physaddr - prev->physaddr) != cur->size) + new_memseg = 1; +#endif + + if (new_memseg) { + /* if this isn't the first time, remap segment */ + if (cur_page != 0) { + ret = remap_segment(hugepages, seg_start_page, + cur_page); + if (ret != 0) + return -1; + } + /* remember where we started */ + seg_start_page = cur_page; + } + /* continuation of previous memseg */ + } + /* we were stopped, but we didn't remap the last segment, do it now */ + if (cur_page != 0) { + ret = remap_segment(hugepages, seg_start_page, + cur_page); + if (ret != 0) + return -1; + } + return 0; +} + +static inline uint64_t +get_socket_mem_size(int socket) +{ + uint64_t size = 0; + unsigned i; + + for (i = 0; i < internal_config.num_hugepage_sizes; i++){ + struct hugepage_info *hpi = &internal_config.hugepage_info[i]; + size += hpi->hugepage_sz * hpi->num_pages[socket]; + } + + return size; +} + +/* + * This function is a NUMA-aware equivalent of calc_num_pages. 
+ * It takes in the list of hugepage sizes and the + * number of pages thereof, and calculates the best number of + * pages of each size to fulfill the request for ram + */ +static int +calc_num_pages_per_socket(uint64_t * memory, + struct hugepage_info *hp_info, + struct hugepage_info *hp_used, + unsigned num_hp_info) +{ + unsigned socket, j, i = 0; + unsigned requested, available; + int total_num_pages = 0; + uint64_t remaining_mem, cur_mem; + uint64_t total_mem = internal_config.memory; + + if (num_hp_info == 0) + return -1; + + /* if specific memory amounts per socket weren't requested */ + if (internal_config.force_sockets == 0) { + size_t total_size; +#ifdef RTE_ARCH_64 + int cpu_per_socket[RTE_MAX_NUMA_NODES]; + size_t default_size; + unsigned lcore_id; + + /* Compute number of cores per socket */ + memset(cpu_per_socket, 0, sizeof(cpu_per_socket)); + RTE_LCORE_FOREACH(lcore_id) { + cpu_per_socket[rte_lcore_to_socket_id(lcore_id)]++; + } + + /* + * Automatically spread requested memory amongst detected sockets according + * to number of cores from cpu mask present on each socket + */ + total_size = internal_config.memory; + for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) { + + /* Set memory amount per socket */ + default_size = (internal_config.memory * cpu_per_socket[socket]) + / rte_lcore_count(); + + /* Limit to maximum available memory on socket */ + default_size = RTE_MIN(default_size, get_socket_mem_size(socket)); + + /* Update sizes */ + memory[socket] = default_size; + total_size -= default_size; + } + + /* + * If some memory is remaining, try to allocate it by getting all + * available memory from sockets, one after the other + */ + for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) { + /* take whatever is available */ + default_size = RTE_MIN(get_socket_mem_size(socket) - memory[socket], + total_size); + + /* Update sizes */ + memory[socket] += default_size; + total_size -= default_size; + } +#else 
+ /* in 32-bit mode, allocate all of the memory only on master + * lcore socket + */ + total_size = internal_config.memory; + for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; + socket++) { + struct rte_config *cfg = rte_eal_get_configuration(); + unsigned int master_lcore_socket; + + master_lcore_socket = + rte_lcore_to_socket_id(cfg->master_lcore); + + if (master_lcore_socket != socket) + continue; + + /* Update sizes */ + memory[socket] = total_size; + break; + } +#endif + } + + for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) { + /* skips if the memory on specific socket wasn't requested */ + for (i = 0; i < num_hp_info && memory[socket] != 0; i++){ + strlcpy(hp_used[i].hugedir, hp_info[i].hugedir, + sizeof(hp_used[i].hugedir)); + hp_used[i].num_pages[socket] = RTE_MIN( + memory[socket] / hp_info[i].hugepage_sz, + hp_info[i].num_pages[socket]); + + cur_mem = hp_used[i].num_pages[socket] * + hp_used[i].hugepage_sz; + + memory[socket] -= cur_mem; + total_mem -= cur_mem; + + total_num_pages += hp_used[i].num_pages[socket]; + + /* check if we have met all memory requests */ + if (memory[socket] == 0) + break; + + /* check if we have any more pages left at this size, if so + * move on to next size */ + if (hp_used[i].num_pages[socket] == hp_info[i].num_pages[socket]) + continue; + /* At this point we know that there are more pages available that are + * bigger than the memory we want, so lets see if we can get enough + * from other page sizes. 
+ */ + remaining_mem = 0; + for (j = i+1; j < num_hp_info; j++) + remaining_mem += hp_info[j].hugepage_sz * + hp_info[j].num_pages[socket]; + + /* is there enough other memory, if not allocate another page and quit */ + if (remaining_mem < memory[socket]){ + cur_mem = RTE_MIN(memory[socket], + hp_info[i].hugepage_sz); + memory[socket] -= cur_mem; + total_mem -= cur_mem; + hp_used[i].num_pages[socket]++; + total_num_pages++; + break; /* we are done with this socket*/ + } + } + /* if we didn't satisfy all memory requirements per socket */ + if (memory[socket] > 0 && + internal_config.socket_mem[socket] != 0) { + /* to prevent icc errors */ + requested = (unsigned) (internal_config.socket_mem[socket] / + 0x100000); + available = requested - + ((unsigned) (memory[socket] / 0x100000)); + RTE_LOG(ERR, EAL, "Not enough memory available on socket %u! " + "Requested: %uMB, available: %uMB\n", socket, + requested, available); + return -1; + } + } + + /* if we didn't satisfy total memory requirements */ + if (total_mem > 0) { + requested = (unsigned) (internal_config.memory / 0x100000); + available = requested - (unsigned) (total_mem / 0x100000); + RTE_LOG(ERR, EAL, "Not enough memory available! Requested: %uMB," + " available: %uMB\n", requested, available); + return -1; + } + return total_num_pages; +} + +static inline size_t +eal_get_hugepage_mem_size(void) +{ + uint64_t size = 0; + unsigned i, j; + + for (i = 0; i < internal_config.num_hugepage_sizes; i++) { + struct hugepage_info *hpi = &internal_config.hugepage_info[i]; + if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0) { + for (j = 0; j < RTE_MAX_NUMA_NODES; j++) { + size += hpi->hugepage_sz * hpi->num_pages[j]; + } + } + } + + return (size < SIZE_MAX) ? 
(size_t)(size) : SIZE_MAX; +} + +static struct sigaction huge_action_old; +static int huge_need_recover; + +static void +huge_register_sigbus(void) +{ + sigset_t mask; + struct sigaction action; + + sigemptyset(&mask); + sigaddset(&mask, SIGBUS); + action.sa_flags = 0; + action.sa_mask = mask; + action.sa_handler = huge_sigbus_handler; + + huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old); +} + +static void +huge_recover_sigbus(void) +{ + if (huge_need_recover) { + sigaction(SIGBUS, &huge_action_old, NULL); + huge_need_recover = 0; + } +} + +/* + * Prepare physical memory mapping: fill configuration structure with + * these infos, return 0 on success. + * 1. map N huge pages in separate files in hugetlbfs + * 2. find associated physical addr + * 3. find associated NUMA socket ID + * 4. sort all huge pages by physical address + * 5. remap these N huge pages in the correct order + * 6. unmap the first mapping + * 7. fill memsegs in configuration with contiguous zones + */ +static int +eal_legacy_hugepage_init(void) +{ + struct rte_mem_config *mcfg; + struct hugepage_file *hugepage = NULL, *tmp_hp = NULL; + struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES]; + struct rte_fbarray *arr; + struct rte_memseg *ms; + + uint64_t memory[RTE_MAX_NUMA_NODES]; + + unsigned hp_offset; + int i, j; + int nr_hugefiles, nr_hugepages = 0; + void *addr; + + test_phys_addrs_available(); + + memset(used_hp, 0, sizeof(used_hp)); + + /* get pointer to global configuration */ + mcfg = rte_eal_get_configuration()->mem_config; + + /* hugetlbfs can be disabled */ + if (internal_config.no_hugetlbfs) { + struct rte_memseg_list *msl; + int n_segs, cur_seg, fd, flags; +#ifdef MEMFD_SUPPORTED + int memfd; +#endif + uint64_t page_sz; + + /* nohuge mode is legacy mode */ + internal_config.legacy_mem = 1; + + /* nohuge mode is single-file segments mode */ + internal_config.single_file_segments = 1; + + /* create a memseg list */ + msl = &mcfg->memsegs[0]; + + page_sz = RTE_PGSIZE_4K; + 
n_segs = internal_config.memory / page_sz; + + if (rte_fbarray_init(&msl->memseg_arr, "nohugemem", n_segs, + sizeof(struct rte_memseg))) { + RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n"); + return -1; + } + + /* set up parameters for anonymous mmap */ + fd = -1; + flags = MAP_PRIVATE | MAP_ANONYMOUS; + +#ifdef MEMFD_SUPPORTED + /* create a memfd and store it in the segment fd table */ + memfd = memfd_create("nohuge", 0); + if (memfd < 0) { + RTE_LOG(DEBUG, EAL, "Cannot create memfd: %s\n", + strerror(errno)); + RTE_LOG(DEBUG, EAL, "Falling back to anonymous map\n"); + } else { + /* we got an fd - now resize it */ + if (ftruncate(memfd, internal_config.memory) < 0) { + RTE_LOG(ERR, EAL, "Cannot resize memfd: %s\n", + strerror(errno)); + RTE_LOG(ERR, EAL, "Falling back to anonymous map\n"); + close(memfd); + } else { + /* creating memfd-backed file was successful. + * we want changes to memfd to be visible to + * other processes (such as vhost backend), so + * map it as shared memory. + */ + RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n"); + fd = memfd; + flags = MAP_SHARED; + } + } +#endif + addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE, + flags, fd, 0); + if (addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__, + strerror(errno)); + return -1; + } + msl->base_va = addr; + msl->page_sz = page_sz; + msl->socket_id = 0; + msl->len = internal_config.memory; + + /* we're in single-file segments mode, so only the segment list + * fd needs to be set up. + */ + if (fd != -1) { + if (eal_memalloc_set_seg_list_fd(0, fd) < 0) { + RTE_LOG(ERR, EAL, "Cannot set up segment list fd\n"); + /* not a serious error, proceed */ + } + } + + /* populate memsegs. 
each memseg is one page long */ + for (cur_seg = 0; cur_seg < n_segs; cur_seg++) { + arr = &msl->memseg_arr; + + ms = rte_fbarray_get(arr, cur_seg); + if (rte_eal_iova_mode() == RTE_IOVA_VA) + ms->iova = (uintptr_t)addr; + else + ms->iova = RTE_BAD_IOVA; + ms->addr = addr; + ms->hugepage_sz = page_sz; + ms->socket_id = 0; + ms->len = page_sz; + + rte_fbarray_set_used(arr, cur_seg); + + addr = RTE_PTR_ADD(addr, (size_t)page_sz); + } + if (mcfg->dma_maskbits && + rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) { + RTE_LOG(ERR, EAL, + "%s(): couldnt allocate memory due to IOVA exceeding limits of current DMA mask.\n", + __func__); + if (rte_eal_iova_mode() == RTE_IOVA_VA && + rte_eal_using_phys_addrs()) + RTE_LOG(ERR, EAL, + "%s(): Please try initializing EAL with --iova-mode=pa parameter.\n", + __func__); + goto fail; + } + return 0; + } + + /* calculate total number of hugepages available. at this point we haven't + * yet started sorting them so they all are on socket 0 */ + for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) { + /* meanwhile, also initialize used_hp hugepage sizes in used_hp */ + used_hp[i].hugepage_sz = internal_config.hugepage_info[i].hugepage_sz; + + nr_hugepages += internal_config.hugepage_info[i].num_pages[0]; + } + + /* + * allocate a memory area for hugepage table. + * this isn't shared memory yet. due to the fact that we need some + * processing done on these pages, shared memory will be created + * at a later stage. + */ + tmp_hp = malloc(nr_hugepages * sizeof(struct hugepage_file)); + if (tmp_hp == NULL) + goto fail; + + memset(tmp_hp, 0, nr_hugepages * sizeof(struct hugepage_file)); + + hp_offset = 0; /* where we start the current page size entries */ + + huge_register_sigbus(); + + /* make a copy of socket_mem, needed for balanced allocation. 
*/ + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) + memory[i] = internal_config.socket_mem[i]; + + /* map all hugepages and sort them */ + for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){ + unsigned pages_old, pages_new; + struct hugepage_info *hpi; + + /* + * we don't yet mark hugepages as used at this stage, so + * we just map all hugepages available to the system + * all hugepages are still located on socket 0 + */ + hpi = &internal_config.hugepage_info[i]; + + if (hpi->num_pages[0] == 0) + continue; + + /* map all hugepages available */ + pages_old = hpi->num_pages[0]; + pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, memory); + if (pages_new < pages_old) { + RTE_LOG(DEBUG, EAL, + "%d not %d hugepages of size %u MB allocated\n", + pages_new, pages_old, + (unsigned)(hpi->hugepage_sz / 0x100000)); + + int pages = pages_old - pages_new; + + nr_hugepages -= pages; + hpi->num_pages[0] = pages_new; + if (pages_new == 0) + continue; + } + + if (phys_addrs_available && + rte_eal_iova_mode() != RTE_IOVA_VA) { + /* find physical addresses for each hugepage */ + if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0) { + RTE_LOG(DEBUG, EAL, "Failed to find phys addr " + "for %u MB pages\n", + (unsigned int)(hpi->hugepage_sz / 0x100000)); + goto fail; + } + } else { + /* set physical addresses for each hugepage */ + if (set_physaddrs(&tmp_hp[hp_offset], hpi) < 0) { + RTE_LOG(DEBUG, EAL, "Failed to set phys addr " + "for %u MB pages\n", + (unsigned int)(hpi->hugepage_sz / 0x100000)); + goto fail; + } + } + + if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0){ + RTE_LOG(DEBUG, EAL, "Failed to find NUMA socket for %u MB pages\n", + (unsigned)(hpi->hugepage_sz / 0x100000)); + goto fail; + } + + qsort(&tmp_hp[hp_offset], hpi->num_pages[0], + sizeof(struct hugepage_file), cmp_physaddr); + + /* we have processed a num of hugepages of this size, so inc offset */ + hp_offset += hpi->num_pages[0]; + } + + huge_recover_sigbus(); + + if (internal_config.memory == 0 && 
internal_config.force_sockets == 0) + internal_config.memory = eal_get_hugepage_mem_size(); + + nr_hugefiles = nr_hugepages; + + + /* clean out the numbers of pages */ + for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) + for (j = 0; j < RTE_MAX_NUMA_NODES; j++) + internal_config.hugepage_info[i].num_pages[j] = 0; + + /* get hugepages for each socket */ + for (i = 0; i < nr_hugefiles; i++) { + int socket = tmp_hp[i].socket_id; + + /* find a hugepage info with right size and increment num_pages */ + const int nb_hpsizes = RTE_MIN(MAX_HUGEPAGE_SIZES, + (int)internal_config.num_hugepage_sizes); + for (j = 0; j < nb_hpsizes; j++) { + if (tmp_hp[i].size == + internal_config.hugepage_info[j].hugepage_sz) { + internal_config.hugepage_info[j].num_pages[socket]++; + } + } + } + + /* make a copy of socket_mem, needed for number of pages calculation */ + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) + memory[i] = internal_config.socket_mem[i]; + + /* calculate final number of pages */ + nr_hugepages = calc_num_pages_per_socket(memory, + internal_config.hugepage_info, used_hp, + internal_config.num_hugepage_sizes); + + /* error if not enough memory available */ + if (nr_hugepages < 0) + goto fail; + + /* reporting in! */ + for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) { + for (j = 0; j < RTE_MAX_NUMA_NODES; j++) { + if (used_hp[i].num_pages[j] > 0) { + RTE_LOG(DEBUG, EAL, + "Requesting %u pages of size %uMB" + " from socket %i\n", + used_hp[i].num_pages[j], + (unsigned) + (used_hp[i].hugepage_sz / 0x100000), + j); + } + } + } + + /* create shared memory */ + hugepage = create_shared_memory(eal_hugepage_data_path(), + nr_hugefiles * sizeof(struct hugepage_file)); + + if (hugepage == NULL) { + RTE_LOG(ERR, EAL, "Failed to create shared memory!\n"); + goto fail; + } + memset(hugepage, 0, nr_hugefiles * sizeof(struct hugepage_file)); + + /* + * unmap pages that we won't need (looks at used_hp). + * also, sets final_va to NULL on pages that were unmapped. 
+ */ + if (unmap_unneeded_hugepages(tmp_hp, used_hp, + internal_config.num_hugepage_sizes) < 0) { + RTE_LOG(ERR, EAL, "Unmapping and locking hugepages failed!\n"); + goto fail; + } + + /* + * copy stuff from malloc'd hugepage* to the actual shared memory. + * this procedure only copies those hugepages that have orig_va + * not NULL. has overflow protection. + */ + if (copy_hugepages_to_shared_mem(hugepage, nr_hugefiles, + tmp_hp, nr_hugefiles) < 0) { + RTE_LOG(ERR, EAL, "Copying tables to shared memory failed!\n"); + goto fail; + } + +#ifndef RTE_ARCH_64 + /* for legacy 32-bit mode, we did not preallocate VA space, so do it */ + if (internal_config.legacy_mem && + prealloc_segments(hugepage, nr_hugefiles)) { + RTE_LOG(ERR, EAL, "Could not preallocate VA space for hugepages\n"); + goto fail; + } +#endif + + /* remap all pages we do need into memseg list VA space, so that those + * pages become first-class citizens in DPDK memory subsystem + */ + if (remap_needed_hugepages(hugepage, nr_hugefiles)) { + RTE_LOG(ERR, EAL, "Couldn't remap hugepage files into memseg lists\n"); + goto fail; + } + + /* free the hugepage backing files */ + if (internal_config.hugepage_unlink && + unlink_hugepage_files(tmp_hp, internal_config.num_hugepage_sizes) < 0) { + RTE_LOG(ERR, EAL, "Unlinking hugepage files failed!\n"); + goto fail; + } + + /* free the temporary hugepage table */ + free(tmp_hp); + tmp_hp = NULL; + + munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file)); + hugepage = NULL; + + /* we're not going to allocate more pages, so release VA space for + * unused memseg lists + */ + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { + struct rte_memseg_list *msl = &mcfg->memsegs[i]; + size_t mem_sz; + + /* skip inactive lists */ + if (msl->base_va == NULL) + continue; + /* skip lists where there is at least one page allocated */ + if (msl->memseg_arr.count > 0) + continue; + /* this is an unused list, deallocate it */ + mem_sz = msl->len; + munmap(msl->base_va, mem_sz); + 
msl->base_va = NULL; + + /* destroy backing fbarray */ + rte_fbarray_destroy(&msl->memseg_arr); + } + + if (mcfg->dma_maskbits && + rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) { + RTE_LOG(ERR, EAL, + "%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask.\n", + __func__); + goto fail; + } + + return 0; + +fail: + huge_recover_sigbus(); + free(tmp_hp); + if (hugepage != NULL) + munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file)); + + return -1; +} + +static int __rte_unused +hugepage_count_walk(const struct rte_memseg_list *msl, void *arg) +{ + struct hugepage_info *hpi = arg; + + if (msl->page_sz != hpi->hugepage_sz) + return 0; + + hpi->num_pages[msl->socket_id] += msl->memseg_arr.len; + return 0; +} + +static int +limits_callback(int socket_id, size_t cur_limit, size_t new_len) +{ + RTE_SET_USED(socket_id); + RTE_SET_USED(cur_limit); + RTE_SET_USED(new_len); + return -1; +} + +static int +eal_hugepage_init(void) +{ + struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES]; + uint64_t memory[RTE_MAX_NUMA_NODES]; + int hp_sz_idx, socket_id; + + test_phys_addrs_available(); + + memset(used_hp, 0, sizeof(used_hp)); + + for (hp_sz_idx = 0; + hp_sz_idx < (int) internal_config.num_hugepage_sizes; + hp_sz_idx++) { +#ifndef RTE_ARCH_64 + struct hugepage_info dummy; + unsigned int i; +#endif + /* also initialize used_hp hugepage sizes in used_hp */ + struct hugepage_info *hpi; + hpi = &internal_config.hugepage_info[hp_sz_idx]; + used_hp[hp_sz_idx].hugepage_sz = hpi->hugepage_sz; + +#ifndef RTE_ARCH_64 + /* for 32-bit, limit number of pages on socket to whatever we've + * preallocated, as we cannot allocate more. 
+ */ + memset(&dummy, 0, sizeof(dummy)); + dummy.hugepage_sz = hpi->hugepage_sz; + if (rte_memseg_list_walk(hugepage_count_walk, &dummy) < 0) + return -1; + + for (i = 0; i < RTE_DIM(dummy.num_pages); i++) { + hpi->num_pages[i] = RTE_MIN(hpi->num_pages[i], + dummy.num_pages[i]); + } +#endif + } + + /* make a copy of socket_mem, needed for balanced allocation. */ + for (hp_sz_idx = 0; hp_sz_idx < RTE_MAX_NUMA_NODES; hp_sz_idx++) + memory[hp_sz_idx] = internal_config.socket_mem[hp_sz_idx]; + + /* calculate final number of pages */ + if (calc_num_pages_per_socket(memory, + internal_config.hugepage_info, used_hp, + internal_config.num_hugepage_sizes) < 0) + return -1; + + for (hp_sz_idx = 0; + hp_sz_idx < (int)internal_config.num_hugepage_sizes; + hp_sz_idx++) { + for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES; + socket_id++) { + struct rte_memseg **pages; + struct hugepage_info *hpi = &used_hp[hp_sz_idx]; + unsigned int num_pages = hpi->num_pages[socket_id]; + int num_pages_alloc, i; + + if (num_pages == 0) + continue; + + pages = malloc(sizeof(*pages) * num_pages); + + RTE_LOG(DEBUG, EAL, "Allocating %u pages of size %" PRIu64 "M on socket %i\n", + num_pages, hpi->hugepage_sz >> 20, socket_id); + + num_pages_alloc = eal_memalloc_alloc_seg_bulk(pages, + num_pages, hpi->hugepage_sz, + socket_id, true); + if (num_pages_alloc < 0) { + free(pages); + return -1; + } + + /* mark preallocated pages as unfreeable */ + for (i = 0; i < num_pages_alloc; i++) { + struct rte_memseg *ms = pages[i]; + ms->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE; + } + free(pages); + } + } + /* if socket limits were specified, set them */ + if (internal_config.force_socket_limits) { + unsigned int i; + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) { + uint64_t limit = internal_config.socket_limit[i]; + if (limit == 0) + continue; + if (rte_mem_alloc_validator_register("socket-limit", + limits_callback, i, limit)) + RTE_LOG(ERR, EAL, "Failed to register socket limits validator callback\n"); + } + } + 
return 0; +} + +/* + * uses fstat to report the size of a file on disk + */ +static off_t +getFileSize(int fd) +{ + struct stat st; + if (fstat(fd, &st) < 0) + return 0; + return st.st_size; +} + +/* + * This creates the memory mappings in the secondary process to match that of + * the server process. It goes through each memory segment in the DPDK runtime + * configuration and finds the hugepages which form that segment, mapping them + * in order to form a contiguous block in the virtual memory space + */ +static int +eal_legacy_hugepage_attach(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct hugepage_file *hp = NULL; + unsigned int num_hp = 0; + unsigned int i = 0; + unsigned int cur_seg; + off_t size = 0; + int fd, fd_hugepage = -1; + + if (aslr_enabled() > 0) { + RTE_LOG(WARNING, EAL, "WARNING: Address Space Layout Randomization " + "(ASLR) is enabled in the kernel.\n"); + RTE_LOG(WARNING, EAL, " This may cause issues with mapping memory " + "into secondary processes\n"); + } + + test_phys_addrs_available(); + + fd_hugepage = open(eal_hugepage_data_path(), O_RDONLY); + if (fd_hugepage < 0) { + RTE_LOG(ERR, EAL, "Could not open %s\n", + eal_hugepage_data_path()); + goto error; + } + + size = getFileSize(fd_hugepage); + hp = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd_hugepage, 0); + if (hp == MAP_FAILED) { + RTE_LOG(ERR, EAL, "Could not mmap %s\n", + eal_hugepage_data_path()); + goto error; + } + + num_hp = size / sizeof(struct hugepage_file); + RTE_LOG(DEBUG, EAL, "Analysing %u files\n", num_hp); + + /* map all segments into memory to make sure we get the addrs. the + * segments themselves are already in memseg list (which is shared and + * has its VA space already preallocated), so we just need to map + * everything into correct addresses. 
+ */ + for (i = 0; i < num_hp; i++) { + struct hugepage_file *hf = &hp[i]; + size_t map_sz = hf->size; + void *map_addr = hf->final_va; + int msl_idx, ms_idx; + struct rte_memseg_list *msl; + struct rte_memseg *ms; + + /* if size is zero, no more pages left */ + if (map_sz == 0) + break; + + fd = open(hf->filepath, O_RDWR); + if (fd < 0) { + RTE_LOG(ERR, EAL, "Could not open %s: %s\n", + hf->filepath, strerror(errno)); + goto error; + } + + map_addr = mmap(map_addr, map_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED, fd, 0); + if (map_addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "Could not map %s: %s\n", + hf->filepath, strerror(errno)); + goto fd_error; + } + + /* set shared lock on the file. */ + if (flock(fd, LOCK_SH) < 0) { + RTE_LOG(DEBUG, EAL, "%s(): Locking file failed: %s\n", + __func__, strerror(errno)); + goto fd_error; + } + + /* find segment data */ + msl = rte_mem_virt2memseg_list(map_addr); + if (msl == NULL) { + RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg list\n", + __func__); + goto fd_error; + } + ms = rte_mem_virt2memseg(map_addr, msl); + if (ms == NULL) { + RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg\n", + __func__); + goto fd_error; + } + + msl_idx = msl - mcfg->memsegs; + ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); + if (ms_idx < 0) { + RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg idx\n", + __func__); + goto fd_error; + } + + /* store segment fd internally */ + if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0) + RTE_LOG(ERR, EAL, "Could not store segment fd: %s\n", + rte_strerror(rte_errno)); + } + /* unmap the hugepage config file, since we are done using it */ + munmap(hp, size); + close(fd_hugepage); + return 0; + +fd_error: + close(fd); +error: + /* map all segments into memory to make sure we get the addrs */ + cur_seg = 0; + for (cur_seg = 0; cur_seg < i; cur_seg++) { + struct hugepage_file *hf = &hp[i]; + size_t map_sz = hf->size; + void *map_addr = hf->final_va; + + munmap(map_addr, map_sz); + } + if (hp != NULL && 
hp != MAP_FAILED) + munmap(hp, size); + if (fd_hugepage >= 0) + close(fd_hugepage); + return -1; +} + +static int +eal_hugepage_attach(void) +{ + if (eal_memalloc_sync_with_primary()) { + RTE_LOG(ERR, EAL, "Could not map memory from primary process\n"); + if (aslr_enabled() > 0) + RTE_LOG(ERR, EAL, "It is recommended to disable ASLR in the kernel and retry running both primary and secondary processes\n"); + return -1; + } + return 0; +} + +int +rte_eal_hugepage_init(void) +{ + return internal_config.legacy_mem ? + eal_legacy_hugepage_init() : + eal_hugepage_init(); +} + +int +rte_eal_hugepage_attach(void) +{ + return internal_config.legacy_mem ? + eal_legacy_hugepage_attach() : + eal_hugepage_attach(); +} + +int +rte_eal_using_phys_addrs(void) +{ + return phys_addrs_available; +} + +static int __rte_unused +memseg_primary_init_32(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int active_sockets, hpi_idx, msl_idx = 0; + unsigned int socket_id, i; + struct rte_memseg_list *msl; + uint64_t extra_mem_per_socket, total_extra_mem, total_requested_mem; + uint64_t max_mem; + + /* no-huge does not need this at all */ + if (internal_config.no_hugetlbfs) + return 0; + + /* this is a giant hack, but desperate times call for desperate + * measures. in legacy 32-bit mode, we cannot preallocate VA space, + * because having upwards of 2 gigabytes of VA space already mapped will + * interfere with our ability to map and sort hugepages. + * + * therefore, in legacy 32-bit mode, we will be initializing memseg + * lists much later - in eal_memory.c, right after we unmap all the + * unneeded pages. this will not affect secondary processes, as those + * should be able to mmap the space without (too many) problems. + */ + if (internal_config.legacy_mem) + return 0; + + /* 32-bit mode is a very special case. we cannot know in advance where + * the user will want to allocate their memory, so we have to do some + * heuristics. 
+ */ + active_sockets = 0; + total_requested_mem = 0; + if (internal_config.force_sockets) + for (i = 0; i < rte_socket_count(); i++) { + uint64_t mem; + + socket_id = rte_socket_id_by_idx(i); + mem = internal_config.socket_mem[socket_id]; + + if (mem == 0) + continue; + + active_sockets++; + total_requested_mem += mem; + } + else + total_requested_mem = internal_config.memory; + + max_mem = (uint64_t)RTE_MAX_MEM_MB << 20; + if (total_requested_mem > max_mem) { + RTE_LOG(ERR, EAL, "Invalid parameters: 32-bit process can at most use %uM of memory\n", + (unsigned int)(max_mem >> 20)); + return -1; + } + total_extra_mem = max_mem - total_requested_mem; + extra_mem_per_socket = active_sockets == 0 ? total_extra_mem : + total_extra_mem / active_sockets; + + /* the allocation logic is a little bit convoluted, but here's how it + * works, in a nutshell: + * - if user hasn't specified on which sockets to allocate memory via + * --socket-mem, we allocate all of our memory on master core socket. + * - if user has specified sockets to allocate memory on, there may be + * some "unused" memory left (e.g. if user has specified --socket-mem + * such that not all memory adds up to 2 gigabytes), so add it to all + * sockets that are in use equally. + * + * page sizes are sorted by size in descending order, so we can safely + * assume that we dispense with bigger page sizes first. 
+ */ + + /* create memseg lists */ + for (i = 0; i < rte_socket_count(); i++) { + int hp_sizes = (int) internal_config.num_hugepage_sizes; + uint64_t max_socket_mem, cur_socket_mem; + unsigned int master_lcore_socket; + struct rte_config *cfg = rte_eal_get_configuration(); + bool skip; + + socket_id = rte_socket_id_by_idx(i); + +#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES + if (socket_id > 0) + break; +#endif + + /* if we didn't specifically request memory on this socket */ + skip = active_sockets != 0 && + internal_config.socket_mem[socket_id] == 0; + /* ...or if we didn't specifically request memory on *any* + * socket, and this is not master lcore + */ + master_lcore_socket = rte_lcore_to_socket_id(cfg->master_lcore); + skip |= active_sockets == 0 && socket_id != master_lcore_socket; + + if (skip) { + RTE_LOG(DEBUG, EAL, "Will not preallocate memory on socket %u\n", + socket_id); + continue; + } + + /* max amount of memory on this socket */ + max_socket_mem = (active_sockets != 0 ? + internal_config.socket_mem[socket_id] : + internal_config.memory) + + extra_mem_per_socket; + cur_socket_mem = 0; + + for (hpi_idx = 0; hpi_idx < hp_sizes; hpi_idx++) { + uint64_t max_pagesz_mem, cur_pagesz_mem = 0; + uint64_t hugepage_sz; + struct hugepage_info *hpi; + int type_msl_idx, max_segs, total_segs = 0; + + hpi = &internal_config.hugepage_info[hpi_idx]; + hugepage_sz = hpi->hugepage_sz; + + /* check if pages are actually available */ + if (hpi->num_pages[socket_id] == 0) + continue; + + max_segs = RTE_MAX_MEMSEG_PER_TYPE; + max_pagesz_mem = max_socket_mem - cur_socket_mem; + + /* make it multiple of page size */ + max_pagesz_mem = RTE_ALIGN_FLOOR(max_pagesz_mem, + hugepage_sz); + + RTE_LOG(DEBUG, EAL, "Attempting to preallocate " + "%" PRIu64 "M on socket %i\n", + max_pagesz_mem >> 20, socket_id); + + type_msl_idx = 0; + while (cur_pagesz_mem < max_pagesz_mem && + total_segs < max_segs) { + uint64_t cur_mem; + unsigned int n_segs; + + if (msl_idx >= RTE_MAX_MEMSEG_LISTS) { + 
RTE_LOG(ERR, EAL, + "No more space in memseg lists, please increase %s\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); + return -1; + } + + msl = &mcfg->memsegs[msl_idx]; + + cur_mem = get_mem_amount(hugepage_sz, + max_pagesz_mem); + n_segs = cur_mem / hugepage_sz; + + if (alloc_memseg_list(msl, hugepage_sz, n_segs, + socket_id, type_msl_idx)) { + /* failing to allocate a memseg list is + * a serious error. + */ + RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n"); + return -1; + } + + if (alloc_va_space(msl)) { + /* if we couldn't allocate VA space, we + * can try with smaller page sizes. + */ + RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list, retrying with different page size\n"); + /* deallocate memseg list */ + if (free_memseg_list(msl)) + return -1; + break; + } + + total_segs += msl->memseg_arr.len; + cur_pagesz_mem = total_segs * hugepage_sz; + type_msl_idx++; + msl_idx++; + } + cur_socket_mem += cur_pagesz_mem; + } + if (cur_socket_mem == 0) { + RTE_LOG(ERR, EAL, "Cannot allocate VA space on socket %u\n", + socket_id); + return -1; + } + } + + return 0; +} + +static int __rte_unused +memseg_primary_init(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct memtype { + uint64_t page_sz; + int socket_id; + } *memtypes = NULL; + int i, hpi_idx, msl_idx, ret = -1; /* fail unless told to succeed */ + struct rte_memseg_list *msl; + uint64_t max_mem, max_mem_per_type; + unsigned int max_seglists_per_type; + unsigned int n_memtypes, cur_type; + + /* no-huge does not need this at all */ + if (internal_config.no_hugetlbfs) + return 0; + + /* + * figuring out amount of memory we're going to have is a long and very + * involved process. the basic element we're operating with is a memory + * type, defined as a combination of NUMA node ID and page size (so that + * e.g. 2 sockets with 2 page sizes yield 4 memory types in total). 
+ * + * deciding amount of memory going towards each memory type is a + * balancing act between maximum segments per type, maximum memory per + * type, and number of detected NUMA nodes. the goal is to make sure + * each memory type gets at least one memseg list. + * + * the total amount of memory is limited by RTE_MAX_MEM_MB value. + * + * the total amount of memory per type is limited by either + * RTE_MAX_MEM_MB_PER_TYPE, or by RTE_MAX_MEM_MB divided by the number + * of detected NUMA nodes. additionally, maximum number of segments per + * type is also limited by RTE_MAX_MEMSEG_PER_TYPE. this is because for + * smaller page sizes, it can take hundreds of thousands of segments to + * reach the above specified per-type memory limits. + * + * additionally, each type may have multiple memseg lists associated + * with it, each limited by either RTE_MAX_MEM_MB_PER_LIST for bigger + * page sizes, or RTE_MAX_MEMSEG_PER_LIST segments for smaller ones. + * + * the number of memseg lists per type is decided based on the above + * limits, and also taking number of detected NUMA nodes, to make sure + * that we don't run out of memseg lists before we populate all NUMA + * nodes with memory. + * + * we do this in three stages. first, we collect the number of types. + * then, we figure out memory constraints and populate the list of + * would-be memseg lists. then, we go ahead and allocate the memseg + * lists. 
+ */ + + /* create space for mem types */ + n_memtypes = internal_config.num_hugepage_sizes * rte_socket_count(); + memtypes = calloc(n_memtypes, sizeof(*memtypes)); + if (memtypes == NULL) { + RTE_LOG(ERR, EAL, "Cannot allocate space for memory types\n"); + return -1; + } + + /* populate mem types */ + cur_type = 0; + for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes; + hpi_idx++) { + struct hugepage_info *hpi; + uint64_t hugepage_sz; + + hpi = &internal_config.hugepage_info[hpi_idx]; + hugepage_sz = hpi->hugepage_sz; + + for (i = 0; i < (int) rte_socket_count(); i++, cur_type++) { + int socket_id = rte_socket_id_by_idx(i); + +#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES + if (socket_id > 0) + break; +#endif + memtypes[cur_type].page_sz = hugepage_sz; + memtypes[cur_type].socket_id = socket_id; + + RTE_LOG(DEBUG, EAL, "Detected memory type: " + "socket_id:%u hugepage_sz:%" PRIu64 "\n", + socket_id, hugepage_sz); + } + } + /* number of memtypes could have been lower due to no NUMA support */ + n_memtypes = cur_type; + + /* set up limits for types */ + max_mem = (uint64_t)RTE_MAX_MEM_MB << 20; + max_mem_per_type = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20, + max_mem / n_memtypes); + /* + * limit maximum number of segment lists per type to ensure there's + * space for memseg lists for all NUMA nodes with all page sizes + */ + max_seglists_per_type = RTE_MAX_MEMSEG_LISTS / n_memtypes; + + if (max_seglists_per_type == 0) { + RTE_LOG(ERR, EAL, "Cannot accommodate all memory types, please increase %s\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); + goto out; + } + + /* go through all mem types and create segment lists */ + msl_idx = 0; + for (cur_type = 0; cur_type < n_memtypes; cur_type++) { + unsigned int cur_seglist, n_seglists, n_segs; + unsigned int max_segs_per_type, max_segs_per_list; + struct memtype *type = &memtypes[cur_type]; + uint64_t max_mem_per_list, pagesz; + int socket_id; + + pagesz = type->page_sz; + socket_id = type->socket_id; + + /* + 
* we need to create segment lists for this type. we must take + * into account the following things: + * + * 1. total amount of memory we can use for this memory type + * 2. total amount of memory per memseg list allowed + * 3. number of segments needed to fit the amount of memory + * 4. number of segments allowed per type + * 5. number of segments allowed per memseg list + * 6. number of memseg lists we are allowed to take up + */ + + /* calculate how much segments we will need in total */ + max_segs_per_type = max_mem_per_type / pagesz; + /* limit number of segments to maximum allowed per type */ + max_segs_per_type = RTE_MIN(max_segs_per_type, + (unsigned int)RTE_MAX_MEMSEG_PER_TYPE); + /* limit number of segments to maximum allowed per list */ + max_segs_per_list = RTE_MIN(max_segs_per_type, + (unsigned int)RTE_MAX_MEMSEG_PER_LIST); + + /* calculate how much memory we can have per segment list */ + max_mem_per_list = RTE_MIN(max_segs_per_list * pagesz, + (uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20); + + /* calculate how many segments each segment list will have */ + n_segs = RTE_MIN(max_segs_per_list, max_mem_per_list / pagesz); + + /* calculate how many segment lists we can have */ + n_seglists = RTE_MIN(max_segs_per_type / n_segs, + max_mem_per_type / max_mem_per_list); + + /* limit number of segment lists according to our maximum */ + n_seglists = RTE_MIN(n_seglists, max_seglists_per_type); + + RTE_LOG(DEBUG, EAL, "Creating %i segment lists: " + "n_segs:%i socket_id:%i hugepage_sz:%" PRIu64 "\n", + n_seglists, n_segs, socket_id, pagesz); + + /* create all segment lists */ + for (cur_seglist = 0; cur_seglist < n_seglists; cur_seglist++) { + if (msl_idx >= RTE_MAX_MEMSEG_LISTS) { + RTE_LOG(ERR, EAL, + "No more space in memseg lists, please increase %s\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); + goto out; + } + msl = &mcfg->memsegs[msl_idx++]; + + if (alloc_memseg_list(msl, pagesz, n_segs, + socket_id, cur_seglist)) + goto out; + + if (alloc_va_space(msl)) { + 
RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n"); + goto out; + } + } + } + /* we're successful */ + ret = 0; +out: + free(memtypes); + return ret; +} + +static int +memseg_secondary_init(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int msl_idx = 0; + struct rte_memseg_list *msl; + + for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) { + + msl = &mcfg->memsegs[msl_idx]; + + /* skip empty memseg lists */ + if (msl->memseg_arr.len == 0) + continue; + + if (rte_fbarray_attach(&msl->memseg_arr)) { + RTE_LOG(ERR, EAL, "Cannot attach to primary process memseg lists\n"); + return -1; + } + + /* preallocate VA space */ + if (alloc_va_space(msl)) { + RTE_LOG(ERR, EAL, "Cannot preallocate VA space for hugepage memory\n"); + return -1; + } + } + + return 0; +} + +int +rte_eal_memseg_init(void) +{ + /* increase rlimit to maximum */ + struct rlimit lim; + + if (getrlimit(RLIMIT_NOFILE, &lim) == 0) { + /* set limit to maximum */ + lim.rlim_cur = lim.rlim_max; + + if (setrlimit(RLIMIT_NOFILE, &lim) < 0) { + RTE_LOG(DEBUG, EAL, "Setting maximum number of open files failed: %s\n", + strerror(errno)); + } else { + RTE_LOG(DEBUG, EAL, "Setting maximum number of open files to %" + PRIu64 "\n", + (uint64_t)lim.rlim_cur); + } + } else { + RTE_LOG(ERR, EAL, "Cannot get current resource limits\n"); + } + + return rte_eal_process_type() == RTE_PROC_PRIMARY ? 
+#ifndef RTE_ARCH_64 + memseg_primary_init_32() : +#else + memseg_primary_init() : +#endif + memseg_secondary_init(); +} diff --git a/lib/librte_eal/linux/eal/eal_thread.c b/lib/librte_eal/linux/eal/eal_thread.c new file mode 100644 index 0000000000..379773b683 --- /dev/null +++ b/lib/librte_eal/linux/eal/eal_thread.c @@ -0,0 +1,188 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_thread.h" + +RTE_DEFINE_PER_LCORE(unsigned, _lcore_id) = LCORE_ID_ANY; +RTE_DEFINE_PER_LCORE(unsigned, _socket_id) = (unsigned)SOCKET_ID_ANY; +RTE_DEFINE_PER_LCORE(rte_cpuset_t, _cpuset); + +/* + * Send a message to a slave lcore identified by slave_id to call a + * function f with argument arg. Once the execution is done, the + * remote lcore switch in FINISHED state. 
+ */ +int +rte_eal_remote_launch(int (*f)(void *), void *arg, unsigned slave_id) +{ + int n; + char c = 0; + int m2s = lcore_config[slave_id].pipe_master2slave[1]; + int s2m = lcore_config[slave_id].pipe_slave2master[0]; + + if (lcore_config[slave_id].state != WAIT) + return -EBUSY; + + lcore_config[slave_id].f = f; + lcore_config[slave_id].arg = arg; + + /* send message */ + n = 0; + while (n == 0 || (n < 0 && errno == EINTR)) + n = write(m2s, &c, 1); + if (n < 0) + rte_panic("cannot write on configuration pipe\n"); + + /* wait ack */ + do { + n = read(s2m, &c, 1); + } while (n < 0 && errno == EINTR); + + if (n <= 0) + rte_panic("cannot read on configuration pipe\n"); + + return 0; +} + +/* set affinity for current EAL thread */ +static int +eal_thread_set_affinity(void) +{ + unsigned lcore_id = rte_lcore_id(); + + /* acquire system unique id */ + rte_gettid(); + + /* update EAL thread core affinity */ + return rte_thread_set_affinity(&lcore_config[lcore_id].cpuset); +} + +void eal_thread_init_master(unsigned lcore_id) +{ + /* set the lcore ID in per-lcore memory area */ + RTE_PER_LCORE(_lcore_id) = lcore_id; + + /* set CPU affinity */ + if (eal_thread_set_affinity() < 0) + rte_panic("cannot set affinity\n"); +} + +/* main loop of threads */ +__attribute__((noreturn)) void * +eal_thread_loop(__attribute__((unused)) void *arg) +{ + char c; + int n, ret; + unsigned lcore_id; + pthread_t thread_id; + int m2s, s2m; + char cpuset[RTE_CPU_AFFINITY_STR_LEN]; + + thread_id = pthread_self(); + + /* retrieve our lcore_id from the configuration structure */ + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + if (thread_id == lcore_config[lcore_id].thread_id) + break; + } + if (lcore_id == RTE_MAX_LCORE) + rte_panic("cannot retrieve lcore id\n"); + + m2s = lcore_config[lcore_id].pipe_master2slave[0]; + s2m = lcore_config[lcore_id].pipe_slave2master[1]; + + /* set the lcore ID in per-lcore memory area */ + RTE_PER_LCORE(_lcore_id) = lcore_id; + + /* set CPU affinity */ + if 
(eal_thread_set_affinity() < 0) + rte_panic("cannot set affinity\n"); + + ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset)); + + RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%zx;cpuset=[%s%s])\n", + lcore_id, (uintptr_t)thread_id, cpuset, ret == 0 ? "" : "..."); + + /* read on our pipe to get commands */ + while (1) { + void *fct_arg; + + /* wait command */ + do { + n = read(m2s, &c, 1); + } while (n < 0 && errno == EINTR); + + if (n <= 0) + rte_panic("cannot read on configuration pipe\n"); + + lcore_config[lcore_id].state = RUNNING; + + /* send ack */ + n = 0; + while (n == 0 || (n < 0 && errno == EINTR)) + n = write(s2m, &c, 1); + if (n < 0) + rte_panic("cannot write on configuration pipe\n"); + + if (lcore_config[lcore_id].f == NULL) + rte_panic("NULL function pointer\n"); + + /* call the function and store the return value */ + fct_arg = lcore_config[lcore_id].arg; + ret = lcore_config[lcore_id].f(fct_arg); + lcore_config[lcore_id].ret = ret; + rte_wmb(); + + /* when a service core returns, it should go directly to WAIT + * state, because the application will not lcore_wait() for it. 
+ */ + if (lcore_config[lcore_id].core_role == ROLE_SERVICE) + lcore_config[lcore_id].state = WAIT; + else + lcore_config[lcore_id].state = FINISHED; + } + + /* never reached */ + /* pthread_exit(NULL); */ + /* return NULL; */ +} + +/* require calling thread tid by gettid() */ +int rte_sys_gettid(void) +{ + return (int)syscall(SYS_gettid); +} + +int rte_thread_setname(pthread_t id, const char *name) +{ + int ret = ENOSYS; +#if defined(__GLIBC__) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 12) + ret = pthread_setname_np(id, name); +#endif +#endif + RTE_SET_USED(id); + RTE_SET_USED(name); + return -ret; +} diff --git a/lib/librte_eal/linux/eal/eal_timer.c b/lib/librte_eal/linux/eal/eal_timer.c new file mode 100644 index 0000000000..bc8f051990 --- /dev/null +++ b/lib/librte_eal/linux/eal/eal_timer.c @@ -0,0 +1,266 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright(c) 2012-2013 6WIND S.A. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_internal_cfg.h" + +enum timer_source eal_timer_source = EAL_TIMER_HPET; + +#ifdef RTE_LIBEAL_USE_HPET + +#define DEV_HPET "/dev/hpet" + +/* Maximum number of counters. */ +#define HPET_TIMER_NUM 3 + +/* General capabilities register */ +#define CLK_PERIOD_SHIFT 32 /* Clock period shift. */ +#define CLK_PERIOD_MASK 0xffffffff00000000ULL /* Clock period mask. */ + +/** + * HPET timer registers. From the Intel IA-PC HPET (High Precision Event + * Timers) Specification. + */ +struct eal_hpet_regs { + /* Memory-mapped, software visible registers */ + uint64_t capabilities; /**< RO General Capabilities Register. */ + uint64_t reserved0; /**< Reserved for future use. */ + uint64_t config; /**< RW General Configuration Register. */ + uint64_t reserved1; /**< Reserved for future use. 
 */
	uint64_t isr;              /**< RW Clear General Interrupt Status. */
	uint64_t reserved2[25];    /**< Reserved for future use. */
	/* The main counter can be read as one 64-bit value or as two 32-bit
	 * halves; the code in this file only ever reads counter_l.
	 */
	union {
		uint64_t counter;      /**< RW Main Counter Value Register. */
		struct {
			uint32_t counter_l; /**< RW Main Counter Low. */
			uint32_t counter_h; /**< RW Main Counter High. */
		};
	};
	uint64_t reserved3;        /**< Reserved for future use. */
	struct {
		uint64_t config;    /**< RW Timer Config and Capability Reg. */
		uint64_t comp;      /**< RW Timer Comparator Value Register. */
		uint64_t fsb;       /**< RW FSB Interrupt Route Register. */
		uint64_t reserved4; /**< Reserved for future use. */
	} timers[HPET_TIMER_NUM]; /**< Set of HPET timers. */
};

/* Mmap'd hpet registers (read-only mapping of DEV_HPET; NULL until
 * rte_eal_hpet_init() has mapped the device).
 */
static volatile struct eal_hpet_regs *eal_hpet = NULL;

/* Period at which the HPET counter increments in
 * femtoseconds (10^-15 seconds). Taken from the upper 32 bits of the
 * capabilities register. */
static uint32_t eal_hpet_resolution_fs = 0;

/* Frequency of the HPET counter in Hz (10^15 / eal_hpet_resolution_fs) */
static uint64_t eal_hpet_resolution_hz = 0;

/* Software extension of the 32-bit counter: incremented 4 times during one
 * 32-bit hpet full count, i.e. each time the top two bits of counter_l
 * change. */
static uint32_t eal_hpet_msb;

/* Handle of the control thread that keeps eal_hpet_msb up to date */
static pthread_t msb_inc_thread_id;

/*
 * This function runs on a dedicated thread to update a global variable
 * (eal_hpet_msb) that is used to track the MSB of the HPET (unfortunately,
 * we need this because hpet is 32 bits by default under linux).
+ */ +static void * +hpet_msb_inc(__attribute__((unused)) void *arg) +{ + uint32_t t; + + while (1) { + t = (eal_hpet->counter_l >> 30); + if (t != (eal_hpet_msb & 3)) + eal_hpet_msb ++; + sleep(10); + } + return NULL; +} + +uint64_t +rte_get_hpet_hz(void) +{ + if(internal_config.no_hpet) + rte_panic("Error, HPET called, but no HPET present\n"); + + return eal_hpet_resolution_hz; +} + +uint64_t +rte_get_hpet_cycles(void) +{ + uint32_t t, msb; + uint64_t ret; + + if(internal_config.no_hpet) + rte_panic("Error, HPET called, but no HPET present\n"); + + t = eal_hpet->counter_l; + msb = eal_hpet_msb; + ret = (msb + 2 - (t >> 30)) / 4; + ret <<= 32; + ret += t; + return ret; +} + +#endif + +#ifdef RTE_LIBEAL_USE_HPET +/* + * Open and mmap /dev/hpet (high precision event timer) that will + * provide our time reference. + */ +int +rte_eal_hpet_init(int make_default) +{ + int fd, ret; + + if (internal_config.no_hpet) { + RTE_LOG(NOTICE, EAL, "HPET is disabled\n"); + return -1; + } + + fd = open(DEV_HPET, O_RDONLY); + if (fd < 0) { + RTE_LOG(ERR, EAL, "ERROR: Cannot open "DEV_HPET": %s!\n", + strerror(errno)); + internal_config.no_hpet = 1; + return -1; + } + eal_hpet = mmap(NULL, 1024, PROT_READ, MAP_SHARED, fd, 0); + if (eal_hpet == MAP_FAILED) { + RTE_LOG(ERR, EAL, "ERROR: Cannot mmap "DEV_HPET"!\n" + "Please enable CONFIG_HPET_MMAP in your kernel configuration " + "to allow HPET support.\n" + "To run without using HPET, set CONFIG_RTE_LIBEAL_USE_HPET=n " + "in your build configuration or use '--no-hpet' EAL flag.\n"); + close(fd); + internal_config.no_hpet = 1; + return -1; + } + close(fd); + + eal_hpet_resolution_fs = (uint32_t)((eal_hpet->capabilities & + CLK_PERIOD_MASK) >> + CLK_PERIOD_SHIFT); + + eal_hpet_resolution_hz = (1000ULL*1000ULL*1000ULL*1000ULL*1000ULL) / + (uint64_t)eal_hpet_resolution_fs; + + RTE_LOG(INFO, EAL, "HPET frequency is ~%"PRIu64" kHz\n", + eal_hpet_resolution_hz/1000); + + eal_hpet_msb = (eal_hpet->counter_l >> 30); + + /* create a thread that 
will increment a global variable for + * msb (hpet is 32 bits by default under linux) */ + ret = rte_ctrl_thread_create(&msb_inc_thread_id, "hpet-msb-inc", NULL, + hpet_msb_inc, NULL); + if (ret != 0) { + RTE_LOG(ERR, EAL, "ERROR: Cannot create HPET timer thread!\n"); + internal_config.no_hpet = 1; + return -1; + } + + if (make_default) + eal_timer_source = EAL_TIMER_HPET; + return 0; +} +#endif + +static void +check_tsc_flags(void) +{ + char line[512]; + FILE *stream; + + stream = fopen("/proc/cpuinfo", "r"); + if (!stream) { + RTE_LOG(WARNING, EAL, "WARNING: Unable to open /proc/cpuinfo\n"); + return; + } + + while (fgets(line, sizeof line, stream)) { + char *constant_tsc; + char *nonstop_tsc; + + if (strncmp(line, "flags", 5) != 0) + continue; + + constant_tsc = strstr(line, "constant_tsc"); + nonstop_tsc = strstr(line, "nonstop_tsc"); + if (!constant_tsc || !nonstop_tsc) + RTE_LOG(WARNING, EAL, + "WARNING: cpu flags " + "constant_tsc=%s " + "nonstop_tsc=%s " + "-> using unreliable clock cycles !\n", + constant_tsc ? "yes":"no", + nonstop_tsc ? 
"yes":"no"); + break; + } + + fclose(stream); +} + +uint64_t +get_tsc_freq(void) +{ +#ifdef CLOCK_MONOTONIC_RAW +#define NS_PER_SEC 1E9 + + struct timespec sleeptime = {.tv_nsec = NS_PER_SEC / 10 }; /* 1/10 second */ + + struct timespec t_start, t_end; + uint64_t tsc_hz; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &t_start) == 0) { + uint64_t ns, end, start = rte_rdtsc(); + nanosleep(&sleeptime,NULL); + clock_gettime(CLOCK_MONOTONIC_RAW, &t_end); + end = rte_rdtsc(); + ns = ((t_end.tv_sec - t_start.tv_sec) * NS_PER_SEC); + ns += (t_end.tv_nsec - t_start.tv_nsec); + + double secs = (double)ns/NS_PER_SEC; + tsc_hz = (uint64_t)((end - start)/secs); + return tsc_hz; + } +#endif + return 0; +} + +int +rte_eal_timer_init(void) +{ + + eal_timer_source = EAL_TIMER_TSC; + + set_tsc_freq(); + check_tsc_flags(); + return 0; +} diff --git a/lib/librte_eal/linux/eal/eal_vfio.c b/lib/librte_eal/linux/eal/eal_vfio.c new file mode 100644 index 0000000000..c821e83826 --- /dev/null +++ b/lib/librte_eal/linux/eal/eal_vfio.c @@ -0,0 +1,2049 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "eal_filesystem.h" +#include "eal_vfio.h" +#include "eal_private.h" + +#ifdef VFIO_PRESENT + +#define VFIO_MEM_EVENT_CLB_NAME "vfio_mem_event_clb" + +/* hot plug/unplug of VFIO groups may cause all DMA maps to be dropped. we can + * recreate the mappings for DPDK segments, but we cannot do so for memory that + * was registered by the user themselves, so we need to store the user mappings + * somewhere, to recreate them later. 
 */
/* capacity of the per-container table of user mappings */
#define VFIO_MAX_USER_MEM_MAPS 256
/* one user-registered DMA mapping; an all-zero entry is a "null" (unused)
 * slot, see is_null_map()
 */
struct user_mem_map {
	uint64_t addr; /* start virtual address */
	uint64_t iova; /* start IO virtual address */
	uint64_t len;  /* length, applies to both the VA and the IOVA range */
};

/* table of user mappings belonging to one VFIO container */
struct user_mem_maps {
	rte_spinlock_recursive_t lock; /* protects n_maps and maps[] */
	int n_maps;                    /* number of entries in use */
	struct user_mem_map maps[VFIO_MAX_USER_MEM_MAPS];
};

/* state kept for one VFIO container */
struct vfio_config {
	int vfio_enabled;       /* set once the container fd was obtained */
	int vfio_container_fd;  /* container descriptor, -1 when not open */
	int vfio_active_groups; /* number of used slots in vfio_groups[] */
	/* IOMMU type selected for the container, NULL until chosen */
	const struct vfio_iommu_type *vfio_iommu_type;
	/* group slots; group_num == -1 marks a free slot */
	struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
	/* user mappings, replayed when a container is (re)configured */
	struct user_mem_maps mem_maps;
};

/* per-process VFIO config */
static struct vfio_config vfio_cfgs[VFIO_MAX_CONTAINERS];
/* the first container doubles as the default one */
static struct vfio_config *default_vfio_cfg = &vfio_cfgs[0];

/* Per-IOMMU-type handlers referenced from iommu_types[] below. The
 * *_dma_map() variants take the container fd; the *_dma_mem_map() variants
 * are invoked as (container fd, vaddr, iova, len, do_map).
 */
static int vfio_type1_dma_map(int);
static int vfio_type1_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
static int vfio_spapr_dma_map(int);
static int vfio_spapr_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
static int vfio_noiommu_dma_map(int);
static int vfio_noiommu_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
static int vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr,
		uint64_t iova, uint64_t len, int do_map);

/* IOMMU types we support */
static const struct vfio_iommu_type iommu_types[] = {
	/* x86 IOMMU, otherwise known as type 1 */
	{
		.type_id = RTE_VFIO_TYPE1,
		.name = "Type 1",
		.dma_map_func = &vfio_type1_dma_map,
		.dma_user_map_func = &vfio_type1_dma_mem_map
	},
	/* ppc64 IOMMU, otherwise known as spapr */
	{
		.type_id = RTE_VFIO_SPAPR,
		.name = "sPAPR",
		.dma_map_func = &vfio_spapr_dma_map,
		.dma_user_map_func = &vfio_spapr_dma_mem_map
	},
	/* IOMMU-less mode */
	{
		.type_id = RTE_VFIO_NOIOMMU,
		.name = "No-IOMMU",
		.dma_map_func = &vfio_noiommu_dma_map,
		.dma_user_map_func = &vfio_noiommu_dma_mem_map
	},
};

/* return non-zero when the entry is an unused (all-zero) slot */
static int
is_null_map(const struct user_mem_map *map)
{
	return map->addr == 0 && map->iova == 0 && map->len == 0;
}

/* we may need to merge user mem maps together in case of user mapping/unmapping
 * chunks of memory, so we'll need a comparator
function to sort segments. + */ +static int +user_mem_map_cmp(const void *a, const void *b) +{ + const struct user_mem_map *umm_a = a; + const struct user_mem_map *umm_b = b; + + /* move null entries to end */ + if (is_null_map(umm_a)) + return 1; + if (is_null_map(umm_b)) + return -1; + + /* sort by iova first */ + if (umm_a->iova < umm_b->iova) + return -1; + if (umm_a->iova > umm_b->iova) + return 1; + + if (umm_a->addr < umm_b->addr) + return -1; + if (umm_a->addr > umm_b->addr) + return 1; + + if (umm_a->len < umm_b->len) + return -1; + if (umm_a->len > umm_b->len) + return 1; + + return 0; +} + +/* adjust user map entry. this may result in shortening of existing map, or in + * splitting existing map in two pieces. + */ +static void +adjust_map(struct user_mem_map *src, struct user_mem_map *end, + uint64_t remove_va_start, uint64_t remove_len) +{ + /* if va start is same as start address, we're simply moving start */ + if (remove_va_start == src->addr) { + src->addr += remove_len; + src->iova += remove_len; + src->len -= remove_len; + } else if (remove_va_start + remove_len == src->addr + src->len) { + /* we're shrinking mapping from the end */ + src->len -= remove_len; + } else { + /* we're blowing a hole in the middle */ + struct user_mem_map tmp; + uint64_t total_len = src->len; + + /* adjust source segment length */ + src->len = remove_va_start - src->addr; + + /* create temporary segment in the middle */ + tmp.addr = src->addr + src->len; + tmp.iova = src->iova + src->len; + tmp.len = remove_len; + + /* populate end segment - this one we will be keeping */ + end->addr = tmp.addr + tmp.len; + end->iova = tmp.iova + tmp.len; + end->len = total_len - src->len - tmp.len; + } +} + +/* try merging two maps into one, return 1 if succeeded */ +static int +merge_map(struct user_mem_map *left, struct user_mem_map *right) +{ + if (left->addr + left->len != right->addr) + return 0; + if (left->iova + left->len != right->iova) + return 0; + + left->len += right->len; 
	/* right has been absorbed into left: turn it into a null entry */
	memset(right, 0, sizeof(*right));

	return 1;
}

/* Return the first entry of user_mem_maps that contains both the VA range
 * [addr, addr + len) and the IOVA range [iova, iova + len), or NULL when no
 * entry does.
 * NOTE(review): the two ranges are checked independently; nothing here
 * verifies that the VA and IOVA offsets into the entry are equal.
 */
static struct user_mem_map *
find_user_mem_map(struct user_mem_maps *user_mem_maps, uint64_t addr,
		uint64_t iova, uint64_t len)
{
	uint64_t va_end = addr + len;
	uint64_t iova_end = iova + len;
	int i;

	for (i = 0; i < user_mem_maps->n_maps; i++) {
		struct user_mem_map *map = &user_mem_maps->maps[i];
		uint64_t map_va_end = map->addr + map->len;
		uint64_t map_iova_end = map->iova + map->len;

		/* check start VA */
		if (addr < map->addr || addr >= map_va_end)
			continue;
		/* check if VA end is within boundaries */
		if (va_end <= map->addr || va_end > map_va_end)
			continue;

		/* check start IOVA */
		if (iova < map->iova || iova >= map_iova_end)
			continue;
		/* check if IOVA end is within boundaries */
		if (iova_end <= map->iova || iova_end > map_iova_end)
			continue;

		/* we've found our map */
		return map;
	}
	return NULL;
}

/* this will sort all user maps, and merge/compact any adjacent maps */
static void
compact_user_maps(struct user_mem_maps *user_mem_maps)
{
	int i, n_merged, cur_idx;

	/* sorting puts null entries last and neighbours next to each other */
	qsort(user_mem_maps->maps, user_mem_maps->n_maps,
			sizeof(user_mem_maps->maps[0]), user_mem_map_cmp);

	/* we'll go over the list backwards when merging */
	n_merged = 0;
	for (i = user_mem_maps->n_maps - 2; i >= 0; i--) {
		struct user_mem_map *l, *r;

		l = &user_mem_maps->maps[i];
		r = &user_mem_maps->maps[i + 1];

		/* entries already emptied by a previous merge are skipped */
		if (is_null_map(l) || is_null_map(r))
			continue;

		if (merge_map(l, r))
			n_merged++;
	}

	/* the entries are still sorted, but now they have holes in them, so
	 * walk through the list and remove the holes
	 */
	if (n_merged > 0) {
		cur_idx = 0;
		for (i = 0; i < user_mem_maps->n_maps; i++) {
			if (!is_null_map(&user_mem_maps->maps[i])) {
				struct user_mem_map *src, *dst;

				src = &user_mem_maps->maps[i];
				dst = &user_mem_maps->maps[cur_idx++];

				/* slide the live entry down into the first
				 * free slot and clear where it came from
				 */
				if (src != dst) {
					memcpy(dst, src, sizeof(*src));
					memset(src, 0, sizeof(*src));
				}
			}
		}
user_mem_maps->n_maps = cur_idx; + } +} + +static int +vfio_open_group_fd(int iommu_group_num) +{ + int vfio_group_fd; + char filename[PATH_MAX]; + struct rte_mp_msg mp_req, *mp_rep; + struct rte_mp_reply mp_reply; + struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; + struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; + + /* if primary, try to open the group */ + if (internal_config.process_type == RTE_PROC_PRIMARY) { + /* try regular group format */ + snprintf(filename, sizeof(filename), + VFIO_GROUP_FMT, iommu_group_num); + vfio_group_fd = open(filename, O_RDWR); + if (vfio_group_fd < 0) { + /* if file not found, it's not an error */ + if (errno != ENOENT) { + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename, + strerror(errno)); + return -1; + } + + /* special case: try no-IOMMU path as well */ + snprintf(filename, sizeof(filename), + VFIO_NOIOMMU_GROUP_FMT, + iommu_group_num); + vfio_group_fd = open(filename, O_RDWR); + if (vfio_group_fd < 0) { + if (errno != ENOENT) { + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename, + strerror(errno)); + return -1; + } + return 0; + } + /* noiommu group found */ + } + + return vfio_group_fd; + } + /* if we're in a secondary process, request group fd from the primary + * process via mp channel. 
+ */ + p->req = SOCKET_REQ_GROUP; + p->group_num = iommu_group_num; + strcpy(mp_req.name, EAL_VFIO_MP); + mp_req.len_param = sizeof(*p); + mp_req.num_fds = 0; + + vfio_group_fd = -1; + if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && + mp_reply.nb_received == 1) { + mp_rep = &mp_reply.msgs[0]; + p = (struct vfio_mp_param *)mp_rep->param; + if (p->result == SOCKET_OK && mp_rep->num_fds == 1) { + vfio_group_fd = mp_rep->fds[0]; + } else if (p->result == SOCKET_NO_FD) { + RTE_LOG(ERR, EAL, " bad VFIO group fd\n"); + vfio_group_fd = 0; + } + free(mp_reply.msgs); + } + + if (vfio_group_fd < 0) + RTE_LOG(ERR, EAL, " cannot request group fd\n"); + return vfio_group_fd; +} + +static struct vfio_config * +get_vfio_cfg_by_group_num(int iommu_group_num) +{ + struct vfio_config *vfio_cfg; + int i, j; + + for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { + vfio_cfg = &vfio_cfgs[i]; + for (j = 0; j < VFIO_MAX_GROUPS; j++) { + if (vfio_cfg->vfio_groups[j].group_num == + iommu_group_num) + return vfio_cfg; + } + } + + return NULL; +} + +static int +vfio_get_group_fd(struct vfio_config *vfio_cfg, + int iommu_group_num) +{ + int i; + int vfio_group_fd; + struct vfio_group *cur_grp; + + /* check if we already have the group descriptor open */ + for (i = 0; i < VFIO_MAX_GROUPS; i++) + if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num) + return vfio_cfg->vfio_groups[i].fd; + + /* Lets see first if there is room for a new group */ + if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) { + RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n"); + return -1; + } + + /* Now lets get an index for the new group */ + for (i = 0; i < VFIO_MAX_GROUPS; i++) + if (vfio_cfg->vfio_groups[i].group_num == -1) { + cur_grp = &vfio_cfg->vfio_groups[i]; + break; + } + + /* This should not happen */ + if (i == VFIO_MAX_GROUPS) { + RTE_LOG(ERR, EAL, "No VFIO group free slot found\n"); + return -1; + } + + vfio_group_fd = vfio_open_group_fd(iommu_group_num); + if (vfio_group_fd < 0) { + 
RTE_LOG(ERR, EAL, "Failed to open group %d\n", iommu_group_num); + return -1; + } + + cur_grp->group_num = iommu_group_num; + cur_grp->fd = vfio_group_fd; + vfio_cfg->vfio_active_groups++; + + return vfio_group_fd; +} + +static struct vfio_config * +get_vfio_cfg_by_group_fd(int vfio_group_fd) +{ + struct vfio_config *vfio_cfg; + int i, j; + + for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { + vfio_cfg = &vfio_cfgs[i]; + for (j = 0; j < VFIO_MAX_GROUPS; j++) + if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd) + return vfio_cfg; + } + + return NULL; +} + +static struct vfio_config * +get_vfio_cfg_by_container_fd(int container_fd) +{ + int i; + + for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { + if (vfio_cfgs[i].vfio_container_fd == container_fd) + return &vfio_cfgs[i]; + } + + return NULL; +} + +int +rte_vfio_get_group_fd(int iommu_group_num) +{ + struct vfio_config *vfio_cfg; + + /* get the vfio_config it belongs to */ + vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num); + vfio_cfg = vfio_cfg ? 
vfio_cfg : default_vfio_cfg; + + return vfio_get_group_fd(vfio_cfg, iommu_group_num); +} + +static int +get_vfio_group_idx(int vfio_group_fd) +{ + struct vfio_config *vfio_cfg; + int i, j; + + for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { + vfio_cfg = &vfio_cfgs[i]; + for (j = 0; j < VFIO_MAX_GROUPS; j++) + if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd) + return j; + } + + return -1; +} + +static void +vfio_group_device_get(int vfio_group_fd) +{ + struct vfio_config *vfio_cfg; + int i; + + vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, " invalid group fd!\n"); + return; + } + + i = get_vfio_group_idx(vfio_group_fd); + if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) + RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i); + else + vfio_cfg->vfio_groups[i].devices++; +} + +static void +vfio_group_device_put(int vfio_group_fd) +{ + struct vfio_config *vfio_cfg; + int i; + + vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, " invalid group fd!\n"); + return; + } + + i = get_vfio_group_idx(vfio_group_fd); + if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) + RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i); + else + vfio_cfg->vfio_groups[i].devices--; +} + +static int +vfio_group_device_count(int vfio_group_fd) +{ + struct vfio_config *vfio_cfg; + int i; + + vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, " invalid group fd!\n"); + return -1; + } + + i = get_vfio_group_idx(vfio_group_fd); + if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) { + RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i); + return -1; + } + + return vfio_cfg->vfio_groups[i].devices; +} + +static void +vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len, + void *arg __rte_unused) +{ + struct rte_memseg_list *msl; + struct rte_memseg *ms; + size_t cur_len = 0; + + msl = rte_mem_virt2memseg_list(addr); + + /* for IOVA as VA mode, no need to 
care for IOVA addresses */ + if (rte_eal_iova_mode() == RTE_IOVA_VA && msl->external == 0) { + uint64_t vfio_va = (uint64_t)(uintptr_t)addr; + if (type == RTE_MEM_EVENT_ALLOC) + vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va, + len, 1); + else + vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va, + len, 0); + return; + } + + /* memsegs are contiguous in memory */ + ms = rte_mem_virt2memseg(addr, msl); + while (cur_len < len) { + /* some memory segments may have invalid IOVA */ + if (ms->iova == RTE_BAD_IOVA) { + RTE_LOG(DEBUG, EAL, "Memory segment at %p has bad IOVA, skipping\n", + ms->addr); + goto next; + } + if (type == RTE_MEM_EVENT_ALLOC) + vfio_dma_mem_map(default_vfio_cfg, ms->addr_64, + ms->iova, ms->len, 1); + else + vfio_dma_mem_map(default_vfio_cfg, ms->addr_64, + ms->iova, ms->len, 0); +next: + cur_len += ms->len; + ++ms; + } +} + +static int +vfio_sync_default_container(void) +{ + struct rte_mp_msg mp_req, *mp_rep; + struct rte_mp_reply mp_reply; + struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; + struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; + int iommu_type_id; + unsigned int i; + + /* cannot be called from primary */ + if (rte_eal_process_type() != RTE_PROC_SECONDARY) + return -1; + + /* default container fd should have been opened in rte_vfio_enable() */ + if (!default_vfio_cfg->vfio_enabled || + default_vfio_cfg->vfio_container_fd < 0) { + RTE_LOG(ERR, EAL, "VFIO support is not initialized\n"); + return -1; + } + + /* find default container's IOMMU type */ + p->req = SOCKET_REQ_IOMMU_TYPE; + strcpy(mp_req.name, EAL_VFIO_MP); + mp_req.len_param = sizeof(*p); + mp_req.num_fds = 0; + + iommu_type_id = -1; + if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && + mp_reply.nb_received == 1) { + mp_rep = &mp_reply.msgs[0]; + p = (struct vfio_mp_param *)mp_rep->param; + if (p->result == SOCKET_OK) + iommu_type_id = p->iommu_type_id; + free(mp_reply.msgs); + } + if (iommu_type_id < 0) { + RTE_LOG(ERR, EAL, "Could not get IOMMU 
type for default container\n"); + return -1; + } + + /* we now have an fd for default container, as well as its IOMMU type. + * now, set up default VFIO container config to match. + */ + for (i = 0; i < RTE_DIM(iommu_types); i++) { + const struct vfio_iommu_type *t = &iommu_types[i]; + if (t->type_id != iommu_type_id) + continue; + + /* we found our IOMMU type */ + default_vfio_cfg->vfio_iommu_type = t; + + return 0; + } + RTE_LOG(ERR, EAL, "Could not find IOMMU type id (%i)\n", + iommu_type_id); + return -1; +} + +int +rte_vfio_clear_group(int vfio_group_fd) +{ + int i; + struct vfio_config *vfio_cfg; + + vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, " invalid group fd!\n"); + return -1; + } + + i = get_vfio_group_idx(vfio_group_fd); + if (i < 0) + return -1; + vfio_cfg->vfio_groups[i].group_num = -1; + vfio_cfg->vfio_groups[i].fd = -1; + vfio_cfg->vfio_groups[i].devices = 0; + vfio_cfg->vfio_active_groups--; + + return 0; +} + +int +rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr, + int *vfio_dev_fd, struct vfio_device_info *device_info) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_rwlock_t *mem_lock = &mcfg->memory_hotplug_lock; + struct vfio_group_status group_status = { + .argsz = sizeof(group_status) + }; + struct vfio_config *vfio_cfg; + struct user_mem_maps *user_mem_maps; + int vfio_container_fd; + int vfio_group_fd; + int iommu_group_num; + int i, ret; + + /* get group number */ + ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num); + if (ret == 0) { + RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n", + dev_addr); + return 1; + } + + /* if negative, something failed */ + if (ret < 0) + return -1; + + /* get the actual group fd */ + vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num); + if (vfio_group_fd < 0) + return -1; + + /* if group_fd == 0, that means the device isn't managed by VFIO */ + if (vfio_group_fd 
== 0) { + RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n", + dev_addr); + return 1; + } + + /* + * at this point, we know that this group is viable (meaning, all devices + * are either bound to VFIO or not bound to anything) + */ + + /* check if the group is viable */ + ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status); + if (ret) { + RTE_LOG(ERR, EAL, " %s cannot get group status, " + "error %i (%s)\n", dev_addr, errno, strerror(errno)); + close(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); + return -1; + } else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) { + RTE_LOG(ERR, EAL, " %s VFIO group is not viable!\n", dev_addr); + close(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); + return -1; + } + + /* get the vfio_config it belongs to */ + vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num); + vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg; + vfio_container_fd = vfio_cfg->vfio_container_fd; + user_mem_maps = &vfio_cfg->mem_maps; + + /* check if group does not have a container yet */ + if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) { + + /* add group to a container */ + ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER, + &vfio_container_fd); + if (ret) { + RTE_LOG(ERR, EAL, " %s cannot add VFIO group to container, " + "error %i (%s)\n", dev_addr, errno, strerror(errno)); + close(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); + return -1; + } + + /* + * pick an IOMMU type and set up DMA mappings for container + * + * needs to be done only once, only when first group is + * assigned to a container and only in primary process. + * Note this can happen several times with the hotplug + * functionality. 
+ */ + if (internal_config.process_type == RTE_PROC_PRIMARY && + vfio_cfg->vfio_active_groups == 1 && + vfio_group_device_count(vfio_group_fd) == 0) { + const struct vfio_iommu_type *t; + + /* select an IOMMU type which we will be using */ + t = vfio_set_iommu_type(vfio_container_fd); + if (!t) { + RTE_LOG(ERR, EAL, + " %s failed to select IOMMU type\n", + dev_addr); + close(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); + return -1; + } + /* lock memory hotplug before mapping and release it + * after registering callback, to prevent races + */ + rte_rwlock_read_lock(mem_lock); + if (vfio_cfg == default_vfio_cfg) + ret = t->dma_map_func(vfio_container_fd); + else + ret = 0; + if (ret) { + RTE_LOG(ERR, EAL, + " %s DMA remapping failed, error %i (%s)\n", + dev_addr, errno, strerror(errno)); + close(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); + rte_rwlock_read_unlock(mem_lock); + return -1; + } + + vfio_cfg->vfio_iommu_type = t; + + /* re-map all user-mapped segments */ + rte_spinlock_recursive_lock(&user_mem_maps->lock); + + /* this IOMMU type may not support DMA mapping, but + * if we have mappings in the list - that means we have + * previously mapped something successfully, so we can + * be sure that DMA mapping is supported. 
+ */ + for (i = 0; i < user_mem_maps->n_maps; i++) { + struct user_mem_map *map; + map = &user_mem_maps->maps[i]; + + ret = t->dma_user_map_func( + vfio_container_fd, + map->addr, map->iova, map->len, + 1); + if (ret) { + RTE_LOG(ERR, EAL, "Couldn't map user memory for DMA: " + "va: 0x%" PRIx64 " " + "iova: 0x%" PRIx64 " " + "len: 0x%" PRIu64 "\n", + map->addr, map->iova, + map->len); + rte_spinlock_recursive_unlock( + &user_mem_maps->lock); + rte_rwlock_read_unlock(mem_lock); + return -1; + } + } + rte_spinlock_recursive_unlock(&user_mem_maps->lock); + + /* register callback for mem events */ + if (vfio_cfg == default_vfio_cfg) + ret = rte_mem_event_callback_register( + VFIO_MEM_EVENT_CLB_NAME, + vfio_mem_event_callback, NULL); + else + ret = 0; + /* unlock memory hotplug */ + rte_rwlock_read_unlock(mem_lock); + + if (ret && rte_errno != ENOTSUP) { + RTE_LOG(ERR, EAL, "Could not install memory event callback for VFIO\n"); + return -1; + } + if (ret) + RTE_LOG(DEBUG, EAL, "Memory event callbacks not supported\n"); + else + RTE_LOG(DEBUG, EAL, "Installed memory event callback for VFIO\n"); + } + } else if (rte_eal_process_type() != RTE_PROC_PRIMARY && + vfio_cfg == default_vfio_cfg && + vfio_cfg->vfio_iommu_type == NULL) { + /* if we're not a primary process, we do not set up the VFIO + * container because it's already been set up by the primary + * process. instead, we simply ask the primary about VFIO type + * we are using, and set the VFIO config up appropriately. 
+ */ + ret = vfio_sync_default_container(); + if (ret < 0) { + RTE_LOG(ERR, EAL, "Could not sync default VFIO container\n"); + close(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); + return -1; + } + /* we have successfully initialized VFIO, notify user */ + const struct vfio_iommu_type *t = + default_vfio_cfg->vfio_iommu_type; + RTE_LOG(NOTICE, EAL, " using IOMMU type %d (%s)\n", + t->type_id, t->name); + } + + /* get a file descriptor for the device */ + *vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr); + if (*vfio_dev_fd < 0) { + /* if we cannot get a device fd, this implies a problem with + * the VFIO group or the container not having IOMMU configured. + */ + + RTE_LOG(WARNING, EAL, "Getting a vfio_dev_fd for %s failed\n", + dev_addr); + close(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); + return -1; + } + + /* test and setup the device */ + ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info); + if (ret) { + RTE_LOG(ERR, EAL, " %s cannot get device info, " + "error %i (%s)\n", dev_addr, errno, + strerror(errno)); + close(*vfio_dev_fd); + close(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); + return -1; + } + vfio_group_device_get(vfio_group_fd); + + return 0; +} + +int +rte_vfio_release_device(const char *sysfs_base, const char *dev_addr, + int vfio_dev_fd) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + rte_rwlock_t *mem_lock = &mcfg->memory_hotplug_lock; + struct vfio_group_status group_status = { + .argsz = sizeof(group_status) + }; + struct vfio_config *vfio_cfg; + int vfio_group_fd; + int iommu_group_num; + int ret; + + /* we don't want any DMA mapping messages to come while we're detaching + * VFIO device, because this might be the last device and we might need + * to unregister the callback. 
+ */ + rte_rwlock_read_lock(mem_lock); + + /* get group number */ + ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num); + if (ret <= 0) { + RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver\n", + dev_addr); + /* This is an error at this point. */ + ret = -1; + goto out; + } + + /* get the actual group fd */ + vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num); + if (vfio_group_fd <= 0) { + RTE_LOG(INFO, EAL, "rte_vfio_get_group_fd failed for %s\n", + dev_addr); + ret = -1; + goto out; + } + + /* get the vfio_config it belongs to */ + vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num); + vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg; + + /* At this point we got an active group. Closing it will make the + * container detachment. If this is the last active group, VFIO kernel + * code will unset the container and the IOMMU mappings. + */ + + /* Closing a device */ + if (close(vfio_dev_fd) < 0) { + RTE_LOG(INFO, EAL, "Error when closing vfio_dev_fd for %s\n", + dev_addr); + ret = -1; + goto out; + } + + /* An VFIO group can have several devices attached. Just when there is + * no devices remaining should the group be closed. + */ + vfio_group_device_put(vfio_group_fd); + if (!vfio_group_device_count(vfio_group_fd)) { + + if (close(vfio_group_fd) < 0) { + RTE_LOG(INFO, EAL, "Error when closing vfio_group_fd for %s\n", + dev_addr); + ret = -1; + goto out; + } + + if (rte_vfio_clear_group(vfio_group_fd) < 0) { + RTE_LOG(INFO, EAL, "Error when clearing group for %s\n", + dev_addr); + ret = -1; + goto out; + } + } + + /* if there are no active device groups, unregister the callback to + * avoid spurious attempts to map/unmap memory from VFIO. 
+ */ + if (vfio_cfg == default_vfio_cfg && vfio_cfg->vfio_active_groups == 0 && + rte_eal_process_type() != RTE_PROC_SECONDARY) + rte_mem_event_callback_unregister(VFIO_MEM_EVENT_CLB_NAME, + NULL); + + /* success */ + ret = 0; + +out: + rte_rwlock_read_unlock(mem_lock); + return ret; +} + +int +rte_vfio_enable(const char *modname) +{ + /* initialize group list */ + int i, j; + int vfio_available; + + rte_spinlock_recursive_t lock = RTE_SPINLOCK_RECURSIVE_INITIALIZER; + + for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { + vfio_cfgs[i].vfio_container_fd = -1; + vfio_cfgs[i].vfio_active_groups = 0; + vfio_cfgs[i].vfio_iommu_type = NULL; + vfio_cfgs[i].mem_maps.lock = lock; + + for (j = 0; j < VFIO_MAX_GROUPS; j++) { + vfio_cfgs[i].vfio_groups[j].fd = -1; + vfio_cfgs[i].vfio_groups[j].group_num = -1; + vfio_cfgs[i].vfio_groups[j].devices = 0; + } + } + + /* inform the user that we are probing for VFIO */ + RTE_LOG(INFO, EAL, "Probing VFIO support...\n"); + + /* check if vfio module is loaded */ + vfio_available = rte_eal_check_module(modname); + + /* return error directly */ + if (vfio_available == -1) { + RTE_LOG(INFO, EAL, "Could not get loaded module details!\n"); + return -1; + } + + /* return 0 if VFIO modules not loaded */ + if (vfio_available == 0) { + RTE_LOG(DEBUG, EAL, "VFIO modules not loaded, " + "skipping VFIO support...\n"); + return 0; + } + + if (internal_config.process_type == RTE_PROC_PRIMARY) { + /* open a new container */ + default_vfio_cfg->vfio_container_fd = + rte_vfio_get_container_fd(); + } else { + /* get the default container from the primary process */ + default_vfio_cfg->vfio_container_fd = + vfio_get_default_container_fd(); + } + + /* check if we have VFIO driver enabled */ + if (default_vfio_cfg->vfio_container_fd != -1) { + RTE_LOG(NOTICE, EAL, "VFIO support initialized\n"); + default_vfio_cfg->vfio_enabled = 1; + } else { + RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n"); + } + + return 0; +} + +int +rte_vfio_is_enabled(const 
char *modname) +{ + const int mod_available = rte_eal_check_module(modname) > 0; + return default_vfio_cfg->vfio_enabled && mod_available; +} + +int +vfio_get_default_container_fd(void) +{ + struct rte_mp_msg mp_req, *mp_rep; + struct rte_mp_reply mp_reply; + struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; + struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; + + if (default_vfio_cfg->vfio_enabled) + return default_vfio_cfg->vfio_container_fd; + + if (internal_config.process_type == RTE_PROC_PRIMARY) { + /* if we were secondary process we would try requesting + * container fd from the primary, but we're the primary + * process so just exit here + */ + return -1; + } + + p->req = SOCKET_REQ_DEFAULT_CONTAINER; + strcpy(mp_req.name, EAL_VFIO_MP); + mp_req.len_param = sizeof(*p); + mp_req.num_fds = 0; + + if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && + mp_reply.nb_received == 1) { + mp_rep = &mp_reply.msgs[0]; + p = (struct vfio_mp_param *)mp_rep->param; + if (p->result == SOCKET_OK && mp_rep->num_fds == 1) { + free(mp_reply.msgs); + return mp_rep->fds[0]; + } + free(mp_reply.msgs); + } + + RTE_LOG(ERR, EAL, " cannot request default container fd\n"); + return -1; +} + +int +vfio_get_iommu_type(void) +{ + if (default_vfio_cfg->vfio_iommu_type == NULL) + return -1; + + return default_vfio_cfg->vfio_iommu_type->type_id; +} + +const struct vfio_iommu_type * +vfio_set_iommu_type(int vfio_container_fd) +{ + unsigned idx; + for (idx = 0; idx < RTE_DIM(iommu_types); idx++) { + const struct vfio_iommu_type *t = &iommu_types[idx]; + + int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU, + t->type_id); + if (!ret) { + RTE_LOG(NOTICE, EAL, " using IOMMU type %d (%s)\n", + t->type_id, t->name); + return t; + } + /* not an error, there may be more supported IOMMU types */ + RTE_LOG(DEBUG, EAL, " set IOMMU type %d (%s) failed, " + "error %i (%s)\n", t->type_id, t->name, errno, + strerror(errno)); + } + /* if we didn't find a suitable IOMMU type, fail */ + 
return NULL; +} + +int +vfio_has_supported_extensions(int vfio_container_fd) +{ + int ret; + unsigned idx, n_extensions = 0; + for (idx = 0; idx < RTE_DIM(iommu_types); idx++) { + const struct vfio_iommu_type *t = &iommu_types[idx]; + + ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION, + t->type_id); + if (ret < 0) { + RTE_LOG(ERR, EAL, " could not get IOMMU type, " + "error %i (%s)\n", errno, + strerror(errno)); + close(vfio_container_fd); + return -1; + } else if (ret == 1) { + /* we found a supported extension */ + n_extensions++; + } + RTE_LOG(DEBUG, EAL, " IOMMU type %d (%s) is %s\n", + t->type_id, t->name, + ret ? "supported" : "not supported"); + } + + /* if we didn't find any supported IOMMU types, fail */ + if (!n_extensions) { + close(vfio_container_fd); + return -1; + } + + return 0; +} + +int +rte_vfio_get_container_fd(void) +{ + int ret, vfio_container_fd; + struct rte_mp_msg mp_req, *mp_rep; + struct rte_mp_reply mp_reply; + struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; + struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; + + + /* if we're in a primary process, try to open the container */ + if (internal_config.process_type == RTE_PROC_PRIMARY) { + vfio_container_fd = open(VFIO_CONTAINER_PATH, O_RDWR); + if (vfio_container_fd < 0) { + RTE_LOG(ERR, EAL, " cannot open VFIO container, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + + /* check VFIO API version */ + ret = ioctl(vfio_container_fd, VFIO_GET_API_VERSION); + if (ret != VFIO_API_VERSION) { + if (ret < 0) + RTE_LOG(ERR, EAL, " could not get VFIO API version, " + "error %i (%s)\n", errno, strerror(errno)); + else + RTE_LOG(ERR, EAL, " unsupported VFIO API version!\n"); + close(vfio_container_fd); + return -1; + } + + ret = vfio_has_supported_extensions(vfio_container_fd); + if (ret) { + RTE_LOG(ERR, EAL, " no supported IOMMU " + "extensions found!\n"); + return -1; + } + + return vfio_container_fd; + } + /* + * if we're in a secondary process, request 
container fd from the + * primary process via mp channel + */ + p->req = SOCKET_REQ_CONTAINER; + strcpy(mp_req.name, EAL_VFIO_MP); + mp_req.len_param = sizeof(*p); + mp_req.num_fds = 0; + + vfio_container_fd = -1; + if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && + mp_reply.nb_received == 1) { + mp_rep = &mp_reply.msgs[0]; + p = (struct vfio_mp_param *)mp_rep->param; + if (p->result == SOCKET_OK && mp_rep->num_fds == 1) { + vfio_container_fd = mp_rep->fds[0]; + free(mp_reply.msgs); + return vfio_container_fd; + } + free(mp_reply.msgs); + } + + RTE_LOG(ERR, EAL, " cannot request container fd\n"); + return -1; +} + +int +rte_vfio_get_group_num(const char *sysfs_base, + const char *dev_addr, int *iommu_group_num) +{ + char linkname[PATH_MAX]; + char filename[PATH_MAX]; + char *tok[16], *group_tok, *end; + int ret; + + memset(linkname, 0, sizeof(linkname)); + memset(filename, 0, sizeof(filename)); + + /* try to find out IOMMU group for this device */ + snprintf(linkname, sizeof(linkname), + "%s/%s/iommu_group", sysfs_base, dev_addr); + + ret = readlink(linkname, filename, sizeof(filename)); + + /* if the link doesn't exist, no VFIO for us */ + if (ret < 0) + return 0; + + ret = rte_strsplit(filename, sizeof(filename), + tok, RTE_DIM(tok), '/'); + + if (ret <= 0) { + RTE_LOG(ERR, EAL, " %s cannot get IOMMU group\n", dev_addr); + return -1; + } + + /* IOMMU group is always the last token */ + errno = 0; + group_tok = tok[ret - 1]; + end = group_tok; + *iommu_group_num = strtol(group_tok, &end, 10); + if ((end != group_tok && *end != '\0') || errno != 0) { + RTE_LOG(ERR, EAL, " %s error parsing IOMMU number!\n", dev_addr); + return -1; + } + + return 1; +} + +static int +type1_map(const struct rte_memseg_list *msl, const struct rte_memseg *ms, + void *arg) +{ + int *vfio_container_fd = arg; + + if (msl->external) + return 0; + + return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova, + ms->len, 1); +} + +static int +vfio_type1_dma_mem_map(int 
vfio_container_fd, uint64_t vaddr, uint64_t iova, + uint64_t len, int do_map) +{ + struct vfio_iommu_type1_dma_map dma_map; + struct vfio_iommu_type1_dma_unmap dma_unmap; + int ret; + + if (do_map != 0) { + memset(&dma_map, 0, sizeof(dma_map)); + dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); + dma_map.vaddr = vaddr; + dma_map.size = len; + dma_map.iova = iova; + dma_map.flags = VFIO_DMA_MAP_FLAG_READ | + VFIO_DMA_MAP_FLAG_WRITE; + + ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); + if (ret) { + RTE_LOG(ERR, EAL, " cannot set up DMA remapping, error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + } else { + memset(&dma_unmap, 0, sizeof(dma_unmap)); + dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap); + dma_unmap.size = len; + dma_unmap.iova = iova; + + ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA, + &dma_unmap); + if (ret) { + RTE_LOG(ERR, EAL, " cannot clear DMA remapping, error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + } + + return 0; +} + +static int +vfio_type1_dma_map(int vfio_container_fd) +{ + return rte_memseg_walk(type1_map, &vfio_container_fd); +} + +static int +vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, + uint64_t len, int do_map) +{ + struct vfio_iommu_type1_dma_map dma_map; + struct vfio_iommu_type1_dma_unmap dma_unmap; + int ret; + struct vfio_iommu_spapr_register_memory reg = { + .argsz = sizeof(reg), + .flags = 0 + }; + reg.vaddr = (uintptr_t) vaddr; + reg.size = len; + + if (do_map != 0) { + ret = ioctl(vfio_container_fd, + VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg); + if (ret) { + RTE_LOG(ERR, EAL, " cannot register vaddr for IOMMU, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + + memset(&dma_map, 0, sizeof(dma_map)); + dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); + dma_map.vaddr = vaddr; + dma_map.size = len; + dma_map.iova = iova; + dma_map.flags = VFIO_DMA_MAP_FLAG_READ | + VFIO_DMA_MAP_FLAG_WRITE; + + ret =
ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); + if (ret) { + RTE_LOG(ERR, EAL, " cannot set up DMA remapping, error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + + } else { + ret = ioctl(vfio_container_fd, + VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg); + if (ret) { + RTE_LOG(ERR, EAL, " cannot unregister vaddr for IOMMU, error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + + memset(&dma_unmap, 0, sizeof(dma_unmap)); + dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap); + dma_unmap.size = len; + dma_unmap.iova = iova; + + ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA, + &dma_unmap); + if (ret) { + RTE_LOG(ERR, EAL, " cannot clear DMA remapping, error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + } + + return 0; +} + +static int +vfio_spapr_map_walk(const struct rte_memseg_list *msl, + const struct rte_memseg *ms, void *arg) +{ + int *vfio_container_fd = arg; + + if (msl->external) + return 0; + + return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova, + ms->len, 1); +} + +struct spapr_walk_param { + uint64_t window_size; + uint64_t hugepage_sz; +}; +static int +vfio_spapr_window_size_walk(const struct rte_memseg_list *msl, + const struct rte_memseg *ms, void *arg) +{ + struct spapr_walk_param *param = arg; + uint64_t max = ms->iova + ms->len; + + if (msl->external) + return 0; + + if (max > param->window_size) { + param->hugepage_sz = ms->hugepage_sz; + param->window_size = max; + } + + return 0; +} + +static int +vfio_spapr_create_new_dma_window(int vfio_container_fd, + struct vfio_iommu_spapr_tce_create *create) { + struct vfio_iommu_spapr_tce_remove remove = { + .argsz = sizeof(remove), + }; + struct vfio_iommu_spapr_tce_info info = { + .argsz = sizeof(info), + }; + int ret; + + /* query spapr iommu info */ + ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info); + if (ret) { + RTE_LOG(ERR, EAL, " cannot get iommu info, " + "error %i (%s)\n", errno, strerror(errno)); + return
-1; + } + + /* remove default DMA of 32 bit window */ + remove.start_addr = info.dma32_window_start; + ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); + if (ret) { + RTE_LOG(ERR, EAL, " cannot remove default DMA window, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + + /* create new DMA window */ + ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, create); + if (ret) { + RTE_LOG(ERR, EAL, " cannot create new DMA window, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + + if (create->start_addr != 0) { + RTE_LOG(ERR, EAL, " DMA window start address != 0\n"); + return -1; + } + + return 0; +} + +static int +vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, + uint64_t len, int do_map) +{ + struct spapr_walk_param param; + struct vfio_iommu_spapr_tce_create create = { + .argsz = sizeof(create), + }; + struct vfio_config *vfio_cfg; + struct user_mem_maps *user_mem_maps; + int i, ret = 0; + + vfio_cfg = get_vfio_cfg_by_container_fd(vfio_container_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, " invalid container fd!\n"); + return -1; + } + + user_mem_maps = &vfio_cfg->mem_maps; + rte_spinlock_recursive_lock(&user_mem_maps->lock); + + /* check if window size needs to be adjusted */ + memset(&param, 0, sizeof(param)); + + /* we're inside a callback so use thread-unsafe version */ + if (rte_memseg_walk_thread_unsafe(vfio_spapr_window_size_walk, + &param) < 0) { + RTE_LOG(ERR, EAL, "Could not get window size\n"); + ret = -1; + goto out; + } + + /* also check user maps */ + for (i = 0; i < user_mem_maps->n_maps; i++) { + uint64_t max = user_mem_maps->maps[i].iova + + user_mem_maps->maps[i].len; + create.window_size = RTE_MAX(create.window_size, max); + } + + /* sPAPR requires window size to be a power of 2 */ + create.window_size = rte_align64pow2(param.window_size); + create.page_shift = __builtin_ctzll(param.hugepage_sz); + create.levels = 1; + + if (do_map) { + void *addr; + /*
re-create window and remap the entire memory */ + if (iova > create.window_size) { + if (vfio_spapr_create_new_dma_window(vfio_container_fd, + &create) < 0) { + RTE_LOG(ERR, EAL, "Could not create new DMA window\n"); + ret = -1; + goto out; + } + /* we're inside a callback, so use thread-unsafe version + */ + if (rte_memseg_walk_thread_unsafe(vfio_spapr_map_walk, + &vfio_container_fd) < 0) { + RTE_LOG(ERR, EAL, "Could not recreate DMA maps\n"); + ret = -1; + goto out; + } + /* remap all user maps */ + for (i = 0; i < user_mem_maps->n_maps; i++) { + struct user_mem_map *map = + &user_mem_maps->maps[i]; + if (vfio_spapr_dma_do_map(vfio_container_fd, + map->addr, map->iova, map->len, + 1)) { + RTE_LOG(ERR, EAL, "Could not recreate user DMA maps\n"); + ret = -1; + goto out; + } + } + } + + /* now that we've remapped all of the memory that was present + * before, map the segment that we were requested to map. + * + * however, if we were called by the callback, the memory we + * were called with was already in the memseg list, so previous + * mapping should've mapped that segment already. + * + * virt2memseg_list is a relatively cheap check, so use that. if + * memory is within any memseg list, it's a memseg, so it's + * already mapped. 
+ */ + addr = (void *)(uintptr_t)vaddr; + if (rte_mem_virt2memseg_list(addr) == NULL && + vfio_spapr_dma_do_map(vfio_container_fd, + vaddr, iova, len, 1) < 0) { + RTE_LOG(ERR, EAL, "Could not map segment\n"); + ret = -1; + goto out; + } + } else { + /* for unmap, check if iova within DMA window */ + if (iova > create.window_size) { + RTE_LOG(ERR, EAL, "iova beyond DMA window for unmap"); + ret = -1; + goto out; + } + + vfio_spapr_dma_do_map(vfio_container_fd, vaddr, iova, len, 0); + } +out: + rte_spinlock_recursive_unlock(&user_mem_maps->lock); + return ret; +} + +static int +vfio_spapr_dma_map(int vfio_container_fd) +{ + struct vfio_iommu_spapr_tce_create create = { + .argsz = sizeof(create), + }; + struct spapr_walk_param param; + + memset(&param, 0, sizeof(param)); + + /* create DMA window from 0 to max(phys_addr + len) */ + rte_memseg_walk(vfio_spapr_window_size_walk, &param); + + /* sPAPR requires window size to be a power of 2 */ + create.window_size = rte_align64pow2(param.window_size); + create.page_shift = __builtin_ctzll(param.hugepage_sz); + create.levels = 1; + + if (vfio_spapr_create_new_dma_window(vfio_container_fd, &create) < 0) { + RTE_LOG(ERR, EAL, "Could not create new DMA window\n"); + return -1; + } + + /* map all DPDK segments for DMA.
use 1:1 PA to IOVA mapping */ + if (rte_memseg_walk(vfio_spapr_map_walk, &vfio_container_fd) < 0) + return -1; + + return 0; +} + +static int +vfio_noiommu_dma_map(int __rte_unused vfio_container_fd) +{ + /* No-IOMMU mode does not need DMA mapping */ + return 0; +} + +static int +vfio_noiommu_dma_mem_map(int __rte_unused vfio_container_fd, + uint64_t __rte_unused vaddr, + uint64_t __rte_unused iova, uint64_t __rte_unused len, + int __rte_unused do_map) +{ + /* No-IOMMU mode does not need DMA mapping */ + return 0; +} + +static int +vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova, + uint64_t len, int do_map) +{ + const struct vfio_iommu_type *t = vfio_cfg->vfio_iommu_type; + + if (!t) { + RTE_LOG(ERR, EAL, " VFIO support not initialized\n"); + rte_errno = ENODEV; + return -1; + } + + if (!t->dma_user_map_func) { + RTE_LOG(ERR, EAL, + " VFIO custom DMA region maping not supported by IOMMU %s\n", + t->name); + rte_errno = ENOTSUP; + return -1; + } + + return t->dma_user_map_func(vfio_cfg->vfio_container_fd, vaddr, iova, + len, do_map); +} + +static int +container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova, + uint64_t len) +{ + struct user_mem_map *new_map; + struct user_mem_maps *user_mem_maps; + int ret = 0; + + user_mem_maps = &vfio_cfg->mem_maps; + rte_spinlock_recursive_lock(&user_mem_maps->lock); + if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) { + RTE_LOG(ERR, EAL, "No more space for user mem maps\n"); + rte_errno = ENOMEM; + ret = -1; + goto out; + } + /* map the entry */ + if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 1)) { + /* technically, this will fail if there are currently no devices + * plugged in, even if a device were added later, this mapping + * might have succeeded. 
however, since we cannot verify if this + * is a valid mapping without having a device attached, consider + * this to be unsupported, because we can't just store any old + * mapping and pollute list of active mappings willy-nilly. + */ + RTE_LOG(ERR, EAL, "Couldn't map new region for DMA\n"); + ret = -1; + goto out; + } + /* create new user mem map entry */ + new_map = &user_mem_maps->maps[user_mem_maps->n_maps++]; + new_map->addr = vaddr; + new_map->iova = iova; + new_map->len = len; + + compact_user_maps(user_mem_maps); +out: + rte_spinlock_recursive_unlock(&user_mem_maps->lock); + return ret; +} + +static int +container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova, + uint64_t len) +{ + struct user_mem_map *map, *new_map = NULL; + struct user_mem_maps *user_mem_maps; + int ret = 0; + + user_mem_maps = &vfio_cfg->mem_maps; + rte_spinlock_recursive_lock(&user_mem_maps->lock); + + /* find our mapping */ + map = find_user_mem_map(user_mem_maps, vaddr, iova, len); + if (!map) { + RTE_LOG(ERR, EAL, "Couldn't find previously mapped region\n"); + rte_errno = EINVAL; + ret = -1; + goto out; + } + if (map->addr != vaddr || map->iova != iova || map->len != len) { + /* we're partially unmapping a previously mapped region, so we + * need to split entry into two. + */ + if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) { + RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n"); + rte_errno = ENOMEM; + ret = -1; + goto out; + } + new_map = &user_mem_maps->maps[user_mem_maps->n_maps++]; + } + + /* unmap the entry */ + if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 0)) { + /* there may not be any devices plugged in, so unmapping will + * fail with ENODEV/ENOTSUP rte_errno values, but that doesn't + * stop us from removing the mapping, as the assumption is we + * won't be needing this memory any more and thus will want to + * prevent it from being remapped again on hotplug. so, only + * fail if we indeed failed to unmap (e.g. 
if the mapping was + * within our mapped range but had invalid alignment). + */ + if (rte_errno != ENODEV && rte_errno != ENOTSUP) { + RTE_LOG(ERR, EAL, "Couldn't unmap region for DMA\n"); + ret = -1; + goto out; + } else { + RTE_LOG(DEBUG, EAL, "DMA unmapping failed, but removing mappings anyway\n"); + } + } + /* remove map from the list of active mappings */ + if (new_map != NULL) { + adjust_map(map, new_map, vaddr, len); + + /* if we've created a new map by splitting, sort everything */ + if (!is_null_map(new_map)) { + compact_user_maps(user_mem_maps); + } else { + /* we've created a new mapping, but it was unused */ + user_mem_maps->n_maps--; + } + } else { + memset(map, 0, sizeof(*map)); + compact_user_maps(user_mem_maps); + user_mem_maps->n_maps--; + } + +out: + rte_spinlock_recursive_unlock(&user_mem_maps->lock); + return ret; +} + +int +rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len) +{ + if (len == 0) { + rte_errno = EINVAL; + return -1; + } + + return container_dma_map(default_vfio_cfg, vaddr, iova, len); +} + +int +rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len) +{ + if (len == 0) { + rte_errno = EINVAL; + return -1; + } + + return container_dma_unmap(default_vfio_cfg, vaddr, iova, len); +} + +int +rte_vfio_noiommu_is_enabled(void) +{ + int fd; + ssize_t cnt; + char c; + + fd = open(VFIO_NOIOMMU_MODE, O_RDONLY); + if (fd < 0) { + if (errno != ENOENT) { + RTE_LOG(ERR, EAL, " cannot open vfio noiommu file %i (%s)\n", + errno, strerror(errno)); + return -1; + } + /* + * else the file does not exists + * i.e. 
noiommu is not enabled + */ + return 0; + } + + cnt = read(fd, &c, 1); + close(fd); + if (cnt != 1) { + RTE_LOG(ERR, EAL, " unable to read from vfio noiommu " + "file %i (%s)\n", errno, strerror(errno)); + return -1; + } + + return c == 'Y'; +} + +int +rte_vfio_container_create(void) +{ + int i; + + /* Find an empty slot to store new vfio config */ + for (i = 1; i < VFIO_MAX_CONTAINERS; i++) { + if (vfio_cfgs[i].vfio_container_fd == -1) + break; + } + + if (i == VFIO_MAX_CONTAINERS) { + RTE_LOG(ERR, EAL, "exceed max vfio container limit\n"); + return -1; + } + + vfio_cfgs[i].vfio_container_fd = rte_vfio_get_container_fd(); + if (vfio_cfgs[i].vfio_container_fd < 0) { + RTE_LOG(NOTICE, EAL, "fail to create a new container\n"); + return -1; + } + + return vfio_cfgs[i].vfio_container_fd; +} + +int __rte_experimental +rte_vfio_container_destroy(int container_fd) +{ + struct vfio_config *vfio_cfg; + int i; + + vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, "Invalid container fd\n"); + return -1; + } + + for (i = 0; i < VFIO_MAX_GROUPS; i++) + if (vfio_cfg->vfio_groups[i].group_num != -1) + rte_vfio_container_group_unbind(container_fd, + vfio_cfg->vfio_groups[i].group_num); + + close(container_fd); + vfio_cfg->vfio_container_fd = -1; + vfio_cfg->vfio_active_groups = 0; + vfio_cfg->vfio_iommu_type = NULL; + + return 0; +} + +int +rte_vfio_container_group_bind(int container_fd, int iommu_group_num) +{ + struct vfio_config *vfio_cfg; + + vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, "Invalid container fd\n"); + return -1; + } + + return vfio_get_group_fd(vfio_cfg, iommu_group_num); +} + +int +rte_vfio_container_group_unbind(int container_fd, int iommu_group_num) +{ + struct vfio_config *vfio_cfg; + struct vfio_group *cur_grp = NULL; + int i; + + vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, "Invalid container 
fd\n"); + return -1; + } + + for (i = 0; i < VFIO_MAX_GROUPS; i++) { + if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num) { + cur_grp = &vfio_cfg->vfio_groups[i]; + break; + } + } + + /* This should not happen */ + if (i == VFIO_MAX_GROUPS || cur_grp == NULL) { + RTE_LOG(ERR, EAL, "Specified group number not found\n"); + return -1; + } + + if (cur_grp->fd >= 0 && close(cur_grp->fd) < 0) { + RTE_LOG(ERR, EAL, "Error when closing vfio_group_fd for" + " iommu_group_num %d\n", iommu_group_num); + return -1; + } + cur_grp->group_num = -1; + cur_grp->fd = -1; + cur_grp->devices = 0; + vfio_cfg->vfio_active_groups--; + + return 0; +} + +int +rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova, + uint64_t len) +{ + struct vfio_config *vfio_cfg; + + if (len == 0) { + rte_errno = EINVAL; + return -1; + } + + vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, "Invalid container fd\n"); + return -1; + } + + return container_dma_map(vfio_cfg, vaddr, iova, len); +} + +int +rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova, + uint64_t len) +{ + struct vfio_config *vfio_cfg; + + if (len == 0) { + rte_errno = EINVAL; + return -1; + } + + vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, "Invalid container fd\n"); + return -1; + } + + return container_dma_unmap(vfio_cfg, vaddr, iova, len); +} + +#else + +int +rte_vfio_dma_map(uint64_t __rte_unused vaddr, __rte_unused uint64_t iova, + __rte_unused uint64_t len) +{ + return -1; +} + +int +rte_vfio_dma_unmap(uint64_t __rte_unused vaddr, uint64_t __rte_unused iova, + __rte_unused uint64_t len) +{ + return -1; +} + +int +rte_vfio_setup_device(__rte_unused const char *sysfs_base, + __rte_unused const char *dev_addr, + __rte_unused int *vfio_dev_fd, + __rte_unused struct vfio_device_info *device_info) +{ + return -1; +} + +int +rte_vfio_release_device(__rte_unused const char 
*sysfs_base, + __rte_unused const char *dev_addr, __rte_unused int fd) +{ + return -1; +} + +int +rte_vfio_enable(__rte_unused const char *modname) +{ + return -1; +} + +int +rte_vfio_is_enabled(__rte_unused const char *modname) +{ + return -1; +} + +int +rte_vfio_noiommu_is_enabled(void) +{ + return -1; +} + +int +rte_vfio_clear_group(__rte_unused int vfio_group_fd) +{ + return -1; +} + +int +rte_vfio_get_group_num(__rte_unused const char *sysfs_base, + __rte_unused const char *dev_addr, + __rte_unused int *iommu_group_num) +{ + return -1; +} + +int +rte_vfio_get_container_fd(void) +{ + return -1; +} + +int +rte_vfio_get_group_fd(__rte_unused int iommu_group_num) +{ + return -1; +} + +int +rte_vfio_container_create(void) +{ + return -1; +} + +int +rte_vfio_container_destroy(__rte_unused int container_fd) +{ + return -1; +} + +int +rte_vfio_container_group_bind(__rte_unused int container_fd, + __rte_unused int iommu_group_num) +{ + return -1; +} + +int +rte_vfio_container_group_unbind(__rte_unused int container_fd, + __rte_unused int iommu_group_num) +{ + return -1; +} + +int +rte_vfio_container_dma_map(__rte_unused int container_fd, + __rte_unused uint64_t vaddr, + __rte_unused uint64_t iova, + __rte_unused uint64_t len) +{ + return -1; +} + +int +rte_vfio_container_dma_unmap(__rte_unused int container_fd, + __rte_unused uint64_t vaddr, + __rte_unused uint64_t iova, + __rte_unused uint64_t len) +{ + return -1; +} + +#endif /* VFIO_PRESENT */ diff --git a/lib/librte_eal/linux/eal/eal_vfio.h b/lib/librte_eal/linux/eal/eal_vfio.h new file mode 100644 index 0000000000..cb2d35fb12 --- /dev/null +++ b/lib/librte_eal/linux/eal/eal_vfio.h @@ -0,0 +1,158 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef EAL_VFIO_H_ +#define EAL_VFIO_H_ + +#include + +/* + * determine if VFIO is present on the system + */ +#if !defined(VFIO_PRESENT) && defined(RTE_EAL_VFIO) +#include +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0) 
+#define VFIO_PRESENT +#else +#pragma message("VFIO configured but not supported by this kernel, disabling.") +#endif /* kernel version >= 3.6.0 */ +#endif /* RTE_EAL_VFIO */ + +#ifdef VFIO_PRESENT + +#include +#include + +#define RTE_VFIO_TYPE1 VFIO_TYPE1_IOMMU + +#ifndef VFIO_SPAPR_TCE_v2_IOMMU +#define RTE_VFIO_SPAPR 7 +#define VFIO_IOMMU_SPAPR_REGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 17) +#define VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 18) +#define VFIO_IOMMU_SPAPR_TCE_CREATE _IO(VFIO_TYPE, VFIO_BASE + 19) +#define VFIO_IOMMU_SPAPR_TCE_REMOVE _IO(VFIO_TYPE, VFIO_BASE + 20) + +struct vfio_iommu_spapr_register_memory { + uint32_t argsz; + uint32_t flags; + uint64_t vaddr; + uint64_t size; +}; + +struct vfio_iommu_spapr_tce_create { + uint32_t argsz; + uint32_t flags; + /* in */ + uint32_t page_shift; + uint32_t __resv1; + uint64_t window_size; + uint32_t levels; + uint32_t __resv2; + /* out */ + uint64_t start_addr; +}; + +struct vfio_iommu_spapr_tce_remove { + uint32_t argsz; + uint32_t flags; + /* in */ + uint64_t start_addr; +}; + +struct vfio_iommu_spapr_tce_ddw_info { + uint64_t pgsizes; + uint32_t max_dynamic_windows_supported; + uint32_t levels; +}; + +/* SPAPR_v2 is not present, but SPAPR might be */ +#ifndef VFIO_SPAPR_TCE_IOMMU +#define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) + +struct vfio_iommu_spapr_tce_info { + uint32_t argsz; + uint32_t flags; + uint32_t dma32_window_start; + uint32_t dma32_window_size; + struct vfio_iommu_spapr_tce_ddw_info ddw; +}; +#endif /* VFIO_SPAPR_TCE_IOMMU */ + +#else /* VFIO_SPAPR_TCE_v2_IOMMU */ +#define RTE_VFIO_SPAPR VFIO_SPAPR_TCE_v2_IOMMU +#endif + +#define VFIO_MAX_GROUPS RTE_MAX_VFIO_GROUPS +#define VFIO_MAX_CONTAINERS RTE_MAX_VFIO_CONTAINERS + +/* + * we don't need to store device fd's anywhere since they can be obtained from + * the group fd via an ioctl() call. 
+ */ +struct vfio_group { + int group_num; + int fd; + int devices; +}; + +/* DMA mapping function prototype. + * Takes VFIO container fd as a parameter. + * Returns 0 on success, -1 on error. + * */ +typedef int (*vfio_dma_func_t)(int); + +/* Custom memory region DMA mapping function prototype. + * Takes VFIO container fd, virtual address, phisical address, length and + * operation type (0 to unmap 1 for map) as a parameters. + * Returns 0 on success, -1 on error. + **/ +typedef int (*vfio_dma_user_func_t)(int fd, uint64_t vaddr, uint64_t iova, + uint64_t len, int do_map); + +struct vfio_iommu_type { + int type_id; + const char *name; + vfio_dma_user_func_t dma_user_map_func; + vfio_dma_func_t dma_map_func; +}; + +/* get the vfio container that devices are bound to by default */ +int vfio_get_default_container_fd(void); + +/* pick IOMMU type. returns a pointer to vfio_iommu_type or NULL for error */ +const struct vfio_iommu_type * +vfio_set_iommu_type(int vfio_container_fd); + +int +vfio_get_iommu_type(void); + +/* check if we have any supported extensions */ +int +vfio_has_supported_extensions(int vfio_container_fd); + +int vfio_mp_sync_setup(void); + +#define EAL_VFIO_MP "eal_vfio_mp_sync" + +#define SOCKET_REQ_CONTAINER 0x100 +#define SOCKET_REQ_GROUP 0x200 +#define SOCKET_REQ_DEFAULT_CONTAINER 0x400 +#define SOCKET_REQ_IOMMU_TYPE 0x800 +#define SOCKET_OK 0x0 +#define SOCKET_NO_FD 0x1 +#define SOCKET_ERR 0xFF + +struct vfio_mp_param { + int req; + int result; + RTE_STD_C11 + union { + int group_num; + int iommu_type_id; + }; +}; + +#endif /* VFIO_PRESENT */ + +#endif /* EAL_VFIO_H_ */ diff --git a/lib/librte_eal/linux/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linux/eal/eal_vfio_mp_sync.c new file mode 100644 index 0000000000..2a47f29d5a --- /dev/null +++ b/lib/librte_eal/linux/eal/eal_vfio_mp_sync.c @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation + */ + +#include +#include + +#include +#include +#include 
+#include + +#include "eal_vfio.h" + +/** + * @file + * VFIO socket for communication between primary and secondary processes. + * + * This file is only compiled if CONFIG_RTE_EAL_VFIO is set to "y". + */ + +#ifdef VFIO_PRESENT + +static int +vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer) +{ + int fd = -1; + int ret; + struct rte_mp_msg reply; + struct vfio_mp_param *r = (struct vfio_mp_param *)reply.param; + const struct vfio_mp_param *m = + (const struct vfio_mp_param *)msg->param; + + if (msg->len_param != sizeof(*m)) { + RTE_LOG(ERR, EAL, "vfio received invalid message!\n"); + return -1; + } + + memset(&reply, 0, sizeof(reply)); + + switch (m->req) { + case SOCKET_REQ_GROUP: + r->req = SOCKET_REQ_GROUP; + r->group_num = m->group_num; + fd = rte_vfio_get_group_fd(m->group_num); + if (fd < 0) + r->result = SOCKET_ERR; + else if (fd == 0) + /* if VFIO group exists but isn't bound to VFIO driver */ + r->result = SOCKET_NO_FD; + else { + /* if group exists and is bound to VFIO driver */ + r->result = SOCKET_OK; + reply.num_fds = 1; + reply.fds[0] = fd; + } + break; + case SOCKET_REQ_CONTAINER: + r->req = SOCKET_REQ_CONTAINER; + fd = rte_vfio_get_container_fd(); + if (fd < 0) + r->result = SOCKET_ERR; + else { + r->result = SOCKET_OK; + reply.num_fds = 1; + reply.fds[0] = fd; + } + break; + case SOCKET_REQ_DEFAULT_CONTAINER: + r->req = SOCKET_REQ_DEFAULT_CONTAINER; + fd = vfio_get_default_container_fd(); + if (fd < 0) + r->result = SOCKET_ERR; + else { + r->result = SOCKET_OK; + reply.num_fds = 1; + reply.fds[0] = fd; + } + break; + case SOCKET_REQ_IOMMU_TYPE: + { + int iommu_type_id; + + r->req = SOCKET_REQ_IOMMU_TYPE; + + iommu_type_id = vfio_get_iommu_type(); + + if (iommu_type_id < 0) + r->result = SOCKET_ERR; + else { + r->iommu_type_id = iommu_type_id; + r->result = SOCKET_OK; + } + break; + } + default: + RTE_LOG(ERR, EAL, "vfio received invalid message!\n"); + return -1; + } + + strcpy(reply.name, EAL_VFIO_MP); + reply.len_param = sizeof(*r); 
+ + ret = rte_mp_reply(&reply, peer); + if (m->req == SOCKET_REQ_CONTAINER && fd >= 0) + close(fd); + return ret; +} + +int +vfio_mp_sync_setup(void) +{ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) + return rte_mp_action_register(EAL_VFIO_MP, vfio_mp_primary); + + return 0; +} + +#endif diff --git a/lib/librte_eal/linux/eal/include/exec-env/rte_kni_common.h b/lib/librte_eal/linux/eal/include/exec-env/rte_kni_common.h new file mode 100644 index 0000000000..5afa087131 --- /dev/null +++ b/lib/librte_eal/linux/eal/include/exec-env/rte_kni_common.h @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: (BSD-3-Clause OR LGPL-2.1) */ +/* + * Copyright(c) 2007-2014 Intel Corporation. + */ + +#ifndef _RTE_KNI_COMMON_H_ +#define _RTE_KNI_COMMON_H_ + +#ifdef __KERNEL__ +#include +#include +#define RTE_STD_C11 +#else +#include +#include +#endif + +/** + * KNI name is part of memzone name. + */ +#define RTE_KNI_NAMESIZE 32 + +#define RTE_CACHE_LINE_MIN_SIZE 64 + +/* + * Request id. + */ +enum rte_kni_req_id { + RTE_KNI_REQ_UNKNOWN = 0, + RTE_KNI_REQ_CHANGE_MTU, + RTE_KNI_REQ_CFG_NETWORK_IF, + RTE_KNI_REQ_CHANGE_MAC_ADDR, + RTE_KNI_REQ_CHANGE_PROMISC, + RTE_KNI_REQ_MAX, +}; + +/* + * Structure for KNI request. + */ +struct rte_kni_request { + uint32_t req_id; /**< Request id */ + RTE_STD_C11 + union { + uint32_t new_mtu; /**< New MTU */ + uint8_t if_up; /**< 1: interface up, 0: interface down */ + uint8_t mac_addr[6]; /**< MAC address for interface */ + uint8_t promiscusity;/**< 1: promisc mode enable, 0: disable */ + }; + int32_t result; /**< Result for processing request */ +} __attribute__((__packed__)); + +/* + * Fifo struct mapped in a shared memory. It describes a circular buffer FIFO + * Write and read should wrap around. 
Fifo is empty when write == read + * Writing should never overwrite the read position + */ +struct rte_kni_fifo { +#ifdef RTE_USE_C11_MEM_MODEL + unsigned write; /**< Next position to be written*/ + unsigned read; /**< Next position to be read */ +#else + volatile unsigned write; /**< Next position to be written*/ + volatile unsigned read; /**< Next position to be read */ +#endif + unsigned len; /**< Circular buffer length */ + unsigned elem_size; /**< Pointer size - for 32/64 bit OS */ + void *volatile buffer[]; /**< The buffer contains mbuf pointers */ +}; + +/* + * The kernel image of the rte_mbuf struct, with only the relevant fields. + * Padding is necessary to assure the offsets of these fields + */ +struct rte_kni_mbuf { + void *buf_addr __attribute__((__aligned__(RTE_CACHE_LINE_SIZE))); + uint64_t buf_physaddr; + uint16_t data_off; /**< Start address of data in segment buffer. */ + char pad1[2]; + uint16_t nb_segs; /**< Number of segments. */ + char pad4[2]; + uint64_t ol_flags; /**< Offload features. */ + char pad2[4]; + uint32_t pkt_len; /**< Total pkt len: sum of all segment data_len. */ + uint16_t data_len; /**< Amount of data in segment buffer. */ + + /* fields on second cache line */ + char pad3[8] __attribute__((__aligned__(RTE_CACHE_LINE_MIN_SIZE))); + void *pool; + void *next; +}; + +/* + * Struct used to create a KNI device. Passed to the kernel in IOCTL call + */ + +struct rte_kni_device_info { + char name[RTE_KNI_NAMESIZE]; /**< Network device name for KNI */ + + phys_addr_t tx_phys; + phys_addr_t rx_phys; + phys_addr_t alloc_phys; + phys_addr_t free_phys; + + /* Used by Ethtool */ + phys_addr_t req_phys; + phys_addr_t resp_phys; + phys_addr_t sync_phys; + void * sync_va; + + /* mbuf mempool */ + void * mbuf_va; + phys_addr_t mbuf_phys; + + /* PCI info */ + uint16_t vendor_id; /**< Vendor ID or PCI_ANY_ID. */ + uint16_t device_id; /**< Device ID or PCI_ANY_ID. 
*/ + uint8_t bus; /**< Device bus */ + uint8_t devid; /**< Device ID */ + uint8_t function; /**< Device function. */ + + uint16_t group_id; /**< Group ID */ + uint32_t core_id; /**< core ID to bind for kernel thread */ + + __extension__ + uint8_t force_bind : 1; /**< Flag for kernel thread binding */ + + /* mbuf size */ + unsigned mbuf_size; + unsigned int mtu; + char mac_addr[6]; +}; + +#define KNI_DEVICE "kni" + +#define RTE_KNI_IOCTL_TEST _IOWR(0, 1, int) +#define RTE_KNI_IOCTL_CREATE _IOWR(0, 2, struct rte_kni_device_info) +#define RTE_KNI_IOCTL_RELEASE _IOWR(0, 3, struct rte_kni_device_info) + +#endif /* _RTE_KNI_COMMON_H_ */ diff --git a/lib/librte_eal/linux/eal/meson.build b/lib/librte_eal/linux/eal/meson.build new file mode 100644 index 0000000000..7e68b2c0dd --- /dev/null +++ b/lib/librte_eal/linux/eal/meson.build @@ -0,0 +1,29 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +eal_inc += include_directories('include') +install_subdir('include/exec-env', install_dir: get_option('includedir')) + +env_objs = [] +env_headers = [] +env_sources = files('eal_alarm.c', + 'eal_cpuflags.c', + 'eal_debug.c', + 'eal_hugepage_info.c', + 'eal_interrupts.c', + 'eal_memalloc.c', + 'eal_lcore.c', + 'eal_log.c', + 'eal_thread.c', + 'eal_timer.c', + 'eal_vfio.c', + 'eal_vfio_mp_sync.c', + 'eal.c', + 'eal_memory.c', + 'eal_dev.c', +) + +deps += ['kvargs'] +if has_libnuma == 1 + dpdk_conf.set10('RTE_EAL_NUMA_AWARE_HUGEPAGES', true) +endif diff --git a/lib/librte_eal/linuxapp/Makefile b/lib/librte_eal/linuxapp/Makefile deleted file mode 100644 index a0fffa98e8..0000000000 --- a/lib/librte_eal/linuxapp/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -# SPDX-License-Identifier: BSD-3-Clause -# Copyright(c) 2010-2014 Intel Corporation - -include $(RTE_SDK)/mk/rte.vars.mk - -DIRS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal -DEPDIRS-kni := eal - -CFLAGS += -DALLOW_EXPERIMENTAL_API - -include $(RTE_SDK)/mk/rte.subdir.mk diff --git 
a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile deleted file mode 100644 index 51deb57974..0000000000 --- a/lib/librte_eal/linuxapp/eal/Makefile +++ /dev/null @@ -1,101 +0,0 @@ -# SPDX-License-Identifier: BSD-3-Clause -# Copyright(c) 2010-2016 Intel Corporation - -include $(RTE_SDK)/mk/rte.vars.mk - -LIB = librte_eal.a - -ARCH_DIR ?= $(RTE_ARCH) - -EXPORT_MAP := ../../rte_eal_version.map -VPATH += $(RTE_SDK)/lib/librte_eal/common/arch/$(ARCH_DIR) - -LIBABIVER := 9 - -VPATH += $(RTE_SDK)/lib/librte_eal/common - -CFLAGS += -DALLOW_EXPERIMENTAL_API -CFLAGS += -I$(SRCDIR)/include -CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common -CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common/include -CFLAGS += $(WERROR_FLAGS) -O3 - -LDLIBS += -ldl -LDLIBS += -lpthread -LDLIBS += -lgcc_s -LDLIBS += -lrt -LDLIBS += -lrte_kvargs -ifeq ($(CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES),y) -LDLIBS += -lnuma -endif - -# specific to linuxapp exec-env -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) := eal.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_cpuflags.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_hugepage_info.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_memory.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_thread.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_log.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_vfio.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_vfio_mp_sync.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_memalloc.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_debug.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_lcore.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_timer.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_interrupts.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_alarm.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_dev.c - -# from common dir -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_lcore.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_timer.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memzone.c 
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_log.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_launch.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memalloc.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memory.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_tailqs.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_errno.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_cpuflags.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_hypervisor.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_string_fns.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_hexdump.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_devargs.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_class.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_bus.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_dev.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_options.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_thread.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_proc.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_fbarray.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_uuid.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_malloc.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += hotplug_mp.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_elem.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_heap.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_mp.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_keepalive.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_option.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_service.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_reciprocal.c - -# from arch dir -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_cpuflags.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_hypervisor.c -SRCS-$(CONFIG_RTE_ARCH_X86) += rte_spinlock.c -SRCS-y += rte_cycles.c - -CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST) - -# workaround for a gcc bug with noreturn attribute -# 
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603 -ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y) -CFLAGS_eal_thread.o += -Wno-return-type -endif - -INC := rte_kni_common.h - -SYMLINK-$(CONFIG_RTE_EXEC_ENV_LINUXAPP)-include/exec-env := \ - $(addprefix include/exec-env/,$(INC)) - -include $(RTE_SDK)/mk/rte.lib.mk diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c deleted file mode 100644 index 13f4016841..0000000000 --- a/lib/librte_eal/linuxapp/eal/eal.c +++ /dev/null @@ -1,1336 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2018 Intel Corporation. - * Copyright(c) 2012-2014 6WIND S.A. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if defined(RTE_ARCH_X86) -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "eal_private.h" -#include "eal_thread.h" -#include "eal_internal_cfg.h" -#include "eal_filesystem.h" -#include "eal_hugepages.h" -#include "eal_options.h" -#include "eal_vfio.h" - -#define MEMSIZE_IF_NO_HUGE_PAGE (64ULL * 1024ULL * 1024ULL) - -#define SOCKET_MEM_STRLEN (RTE_MAX_NUMA_NODES * 10) - -/* Allow the application to print its usage message too if set */ -static rte_usage_hook_t rte_application_usage_hook = NULL; - -/* early configuration structure, when memory config is not mmapped */ -static struct rte_mem_config early_mem_config; - -/* define fd variable here, because file needs to be kept open for the - * duration of the program, as we hold a write lock on it in the primary proc */ -static int mem_cfg_fd = -1; - -static struct flock wr_lock = { - .l_type = F_WRLCK, - .l_whence = SEEK_SET, - .l_start = offsetof(struct 
rte_mem_config, memsegs), - .l_len = sizeof(early_mem_config.memsegs), -}; - -/* Address of global and public configuration */ -static struct rte_config rte_config = { - .mem_config = &early_mem_config, -}; - -/* internal configuration (per-core) */ -struct lcore_config lcore_config[RTE_MAX_LCORE]; - -/* internal configuration */ -struct internal_config internal_config; - -/* used by rte_rdtsc() */ -int rte_cycles_vmware_tsc_map; - -/* platform-specific runtime dir */ -static char runtime_dir[PATH_MAX]; - -static const char *default_runtime_dir = "/var/run"; - -int -eal_create_runtime_dir(void) -{ - const char *directory = default_runtime_dir; - const char *xdg_runtime_dir = getenv("XDG_RUNTIME_DIR"); - const char *fallback = "/tmp"; - char tmp[PATH_MAX]; - int ret; - - if (getuid() != 0) { - /* try XDG path first, fall back to /tmp */ - if (xdg_runtime_dir != NULL) - directory = xdg_runtime_dir; - else - directory = fallback; - } - /* create DPDK subdirectory under runtime dir */ - ret = snprintf(tmp, sizeof(tmp), "%s/dpdk", directory); - if (ret < 0 || ret == sizeof(tmp)) { - RTE_LOG(ERR, EAL, "Error creating DPDK runtime path name\n"); - return -1; - } - - /* create prefix-specific subdirectory under DPDK runtime dir */ - ret = snprintf(runtime_dir, sizeof(runtime_dir), "%s/%s", - tmp, eal_get_hugefile_prefix()); - if (ret < 0 || ret == sizeof(runtime_dir)) { - RTE_LOG(ERR, EAL, "Error creating prefix-specific runtime path name\n"); - return -1; - } - - /* create the path if it doesn't exist. no "mkdir -p" here, so do it - * step by step. 
- */ - ret = mkdir(tmp, 0700); - if (ret < 0 && errno != EEXIST) { - RTE_LOG(ERR, EAL, "Error creating '%s': %s\n", - tmp, strerror(errno)); - return -1; - } - - ret = mkdir(runtime_dir, 0700); - if (ret < 0 && errno != EEXIST) { - RTE_LOG(ERR, EAL, "Error creating '%s': %s\n", - runtime_dir, strerror(errno)); - return -1; - } - - return 0; -} - -int -eal_clean_runtime_dir(void) -{ - DIR *dir; - struct dirent *dirent; - int dir_fd, fd, lck_result; - static const char * const filters[] = { - "fbarray_*", - "mp_socket_*" - }; - - /* open directory */ - dir = opendir(runtime_dir); - if (!dir) { - RTE_LOG(ERR, EAL, "Unable to open runtime directory %s\n", - runtime_dir); - goto error; - } - dir_fd = dirfd(dir); - - /* lock the directory before doing anything, to avoid races */ - if (flock(dir_fd, LOCK_EX) < 0) { - RTE_LOG(ERR, EAL, "Unable to lock runtime directory %s\n", - runtime_dir); - goto error; - } - - dirent = readdir(dir); - if (!dirent) { - RTE_LOG(ERR, EAL, "Unable to read runtime directory %s\n", - runtime_dir); - goto error; - } - - while (dirent != NULL) { - unsigned int f_idx; - bool skip = true; - - /* skip files that don't match the patterns */ - for (f_idx = 0; f_idx < RTE_DIM(filters); f_idx++) { - const char *filter = filters[f_idx]; - - if (fnmatch(filter, dirent->d_name, 0) == 0) { - skip = false; - break; - } - } - if (skip) { - dirent = readdir(dir); - continue; - } - - /* try and lock the file */ - fd = openat(dir_fd, dirent->d_name, O_RDONLY); - - /* skip to next file */ - if (fd == -1) { - dirent = readdir(dir); - continue; - } - - /* non-blocking lock */ - lck_result = flock(fd, LOCK_EX | LOCK_NB); - - /* if lock succeeds, remove the file */ - if (lck_result != -1) - unlinkat(dir_fd, dirent->d_name, 0); - close(fd); - dirent = readdir(dir); - } - - /* closedir closes dir_fd and drops the lock */ - closedir(dir); - return 0; - -error: - if (dir) - closedir(dir); - - RTE_LOG(ERR, EAL, "Error while clearing runtime dir: %s\n", - 
strerror(errno)); - - return -1; -} - -const char * -rte_eal_get_runtime_dir(void) -{ - return runtime_dir; -} - -/* Return user provided mbuf pool ops name */ -const char * -rte_eal_mbuf_user_pool_ops(void) -{ - return internal_config.user_mbuf_pool_ops_name; -} - -/* Return a pointer to the configuration structure */ -struct rte_config * -rte_eal_get_configuration(void) -{ - return &rte_config; -} - -enum rte_iova_mode -rte_eal_iova_mode(void) -{ - return rte_eal_get_configuration()->iova_mode; -} - -/* parse a sysfs (or other) file containing one integer value */ -int -eal_parse_sysfs_value(const char *filename, unsigned long *val) -{ - FILE *f; - char buf[BUFSIZ]; - char *end = NULL; - - if ((f = fopen(filename, "r")) == NULL) { - RTE_LOG(ERR, EAL, "%s(): cannot open sysfs value %s\n", - __func__, filename); - return -1; - } - - if (fgets(buf, sizeof(buf), f) == NULL) { - RTE_LOG(ERR, EAL, "%s(): cannot read sysfs value %s\n", - __func__, filename); - fclose(f); - return -1; - } - *val = strtoul(buf, &end, 0); - if ((buf[0] == '\0') || (end == NULL) || (*end != '\n')) { - RTE_LOG(ERR, EAL, "%s(): cannot parse sysfs value %s\n", - __func__, filename); - fclose(f); - return -1; - } - fclose(f); - return 0; -} - - -/* create memory configuration in shared/mmap memory. Take out - * a write lock on the memsegs, so we can auto-detect primary/secondary. - * This means we never close the file while running (auto-close on exit). - * We also don't lock the whole file, so that in future we can use read-locks - * on other parts, e.g. memzones, to detect if there are running secondary - * processes. 
*/ -static void -rte_eal_config_create(void) -{ - void *rte_mem_cfg_addr; - int retval; - - const char *pathname = eal_runtime_config_path(); - - if (internal_config.no_shconf) - return; - - /* map the config before hugepage address so that we don't waste a page */ - if (internal_config.base_virtaddr != 0) - rte_mem_cfg_addr = (void *) - RTE_ALIGN_FLOOR(internal_config.base_virtaddr - - sizeof(struct rte_mem_config), sysconf(_SC_PAGE_SIZE)); - else - rte_mem_cfg_addr = NULL; - - if (mem_cfg_fd < 0){ - mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0660); - if (mem_cfg_fd < 0) - rte_panic("Cannot open '%s' for rte_mem_config\n", pathname); - } - - retval = ftruncate(mem_cfg_fd, sizeof(*rte_config.mem_config)); - if (retval < 0){ - close(mem_cfg_fd); - rte_panic("Cannot resize '%s' for rte_mem_config\n", pathname); - } - - retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock); - if (retval < 0){ - close(mem_cfg_fd); - rte_exit(EXIT_FAILURE, "Cannot create lock on '%s'. Is another primary " - "process running?\n", pathname); - } - - rte_mem_cfg_addr = mmap(rte_mem_cfg_addr, sizeof(*rte_config.mem_config), - PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0); - - if (rte_mem_cfg_addr == MAP_FAILED){ - rte_panic("Cannot mmap memory for rte_config\n"); - } - memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config)); - rte_config.mem_config = rte_mem_cfg_addr; - - /* store address of the config in the config itself so that secondary - * processes could later map the config into this exact location */ - rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr; - - rte_config.mem_config->dma_maskbits = 0; - -} - -/* attach to an existing shared memory config */ -static void -rte_eal_config_attach(void) -{ - struct rte_mem_config *mem_config; - - const char *pathname = eal_runtime_config_path(); - - if (internal_config.no_shconf) - return; - - if (mem_cfg_fd < 0){ - mem_cfg_fd = open(pathname, O_RDWR); - if (mem_cfg_fd < 0) - rte_panic("Cannot open '%s' for 
rte_mem_config\n", pathname); - } - - /* map it as read-only first */ - mem_config = (struct rte_mem_config *) mmap(NULL, sizeof(*mem_config), - PROT_READ, MAP_SHARED, mem_cfg_fd, 0); - if (mem_config == MAP_FAILED) - rte_panic("Cannot mmap memory for rte_config! error %i (%s)\n", - errno, strerror(errno)); - - rte_config.mem_config = mem_config; -} - -/* reattach the shared config at exact memory location primary process has it */ -static void -rte_eal_config_reattach(void) -{ - struct rte_mem_config *mem_config; - void *rte_mem_cfg_addr; - - if (internal_config.no_shconf) - return; - - /* save the address primary process has mapped shared config to */ - rte_mem_cfg_addr = (void *) (uintptr_t) rte_config.mem_config->mem_cfg_addr; - - /* unmap original config */ - munmap(rte_config.mem_config, sizeof(struct rte_mem_config)); - - /* remap the config at proper address */ - mem_config = (struct rte_mem_config *) mmap(rte_mem_cfg_addr, - sizeof(*mem_config), PROT_READ | PROT_WRITE, MAP_SHARED, - mem_cfg_fd, 0); - if (mem_config == MAP_FAILED || mem_config != rte_mem_cfg_addr) { - if (mem_config != MAP_FAILED) - /* errno is stale, don't use */ - rte_panic("Cannot mmap memory for rte_config at [%p], got [%p]" - " - please use '--base-virtaddr' option\n", - rte_mem_cfg_addr, mem_config); - else - rte_panic("Cannot mmap memory for rte_config! error %i (%s)\n", - errno, strerror(errno)); - } - close(mem_cfg_fd); - - rte_config.mem_config = mem_config; -} - -/* Detect if we are a primary or a secondary process */ -enum rte_proc_type_t -eal_proc_type_detect(void) -{ - enum rte_proc_type_t ptype = RTE_PROC_PRIMARY; - const char *pathname = eal_runtime_config_path(); - - /* if there no shared config, there can be no secondary processes */ - if (!internal_config.no_shconf) { - /* if we can open the file but not get a write-lock we are a - * secondary process. 
NOTE: if we get a file handle back, we - * keep that open and don't close it to prevent a race condition - * between multiple opens. - */ - if (((mem_cfg_fd = open(pathname, O_RDWR)) >= 0) && - (fcntl(mem_cfg_fd, F_SETLK, &wr_lock) < 0)) - ptype = RTE_PROC_SECONDARY; - } - - RTE_LOG(INFO, EAL, "Auto-detected process type: %s\n", - ptype == RTE_PROC_PRIMARY ? "PRIMARY" : "SECONDARY"); - - return ptype; -} - -/* copies data from internal config to shared config */ -static void -eal_update_mem_config(void) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - mcfg->legacy_mem = internal_config.legacy_mem; - mcfg->single_file_segments = internal_config.single_file_segments; -} - -/* copies data from shared config to internal config */ -static void -eal_update_internal_config(void) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - internal_config.legacy_mem = mcfg->legacy_mem; - internal_config.single_file_segments = mcfg->single_file_segments; -} - -/* Sets up rte_config structure with the pointer to shared memory config.*/ -static void -rte_config_init(void) -{ - rte_config.process_type = internal_config.process_type; - - switch (rte_config.process_type){ - case RTE_PROC_PRIMARY: - rte_eal_config_create(); - eal_update_mem_config(); - break; - case RTE_PROC_SECONDARY: - rte_eal_config_attach(); - rte_eal_mcfg_wait_complete(rte_config.mem_config); - rte_eal_config_reattach(); - eal_update_internal_config(); - break; - case RTE_PROC_AUTO: - case RTE_PROC_INVALID: - rte_panic("Invalid process type\n"); - } -} - -/* Unlocks hugepage directories that were locked by eal_hugepage_info_init */ -static void -eal_hugedirs_unlock(void) -{ - int i; - - for (i = 0; i < MAX_HUGEPAGE_SIZES; i++) - { - /* skip uninitialized */ - if (internal_config.hugepage_info[i].lock_descriptor < 0) - continue; - /* unlock hugepage file */ - flock(internal_config.hugepage_info[i].lock_descriptor, LOCK_UN); - 
close(internal_config.hugepage_info[i].lock_descriptor); - /* reset the field */ - internal_config.hugepage_info[i].lock_descriptor = -1; - } -} - -/* display usage */ -static void -eal_usage(const char *prgname) -{ - printf("\nUsage: %s ", prgname); - eal_common_usage(); - printf("EAL Linux options:\n" - " --"OPT_SOCKET_MEM" Memory to allocate on sockets (comma separated values)\n" - " --"OPT_SOCKET_LIMIT" Limit memory allocation on sockets (comma separated values)\n" - " --"OPT_HUGE_DIR" Directory where hugetlbfs is mounted\n" - " --"OPT_FILE_PREFIX" Prefix for hugepage filenames\n" - " --"OPT_BASE_VIRTADDR" Base virtual address\n" - " --"OPT_CREATE_UIO_DEV" Create /dev/uioX (usually done by hotplug)\n" - " --"OPT_VFIO_INTR" Interrupt mode for VFIO (legacy|msi|msix)\n" - " --"OPT_LEGACY_MEM" Legacy memory mode (no dynamic allocation, contiguous segments)\n" - " --"OPT_SINGLE_FILE_SEGMENTS" Put all hugepage memory in single files\n" - " --"OPT_MATCH_ALLOCATIONS" Free hugepages exactly as allocated\n" - "\n"); - /* Allow the application to print its usage message too if hook is set */ - if ( rte_application_usage_hook ) { - printf("===== Application Usage =====\n\n"); - rte_application_usage_hook(prgname); - } -} - -/* Set a per-application usage message */ -rte_usage_hook_t -rte_set_application_usage_hook( rte_usage_hook_t usage_func ) -{ - rte_usage_hook_t old_func; - - /* Will be NULL on the first call to denote the last usage routine. 
*/ - old_func = rte_application_usage_hook; - rte_application_usage_hook = usage_func; - - return old_func; -} - -static int -eal_parse_socket_arg(char *strval, volatile uint64_t *socket_arg) -{ - char * arg[RTE_MAX_NUMA_NODES]; - char *end; - int arg_num, i, len; - uint64_t total_mem = 0; - - len = strnlen(strval, SOCKET_MEM_STRLEN); - if (len == SOCKET_MEM_STRLEN) { - RTE_LOG(ERR, EAL, "--socket-mem is too long\n"); - return -1; - } - - /* all other error cases will be caught later */ - if (!isdigit(strval[len-1])) - return -1; - - /* split the optarg into separate socket values */ - arg_num = rte_strsplit(strval, len, - arg, RTE_MAX_NUMA_NODES, ','); - - /* if split failed, or 0 arguments */ - if (arg_num <= 0) - return -1; - - /* parse each defined socket option */ - errno = 0; - for (i = 0; i < arg_num; i++) { - uint64_t val; - end = NULL; - val = strtoull(arg[i], &end, 10); - - /* check for invalid input */ - if ((errno != 0) || - (arg[i][0] == '\0') || (end == NULL) || (*end != '\0')) - return -1; - val <<= 20; - total_mem += val; - socket_arg[i] = val; - } - - return 0; -} - -static int -eal_parse_base_virtaddr(const char *arg) -{ - char *end; - uint64_t addr; - - errno = 0; - addr = strtoull(arg, &end, 16); - - /* check for errors */ - if ((errno != 0) || (arg[0] == '\0') || end == NULL || (*end != '\0')) - return -1; - - /* make sure we don't exceed 32-bit boundary on 32-bit target */ -#ifndef RTE_ARCH_64 - if (addr >= UINTPTR_MAX) - return -1; -#endif - - /* align the addr on 16M boundary, 16MB is the minimum huge page - * size on IBM Power architecture. If the addr is aligned to 16MB, - * it can align to 2MB for x86. 
So this alignment can also be used - * on x86 */ - internal_config.base_virtaddr = - RTE_PTR_ALIGN_CEIL((uintptr_t)addr, (size_t)RTE_PGSIZE_16M); - - return 0; -} - -static int -eal_parse_vfio_intr(const char *mode) -{ - unsigned i; - static struct { - const char *name; - enum rte_intr_mode value; - } map[] = { - { "legacy", RTE_INTR_MODE_LEGACY }, - { "msi", RTE_INTR_MODE_MSI }, - { "msix", RTE_INTR_MODE_MSIX }, - }; - - for (i = 0; i < RTE_DIM(map); i++) { - if (!strcmp(mode, map[i].name)) { - internal_config.vfio_intr_mode = map[i].value; - return 0; - } - } - return -1; -} - -/* Parse the arguments for --log-level only */ -static void -eal_log_level_parse(int argc, char **argv) -{ - int opt; - char **argvopt; - int option_index; - const int old_optind = optind; - const int old_optopt = optopt; - char * const old_optarg = optarg; - - argvopt = argv; - optind = 1; - - while ((opt = getopt_long(argc, argvopt, eal_short_options, - eal_long_options, &option_index)) != EOF) { - - int ret; - - /* getopt is not happy, stop right now */ - if (opt == '?') - break; - - ret = (opt == OPT_LOG_LEVEL_NUM) ? 
- eal_parse_common_option(opt, optarg, &internal_config) : 0; - - /* common parser is not happy */ - if (ret < 0) - break; - } - - /* restore getopt lib */ - optind = old_optind; - optopt = old_optopt; - optarg = old_optarg; -} - -/* Parse the argument given in the command line of the application */ -static int -eal_parse_args(int argc, char **argv) -{ - int opt, ret; - char **argvopt; - int option_index; - char *prgname = argv[0]; - const int old_optind = optind; - const int old_optopt = optopt; - char * const old_optarg = optarg; - - argvopt = argv; - optind = 1; - opterr = 0; - - while ((opt = getopt_long(argc, argvopt, eal_short_options, - eal_long_options, &option_index)) != EOF) { - - /* - * getopt didn't recognise the option, lets parse the - * registered options to see if the flag is valid - */ - if (opt == '?') { - ret = rte_option_parse(argv[optind-1]); - if (ret == 0) - continue; - - eal_usage(prgname); - ret = -1; - goto out; - } - - ret = eal_parse_common_option(opt, optarg, &internal_config); - /* common parser is not happy */ - if (ret < 0) { - eal_usage(prgname); - ret = -1; - goto out; - } - /* common parser handled this option */ - if (ret == 0) - continue; - - switch (opt) { - case 'h': - eal_usage(prgname); - exit(EXIT_SUCCESS); - - case OPT_HUGE_DIR_NUM: - { - char *hdir = strdup(optarg); - if (hdir == NULL) - RTE_LOG(ERR, EAL, "Could not store hugepage directory\n"); - else { - /* free old hugepage dir */ - if (internal_config.hugepage_dir != NULL) - free(internal_config.hugepage_dir); - internal_config.hugepage_dir = hdir; - } - break; - } - case OPT_FILE_PREFIX_NUM: - { - char *prefix = strdup(optarg); - if (prefix == NULL) - RTE_LOG(ERR, EAL, "Could not store file prefix\n"); - else { - /* free old prefix */ - if (internal_config.hugefile_prefix != NULL) - free(internal_config.hugefile_prefix); - internal_config.hugefile_prefix = prefix; - } - break; - } - case OPT_SOCKET_MEM_NUM: - if (eal_parse_socket_arg(optarg, - 
internal_config.socket_mem) < 0) { - RTE_LOG(ERR, EAL, "invalid parameters for --" - OPT_SOCKET_MEM "\n"); - eal_usage(prgname); - ret = -1; - goto out; - } - internal_config.force_sockets = 1; - break; - - case OPT_SOCKET_LIMIT_NUM: - if (eal_parse_socket_arg(optarg, - internal_config.socket_limit) < 0) { - RTE_LOG(ERR, EAL, "invalid parameters for --" - OPT_SOCKET_LIMIT "\n"); - eal_usage(prgname); - ret = -1; - goto out; - } - internal_config.force_socket_limits = 1; - break; - - case OPT_BASE_VIRTADDR_NUM: - if (eal_parse_base_virtaddr(optarg) < 0) { - RTE_LOG(ERR, EAL, "invalid parameter for --" - OPT_BASE_VIRTADDR "\n"); - eal_usage(prgname); - ret = -1; - goto out; - } - break; - - case OPT_VFIO_INTR_NUM: - if (eal_parse_vfio_intr(optarg) < 0) { - RTE_LOG(ERR, EAL, "invalid parameters for --" - OPT_VFIO_INTR "\n"); - eal_usage(prgname); - ret = -1; - goto out; - } - break; - - case OPT_CREATE_UIO_DEV_NUM: - internal_config.create_uio_dev = 1; - break; - - case OPT_MBUF_POOL_OPS_NAME_NUM: - { - char *ops_name = strdup(optarg); - if (ops_name == NULL) - RTE_LOG(ERR, EAL, "Could not store mbuf pool ops name\n"); - else { - /* free old ops name */ - if (internal_config.user_mbuf_pool_ops_name != - NULL) - free(internal_config.user_mbuf_pool_ops_name); - - internal_config.user_mbuf_pool_ops_name = - ops_name; - } - break; - } - case OPT_MATCH_ALLOCATIONS_NUM: - internal_config.match_allocations = 1; - break; - - default: - if (opt < OPT_LONG_MIN_NUM && isprint(opt)) { - RTE_LOG(ERR, EAL, "Option %c is not supported " - "on Linux\n", opt); - } else if (opt >= OPT_LONG_MIN_NUM && - opt < OPT_LONG_MAX_NUM) { - RTE_LOG(ERR, EAL, "Option %s is not supported " - "on Linux\n", - eal_long_options[option_index].name); - } else { - RTE_LOG(ERR, EAL, "Option %d is not supported " - "on Linux\n", opt); - } - eal_usage(prgname); - ret = -1; - goto out; - } - } - - /* create runtime data directory */ - if (internal_config.no_shconf == 0 && - eal_create_runtime_dir() < 0) { - 
RTE_LOG(ERR, EAL, "Cannot create runtime directory\n"); - ret = -1; - goto out; - } - - if (eal_adjust_config(&internal_config) != 0) { - ret = -1; - goto out; - } - - /* sanity checks */ - if (eal_check_common_options(&internal_config) != 0) { - eal_usage(prgname); - ret = -1; - goto out; - } - - if (optind >= 0) - argv[optind-1] = prgname; - ret = optind-1; - -out: - /* restore getopt lib */ - optind = old_optind; - optopt = old_optopt; - optarg = old_optarg; - - return ret; -} - -static int -check_socket(const struct rte_memseg_list *msl, void *arg) -{ - int *socket_id = arg; - - if (msl->external) - return 0; - - return *socket_id == msl->socket_id; -} - -static void -eal_check_mem_on_local_socket(void) -{ - int socket_id; - - socket_id = rte_lcore_to_socket_id(rte_config.master_lcore); - - if (rte_memseg_list_walk(check_socket, &socket_id) == 0) - RTE_LOG(WARNING, EAL, "WARNING: Master core has no memory on local socket!\n"); -} - -static int -sync_func(__attribute__((unused)) void *arg) -{ - return 0; -} - -inline static void -rte_eal_mcfg_complete(void) -{ - /* ALL shared mem_config related INIT DONE */ - if (rte_config.process_type == RTE_PROC_PRIMARY) - rte_config.mem_config->magic = RTE_MAGIC; - - internal_config.init_complete = 1; -} - -/* - * Request iopl privilege for all RPL, returns 0 on success - * iopl() call is mostly for the i386 architecture. For other architectures, - * return -1 to indicate IO privilege can't be changed in this way. - */ -int -rte_eal_iopl_init(void) -{ -#if defined(RTE_ARCH_X86) - if (iopl(3) != 0) - return -1; -#endif - return 0; -} - -#ifdef VFIO_PRESENT -static int rte_eal_vfio_setup(void) -{ - if (rte_vfio_enable("vfio")) - return -1; - - return 0; -} -#endif - -static void rte_eal_init_alert(const char *msg) -{ - fprintf(stderr, "EAL: FATAL: %s\n", msg); - RTE_LOG(ERR, EAL, "%s\n", msg); -} - -/* Launch threads, called at application init(). 
*/ -int -rte_eal_init(int argc, char **argv) -{ - int i, fctret, ret; - pthread_t thread_id; - static rte_atomic32_t run_once = RTE_ATOMIC32_INIT(0); - const char *p; - static char logid[PATH_MAX]; - char cpuset[RTE_CPU_AFFINITY_STR_LEN]; - char thread_name[RTE_MAX_THREAD_NAME_LEN]; - - /* checks if the machine is adequate */ - if (!rte_cpu_is_supported()) { - rte_eal_init_alert("unsupported cpu type."); - rte_errno = ENOTSUP; - return -1; - } - - if (!rte_atomic32_test_and_set(&run_once)) { - rte_eal_init_alert("already called initialization."); - rte_errno = EALREADY; - return -1; - } - - p = strrchr(argv[0], '/'); - strlcpy(logid, p ? p + 1 : argv[0], sizeof(logid)); - thread_id = pthread_self(); - - eal_reset_internal_config(&internal_config); - - /* set log level as early as possible */ - eal_log_level_parse(argc, argv); - - if (rte_eal_cpu_init() < 0) { - rte_eal_init_alert("Cannot detect lcores."); - rte_errno = ENOTSUP; - return -1; - } - - fctret = eal_parse_args(argc, argv); - if (fctret < 0) { - rte_eal_init_alert("Invalid 'command line' arguments."); - rte_errno = EINVAL; - rte_atomic32_clear(&run_once); - return -1; - } - - if (eal_plugins_init() < 0) { - rte_eal_init_alert("Cannot init plugins"); - rte_errno = EINVAL; - rte_atomic32_clear(&run_once); - return -1; - } - - if (eal_option_device_parse()) { - rte_errno = ENODEV; - rte_atomic32_clear(&run_once); - return -1; - } - - rte_config_init(); - - if (rte_eal_intr_init() < 0) { - rte_eal_init_alert("Cannot init interrupt-handling thread"); - return -1; - } - - /* Put mp channel init before bus scan so that we can init the vdev - * bus through mp channel in the secondary process before the bus scan. 
- */ - if (rte_mp_channel_init() < 0) { - rte_eal_init_alert("failed to init mp channel"); - if (rte_eal_process_type() == RTE_PROC_PRIMARY) { - rte_errno = EFAULT; - return -1; - } - } - - /* register multi-process action callbacks for hotplug */ - if (rte_mp_dev_hotplug_init() < 0) { - rte_eal_init_alert("failed to register mp callback for hotplug"); - return -1; - } - - if (rte_bus_scan()) { - rte_eal_init_alert("Cannot scan the buses for devices"); - rte_errno = ENODEV; - rte_atomic32_clear(&run_once); - return -1; - } - - /* if no EAL option "--iova-mode=", use bus IOVA scheme */ - if (internal_config.iova_mode == RTE_IOVA_DC) { - /* autodetect the IOVA mapping mode (default is RTE_IOVA_PA) */ - rte_eal_get_configuration()->iova_mode = - rte_bus_get_iommu_class(); - - /* Workaround for KNI which requires physical address to work */ - if (rte_eal_get_configuration()->iova_mode == RTE_IOVA_VA && - rte_eal_check_module("rte_kni") == 1) { - rte_eal_get_configuration()->iova_mode = RTE_IOVA_PA; - RTE_LOG(WARNING, EAL, - "Some devices want IOVA as VA but PA will be used because.. " - "KNI module inserted\n"); - } - } else { - rte_eal_get_configuration()->iova_mode = - internal_config.iova_mode; - } - - if (internal_config.no_hugetlbfs == 0) { - /* rte_config isn't initialized yet */ - ret = internal_config.process_type == RTE_PROC_PRIMARY ? 
- eal_hugepage_info_init() : - eal_hugepage_info_read(); - if (ret < 0) { - rte_eal_init_alert("Cannot get hugepage information."); - rte_errno = EACCES; - rte_atomic32_clear(&run_once); - return -1; - } - } - - if (internal_config.memory == 0 && internal_config.force_sockets == 0) { - if (internal_config.no_hugetlbfs) - internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE; - } - - if (internal_config.vmware_tsc_map == 1) { -#ifdef RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT - rte_cycles_vmware_tsc_map = 1; - RTE_LOG (DEBUG, EAL, "Using VMWARE TSC MAP, " - "you must have monitor_control.pseudo_perfctr = TRUE\n"); -#else - RTE_LOG (WARNING, EAL, "Ignoring --vmware-tsc-map because " - "RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT is not set\n"); -#endif - } - - rte_srand(rte_rdtsc()); - - if (rte_eal_log_init(logid, internal_config.syslog_facility) < 0) { - rte_eal_init_alert("Cannot init logging."); - rte_errno = ENOMEM; - rte_atomic32_clear(&run_once); - return -1; - } - -#ifdef VFIO_PRESENT - if (rte_eal_vfio_setup() < 0) { - rte_eal_init_alert("Cannot init VFIO"); - rte_errno = EAGAIN; - rte_atomic32_clear(&run_once); - return -1; - } -#endif - /* in secondary processes, memory init may allocate additional fbarrays - * not present in primary processes, so to avoid any potential issues, - * initialize memzones first. 
- */ - if (rte_eal_memzone_init() < 0) { - rte_eal_init_alert("Cannot init memzone"); - rte_errno = ENODEV; - return -1; - } - - if (rte_eal_memory_init() < 0) { - rte_eal_init_alert("Cannot init memory"); - rte_errno = ENOMEM; - return -1; - } - - /* the directories are locked during eal_hugepage_info_init */ - eal_hugedirs_unlock(); - - if (rte_eal_malloc_heap_init() < 0) { - rte_eal_init_alert("Cannot init malloc heap"); - rte_errno = ENODEV; - return -1; - } - - if (rte_eal_tailqs_init() < 0) { - rte_eal_init_alert("Cannot init tail queues for objects"); - rte_errno = EFAULT; - return -1; - } - - if (rte_eal_alarm_init() < 0) { - rte_eal_init_alert("Cannot init interrupt-handling thread"); - /* rte_eal_alarm_init sets rte_errno on failure. */ - return -1; - } - - if (rte_eal_timer_init() < 0) { - rte_eal_init_alert("Cannot init HPET or TSC timers"); - rte_errno = ENOTSUP; - return -1; - } - - eal_check_mem_on_local_socket(); - - eal_thread_init_master(rte_config.master_lcore); - - ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset)); - - RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%zx;cpuset=[%s%s])\n", - rte_config.master_lcore, (uintptr_t)thread_id, cpuset, - ret == 0 ? "" : "..."); - - RTE_LCORE_FOREACH_SLAVE(i) { - - /* - * create communication pipes between master thread - * and children - */ - if (pipe(lcore_config[i].pipe_master2slave) < 0) - rte_panic("Cannot create pipe\n"); - if (pipe(lcore_config[i].pipe_slave2master) < 0) - rte_panic("Cannot create pipe\n"); - - lcore_config[i].state = WAIT; - - /* create a thread for each lcore */ - ret = pthread_create(&lcore_config[i].thread_id, NULL, - eal_thread_loop, NULL); - if (ret != 0) - rte_panic("Cannot create thread\n"); - - /* Set thread_name for aid in debugging. 
*/ - snprintf(thread_name, sizeof(thread_name), - "lcore-slave-%d", i); - ret = rte_thread_setname(lcore_config[i].thread_id, - thread_name); - if (ret != 0) - RTE_LOG(DEBUG, EAL, - "Cannot set name for lcore thread\n"); - } - - /* - * Launch a dummy function on all slave lcores, so that master lcore - * knows they are all ready when this function returns. - */ - rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER); - rte_eal_mp_wait_lcore(); - - /* initialize services so vdevs register service during bus_probe. */ - ret = rte_service_init(); - if (ret) { - rte_eal_init_alert("rte_service_init() failed"); - rte_errno = ENOEXEC; - return -1; - } - - /* Probe all the buses and devices/drivers on them */ - if (rte_bus_probe()) { - rte_eal_init_alert("Cannot probe devices"); - rte_errno = ENOTSUP; - return -1; - } - -#ifdef VFIO_PRESENT - /* Register mp action after probe() so that we got enough info */ - if (rte_vfio_is_enabled("vfio") && vfio_mp_sync_setup() < 0) - return -1; -#endif - - /* initialize default service/lcore mappings and start running. Ignore - * -ENOTSUP, as it indicates no service coremask passed to EAL. - */ - ret = rte_service_start_with_defaults(); - if (ret < 0 && ret != -ENOTSUP) { - rte_errno = ENOEXEC; - return -1; - } - - /* - * Clean up unused files in runtime directory. We do this at the end of - * init and not at the beginning because we want to clean stuff up - * whether we are primary or secondary process, but we cannot remove - * primary process' files because secondary should be able to run even - * if primary process is dead. - * - * In no_shconf mode, no runtime directory is created in the first - * place, so no cleanup needed. 
- */ - if (!internal_config.no_shconf && eal_clean_runtime_dir() < 0) { - rte_eal_init_alert("Cannot clear runtime directory\n"); - return -1; - } - - rte_eal_mcfg_complete(); - - /* Call each registered callback, if enabled */ - rte_option_init(); - - return fctret; -} - -static int -mark_freeable(const struct rte_memseg_list *msl, const struct rte_memseg *ms, - void *arg __rte_unused) -{ - /* ms is const, so find this memseg */ - struct rte_memseg *found; - - if (msl->external) - return 0; - - found = rte_mem_virt2memseg(ms->addr, msl); - - found->flags &= ~RTE_MEMSEG_FLAG_DO_NOT_FREE; - - return 0; -} - -int __rte_experimental -rte_eal_cleanup(void) -{ - /* if we're in a primary process, we need to mark hugepages as freeable - * so that finalization can release them back to the system. - */ - if (rte_eal_process_type() == RTE_PROC_PRIMARY) - rte_memseg_walk(mark_freeable, NULL); - rte_service_finalize(); - rte_mp_channel_cleanup(); - eal_cleanup_config(&internal_config); - return 0; -} - -/* get core role */ -enum rte_lcore_role_t -rte_eal_lcore_role(unsigned lcore_id) -{ - return rte_config.lcore_role[lcore_id]; -} - -enum rte_proc_type_t -rte_eal_process_type(void) -{ - return rte_config.process_type; -} - -int rte_eal_has_hugepages(void) -{ - return ! internal_config.no_hugetlbfs; -} - -int rte_eal_has_pci(void) -{ - return !internal_config.no_pci; -} - -int rte_eal_create_uio_dev(void) -{ - return internal_config.create_uio_dev; -} - -enum rte_intr_mode -rte_eal_vfio_intr_mode(void) -{ - return internal_config.vfio_intr_mode; -} - -int -rte_eal_check_module(const char *module_name) -{ - char sysfs_mod_name[PATH_MAX]; - struct stat st; - int n; - - if (NULL == module_name) - return -1; - - /* Check if there is sysfs mounted */ - if (stat("/sys/module", &st) != 0) { - RTE_LOG(DEBUG, EAL, "sysfs is not mounted! 
error %i (%s)\n", - errno, strerror(errno)); - return -1; - } - - /* A module might be built-in, therefore try sysfs */ - n = snprintf(sysfs_mod_name, PATH_MAX, "/sys/module/%s", module_name); - if (n < 0 || n > PATH_MAX) { - RTE_LOG(DEBUG, EAL, "Could not format module path\n"); - return -1; - } - - if (stat(sysfs_mod_name, &st) != 0) { - RTE_LOG(DEBUG, EAL, "Module %s not found! error %i (%s)\n", - sysfs_mod_name, errno, strerror(errno)); - return 0; - } - - /* Module has been found */ - return 1; -} diff --git a/lib/librte_eal/linuxapp/eal/eal_alarm.c b/lib/librte_eal/linuxapp/eal/eal_alarm.c deleted file mode 100644 index 840ede7806..0000000000 --- a/lib/librte_eal/linuxapp/eal/eal_alarm.c +++ /dev/null @@ -1,243 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2014 Intel Corporation - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifndef TFD_NONBLOCK -#include -#define TFD_NONBLOCK O_NONBLOCK -#endif - -#define NS_PER_US 1000 -#define US_PER_MS 1000 -#define MS_PER_S 1000 -#ifndef US_PER_S -#define US_PER_S (US_PER_MS * MS_PER_S) -#endif - -#ifdef CLOCK_MONOTONIC_RAW /* Defined in glibc bits/time.h */ -#define CLOCK_TYPE_ID CLOCK_MONOTONIC_RAW -#else -#define CLOCK_TYPE_ID CLOCK_MONOTONIC -#endif - -struct alarm_entry { - LIST_ENTRY(alarm_entry) next; - struct timeval time; - rte_eal_alarm_callback cb_fn; - void *cb_arg; - volatile uint8_t executing; - volatile pthread_t executing_id; -}; - -static LIST_HEAD(alarm_list, alarm_entry) alarm_list = LIST_HEAD_INITIALIZER(); -static rte_spinlock_t alarm_list_lk = RTE_SPINLOCK_INITIALIZER; - -static struct rte_intr_handle intr_handle = {.fd = -1 }; -static int handler_registered = 0; -static void eal_alarm_callback(void *arg); - -int -rte_eal_alarm_init(void) -{ - intr_handle.type = RTE_INTR_HANDLE_ALARM; - /* create a timerfd file descriptor 
*/ - intr_handle.fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK); - if (intr_handle.fd == -1) - goto error; - - return 0; - -error: - rte_errno = errno; - return -1; -} - -static void -eal_alarm_callback(void *arg __rte_unused) -{ - struct timespec now; - struct alarm_entry *ap; - - rte_spinlock_lock(&alarm_list_lk); - while ((ap = LIST_FIRST(&alarm_list)) !=NULL && - clock_gettime(CLOCK_TYPE_ID, &now) == 0 && - (ap->time.tv_sec < now.tv_sec || (ap->time.tv_sec == now.tv_sec && - (ap->time.tv_usec * NS_PER_US) <= now.tv_nsec))) { - ap->executing = 1; - ap->executing_id = pthread_self(); - rte_spinlock_unlock(&alarm_list_lk); - - ap->cb_fn(ap->cb_arg); - - rte_spinlock_lock(&alarm_list_lk); - - LIST_REMOVE(ap, next); - free(ap); - } - - if (!LIST_EMPTY(&alarm_list)) { - struct itimerspec atime = { .it_interval = { 0, 0 } }; - - ap = LIST_FIRST(&alarm_list); - atime.it_value.tv_sec = ap->time.tv_sec; - atime.it_value.tv_nsec = ap->time.tv_usec * NS_PER_US; - /* perform borrow for subtraction if necessary */ - if (now.tv_nsec > (ap->time.tv_usec * NS_PER_US)) - atime.it_value.tv_sec--, atime.it_value.tv_nsec += US_PER_S * NS_PER_US; - - atime.it_value.tv_sec -= now.tv_sec; - atime.it_value.tv_nsec -= now.tv_nsec; - timerfd_settime(intr_handle.fd, 0, &atime, NULL); - } - rte_spinlock_unlock(&alarm_list_lk); -} - -int -rte_eal_alarm_set(uint64_t us, rte_eal_alarm_callback cb_fn, void *cb_arg) -{ - struct timespec now; - int ret = 0; - struct alarm_entry *ap, *new_alarm; - - /* Check parameters, including that us won't cause a uint64_t overflow */ - if (us < 1 || us > (UINT64_MAX - US_PER_S) || cb_fn == NULL) - return -EINVAL; - - new_alarm = calloc(1, sizeof(*new_alarm)); - if (new_alarm == NULL) - return -ENOMEM; - - /* use current time to calculate absolute time of alarm */ - clock_gettime(CLOCK_TYPE_ID, &now); - - new_alarm->cb_fn = cb_fn; - new_alarm->cb_arg = cb_arg; - new_alarm->time.tv_usec = ((now.tv_nsec / NS_PER_US) + us) % US_PER_S; - 
new_alarm->time.tv_sec = now.tv_sec + (((now.tv_nsec / NS_PER_US) + us) / US_PER_S); - - rte_spinlock_lock(&alarm_list_lk); - if (!handler_registered) { - ret |= rte_intr_callback_register(&intr_handle, - eal_alarm_callback, NULL); - handler_registered = (ret == 0) ? 1 : 0; - } - - if (LIST_EMPTY(&alarm_list)) - LIST_INSERT_HEAD(&alarm_list, new_alarm, next); - else { - LIST_FOREACH(ap, &alarm_list, next) { - if (ap->time.tv_sec > new_alarm->time.tv_sec || - (ap->time.tv_sec == new_alarm->time.tv_sec && - ap->time.tv_usec > new_alarm->time.tv_usec)){ - LIST_INSERT_BEFORE(ap, new_alarm, next); - break; - } - if (LIST_NEXT(ap, next) == NULL) { - LIST_INSERT_AFTER(ap, new_alarm, next); - break; - } - } - } - - if (LIST_FIRST(&alarm_list) == new_alarm) { - struct itimerspec alarm_time = { - .it_interval = {0, 0}, - .it_value = { - .tv_sec = us / US_PER_S, - .tv_nsec = (us % US_PER_S) * NS_PER_US, - }, - }; - ret |= timerfd_settime(intr_handle.fd, 0, &alarm_time, NULL); - } - rte_spinlock_unlock(&alarm_list_lk); - - return ret; -} - -int -rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn, void *cb_arg) -{ - struct alarm_entry *ap, *ap_prev; - int count = 0; - int err = 0; - int executing; - - if (!cb_fn) { - rte_errno = EINVAL; - return -1; - } - - do { - executing = 0; - rte_spinlock_lock(&alarm_list_lk); - /* remove any matches at the start of the list */ - while ((ap = LIST_FIRST(&alarm_list)) != NULL && - cb_fn == ap->cb_fn && - (cb_arg == (void *)-1 || cb_arg == ap->cb_arg)) { - - if (ap->executing == 0) { - LIST_REMOVE(ap, next); - free(ap); - count++; - } else { - /* If calling from other context, mark that alarm is executing - * so loop can spin till it finish. 
Otherwise we are trying to - * cancel our self - mark it by EINPROGRESS */ - if (pthread_equal(ap->executing_id, pthread_self()) == 0) - executing++; - else - err = EINPROGRESS; - - break; - } - } - ap_prev = ap; - - /* now go through list, removing entries not at start */ - LIST_FOREACH(ap, &alarm_list, next) { - /* this won't be true first time through */ - if (cb_fn == ap->cb_fn && - (cb_arg == (void *)-1 || cb_arg == ap->cb_arg)) { - - if (ap->executing == 0) { - LIST_REMOVE(ap, next); - free(ap); - count++; - ap = ap_prev; - } else if (pthread_equal(ap->executing_id, pthread_self()) == 0) - executing++; - else - err = EINPROGRESS; - } - ap_prev = ap; - } - rte_spinlock_unlock(&alarm_list_lk); - } while (executing != 0); - - if (count == 0 && err == 0) - rte_errno = ENOENT; - else if (err) - rte_errno = err; - - return count; -} diff --git a/lib/librte_eal/linuxapp/eal/eal_cpuflags.c b/lib/librte_eal/linuxapp/eal/eal_cpuflags.c deleted file mode 100644 index d38296e1e5..0000000000 --- a/lib/librte_eal/linuxapp/eal/eal_cpuflags.c +++ /dev/null @@ -1,84 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright 2018 Red Hat, Inc. - */ - -#include -#include -#include -#include -#include -#include - -#if defined(__GLIBC__) && defined(__GLIBC_PREREQ) -#if __GLIBC_PREREQ(2, 16) -#include -#define HAS_AUXV 1 -#endif -#endif - -#include - -#ifndef HAS_AUXV -static unsigned long -getauxval(unsigned long type __rte_unused) -{ - errno = ENOTSUP; - return 0; -} -#endif - -#ifdef RTE_ARCH_64 -typedef Elf64_auxv_t Internal_Elfx_auxv_t; -#else -typedef Elf32_auxv_t Internal_Elfx_auxv_t; -#endif - -/** - * Provides a method for retrieving values from the auxiliary vector and - * possibly running a string comparison. - * - * @return Always returns a result. When the result is 0, check errno - * to see if an error occurred during processing. 
- */ -static unsigned long -_rte_cpu_getauxval(unsigned long type, const char *str) -{ - unsigned long val; - - errno = 0; - val = getauxval(type); - - if (!val && (errno == ENOTSUP || errno == ENOENT)) { - int auxv_fd = open("/proc/self/auxv", O_RDONLY); - Internal_Elfx_auxv_t auxv; - - if (auxv_fd == -1) - return 0; - - errno = ENOENT; - while (read(auxv_fd, &auxv, sizeof(auxv)) == sizeof(auxv)) { - if (auxv.a_type == type) { - errno = 0; - val = auxv.a_un.a_val; - if (str) - val = strcmp((const char *)val, str); - break; - } - } - close(auxv_fd); - } - - return val; -} - -unsigned long -rte_cpu_getauxval(unsigned long type) -{ - return _rte_cpu_getauxval(type, NULL); -} - -int -rte_cpu_strcmp_auxval(unsigned long type, const char *str) -{ - return _rte_cpu_getauxval(type, str); -} diff --git a/lib/librte_eal/linuxapp/eal/eal_debug.c b/lib/librte_eal/linuxapp/eal/eal_debug.c deleted file mode 100644 index 5d92500bf5..0000000000 --- a/lib/librte_eal/linuxapp/eal/eal_debug.c +++ /dev/null @@ -1,92 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2014 Intel Corporation - */ - -#ifdef RTE_BACKTRACE -#include -#endif -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#define BACKTRACE_SIZE 256 - -/* dump the stack of the calling core */ -void rte_dump_stack(void) -{ -#ifdef RTE_BACKTRACE - void *func[BACKTRACE_SIZE]; - char **symb = NULL; - int size; - - size = backtrace(func, BACKTRACE_SIZE); - symb = backtrace_symbols(func, size); - - if (symb == NULL) - return; - - while (size > 0) { - rte_log(RTE_LOG_ERR, RTE_LOGTYPE_EAL, - "%d: [%s]\n", size, symb[size - 1]); - size --; - } - - free(symb); -#endif /* RTE_BACKTRACE */ -} - -/* not implemented in this environment */ -void rte_dump_registers(void) -{ - return; -} - -/* call abort(), it will generate a coredump if enabled */ -void __rte_panic(const char *funcname, const char *format, ...) 
-{ - va_list ap; - - rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, "PANIC in %s():\n", funcname); - va_start(ap, format); - rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap); - va_end(ap); - rte_dump_stack(); - rte_dump_registers(); - abort(); -} - -/* - * Like rte_panic this terminates the application. However, no traceback is - * provided and no core-dump is generated. - */ -void -rte_exit(int exit_code, const char *format, ...) -{ - va_list ap; - - if (exit_code != 0) - RTE_LOG(CRIT, EAL, "Error - exiting with code: %d\n" - " Cause: ", exit_code); - - va_start(ap, format); - rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap); - va_end(ap); - -#ifndef RTE_EAL_ALWAYS_PANIC_ON_ERROR - if (rte_eal_cleanup() != 0) - RTE_LOG(CRIT, EAL, - "EAL could not release all resources\n"); - exit(exit_code); -#else - rte_dump_stack(); - rte_dump_registers(); - abort(); -#endif -} diff --git a/lib/librte_eal/linuxapp/eal/eal_dev.c b/lib/librte_eal/linuxapp/eal/eal_dev.c deleted file mode 100644 index 2830c8687d..0000000000 --- a/lib/librte_eal/linuxapp/eal/eal_dev.c +++ /dev/null @@ -1,396 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2018 Intel Corporation - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "eal_private.h" - -static struct rte_intr_handle intr_handle = {.fd = -1 }; -static bool monitor_started; -static bool hotplug_handle; - -#define EAL_UEV_MSG_LEN 4096 -#define EAL_UEV_MSG_ELEM_LEN 128 - -/* - * spinlock for device hot-unplug failure handling. If it try to access bus or - * device, such as handle sigbus on bus or handle memory failure for device - * just need to use this lock. It could protect the bus and the device to avoid - * race condition. 
- */ -static rte_spinlock_t failure_handle_lock = RTE_SPINLOCK_INITIALIZER; - -static struct sigaction sigbus_action_old; - -static int sigbus_need_recover; - -static void dev_uev_handler(__rte_unused void *param); - -/* identify the system layer which reports this event. */ -enum eal_dev_event_subsystem { - EAL_DEV_EVENT_SUBSYSTEM_PCI, /* PCI bus device event */ - EAL_DEV_EVENT_SUBSYSTEM_UIO, /* UIO driver device event */ - EAL_DEV_EVENT_SUBSYSTEM_VFIO, /* VFIO driver device event */ - EAL_DEV_EVENT_SUBSYSTEM_MAX -}; - -static void -sigbus_action_recover(void) -{ - if (sigbus_need_recover) { - sigaction(SIGBUS, &sigbus_action_old, NULL); - sigbus_need_recover = 0; - } -} - -static void sigbus_handler(int signum, siginfo_t *info, - void *ctx __rte_unused) -{ - int ret; - - RTE_LOG(DEBUG, EAL, "Thread[%d] catch SIGBUS, fault address:%p\n", - (int)pthread_self(), info->si_addr); - - rte_spinlock_lock(&failure_handle_lock); - ret = rte_bus_sigbus_handler(info->si_addr); - rte_spinlock_unlock(&failure_handle_lock); - if (ret == -1) { - rte_exit(EXIT_FAILURE, - "Failed to handle SIGBUS for hot-unplug, " - "(rte_errno: %s)!", strerror(rte_errno)); - } else if (ret == 1) { - if (sigbus_action_old.sa_flags == SA_SIGINFO - && sigbus_action_old.sa_sigaction) { - (*(sigbus_action_old.sa_sigaction))(signum, - info, ctx); - } else if (sigbus_action_old.sa_flags != SA_SIGINFO - && sigbus_action_old.sa_handler) { - (*(sigbus_action_old.sa_handler))(signum); - } else { - rte_exit(EXIT_FAILURE, - "Failed to handle generic SIGBUS!"); - } - } - - RTE_LOG(DEBUG, EAL, "Success to handle SIGBUS for hot-unplug!\n"); -} - -static int cmp_dev_name(const struct rte_device *dev, - const void *_name) -{ - const char *name = _name; - - return strcmp(dev->name, name); -} - -static int -dev_uev_socket_fd_create(void) -{ - struct sockaddr_nl addr; - int ret; - - intr_handle.fd = socket(PF_NETLINK, SOCK_RAW | SOCK_CLOEXEC | - SOCK_NONBLOCK, - NETLINK_KOBJECT_UEVENT); - if (intr_handle.fd < 0) { - 
RTE_LOG(ERR, EAL, "create uevent fd failed.\n"); - return -1; - } - - memset(&addr, 0, sizeof(addr)); - addr.nl_family = AF_NETLINK; - addr.nl_pid = 0; - addr.nl_groups = 0xffffffff; - - ret = bind(intr_handle.fd, (struct sockaddr *) &addr, sizeof(addr)); - if (ret < 0) { - RTE_LOG(ERR, EAL, "Failed to bind uevent socket.\n"); - goto err; - } - - return 0; -err: - close(intr_handle.fd); - intr_handle.fd = -1; - return ret; -} - -static int -dev_uev_parse(const char *buf, struct rte_dev_event *event, int length) -{ - char action[EAL_UEV_MSG_ELEM_LEN]; - char subsystem[EAL_UEV_MSG_ELEM_LEN]; - char pci_slot_name[EAL_UEV_MSG_ELEM_LEN]; - int i = 0; - - memset(action, 0, EAL_UEV_MSG_ELEM_LEN); - memset(subsystem, 0, EAL_UEV_MSG_ELEM_LEN); - memset(pci_slot_name, 0, EAL_UEV_MSG_ELEM_LEN); - - while (i < length) { - for (; i < length; i++) { - if (*buf) - break; - buf++; - } - /** - * check device uevent from kernel side, no need to check - * uevent from udev. - */ - if (!strncmp(buf, "libudev", 7)) { - buf += 7; - i += 7; - return -1; - } - if (!strncmp(buf, "ACTION=", 7)) { - buf += 7; - i += 7; - strlcpy(action, buf, sizeof(action)); - } else if (!strncmp(buf, "SUBSYSTEM=", 10)) { - buf += 10; - i += 10; - strlcpy(subsystem, buf, sizeof(subsystem)); - } else if (!strncmp(buf, "PCI_SLOT_NAME=", 14)) { - buf += 14; - i += 14; - strlcpy(pci_slot_name, buf, sizeof(subsystem)); - event->devname = strdup(pci_slot_name); - } - for (; i < length; i++) { - if (*buf == '\0') - break; - buf++; - } - } - - /* parse the subsystem layer */ - if (!strncmp(subsystem, "uio", 3)) - event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_UIO; - else if (!strncmp(subsystem, "pci", 3)) - event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_PCI; - else if (!strncmp(subsystem, "vfio", 4)) - event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_VFIO; - else - return -1; - - /* parse the action type */ - if (!strncmp(action, "add", 3)) - event->type = RTE_DEV_EVENT_ADD; - else if (!strncmp(action, "remove", 6)) - event->type 
= RTE_DEV_EVENT_REMOVE; - else - return -1; - return 0; -} - -static void -dev_delayed_unregister(void *param) -{ - rte_intr_callback_unregister(&intr_handle, dev_uev_handler, param); - close(intr_handle.fd); - intr_handle.fd = -1; -} - -static void -dev_uev_handler(__rte_unused void *param) -{ - struct rte_dev_event uevent; - int ret; - char buf[EAL_UEV_MSG_LEN]; - struct rte_bus *bus; - struct rte_device *dev; - const char *busname = ""; - - memset(&uevent, 0, sizeof(struct rte_dev_event)); - memset(buf, 0, EAL_UEV_MSG_LEN); - - ret = recv(intr_handle.fd, buf, EAL_UEV_MSG_LEN, MSG_DONTWAIT); - if (ret < 0 && errno == EAGAIN) - return; - else if (ret <= 0) { - /* connection is closed or broken, can not up again. */ - RTE_LOG(ERR, EAL, "uevent socket connection is broken.\n"); - rte_eal_alarm_set(1, dev_delayed_unregister, NULL); - return; - } - - ret = dev_uev_parse(buf, &uevent, EAL_UEV_MSG_LEN); - if (ret < 0) { - RTE_LOG(DEBUG, EAL, "It is not an valid event " - "that need to be handle.\n"); - return; - } - - RTE_LOG(DEBUG, EAL, "receive uevent(name:%s, type:%d, subsystem:%d)\n", - uevent.devname, uevent.type, uevent.subsystem); - - switch (uevent.subsystem) { - case EAL_DEV_EVENT_SUBSYSTEM_PCI: - case EAL_DEV_EVENT_SUBSYSTEM_UIO: - busname = "pci"; - break; - default: - break; - } - - if (uevent.devname) { - if (uevent.type == RTE_DEV_EVENT_REMOVE && hotplug_handle) { - rte_spinlock_lock(&failure_handle_lock); - bus = rte_bus_find_by_name(busname); - if (bus == NULL) { - RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", - busname); - goto failure_handle_err; - } - - dev = bus->find_device(NULL, cmp_dev_name, - uevent.devname); - if (dev == NULL) { - RTE_LOG(ERR, EAL, "Cannot find device (%s) on " - "bus (%s)\n", uevent.devname, busname); - goto failure_handle_err; - } - - ret = bus->hot_unplug_handler(dev); - if (ret) { - RTE_LOG(ERR, EAL, "Can not handle hot-unplug " - "for device (%s)\n", dev->name); - } - rte_spinlock_unlock(&failure_handle_lock); - } - 
rte_dev_event_callback_process(uevent.devname, uevent.type); - } - - return; - -failure_handle_err: - rte_spinlock_unlock(&failure_handle_lock); -} - -int __rte_experimental -rte_dev_event_monitor_start(void) -{ - int ret; - - if (monitor_started) - return 0; - - ret = dev_uev_socket_fd_create(); - if (ret) { - RTE_LOG(ERR, EAL, "error create device event fd.\n"); - return -1; - } - - intr_handle.type = RTE_INTR_HANDLE_DEV_EVENT; - ret = rte_intr_callback_register(&intr_handle, dev_uev_handler, NULL); - - if (ret) { - RTE_LOG(ERR, EAL, "fail to register uevent callback.\n"); - return -1; - } - - monitor_started = true; - - return 0; -} - -int __rte_experimental -rte_dev_event_monitor_stop(void) -{ - int ret; - - if (!monitor_started) - return 0; - - ret = rte_intr_callback_unregister(&intr_handle, dev_uev_handler, - (void *)-1); - if (ret < 0) { - RTE_LOG(ERR, EAL, "fail to unregister uevent callback.\n"); - return ret; - } - - close(intr_handle.fd); - intr_handle.fd = -1; - monitor_started = false; - - return 0; -} - -int -dev_sigbus_handler_register(void) -{ - sigset_t mask; - struct sigaction action; - - rte_errno = 0; - - if (sigbus_need_recover) - return 0; - - sigemptyset(&mask); - sigaddset(&mask, SIGBUS); - action.sa_flags = SA_SIGINFO; - action.sa_mask = mask; - action.sa_sigaction = sigbus_handler; - sigbus_need_recover = !sigaction(SIGBUS, &action, &sigbus_action_old); - - return rte_errno; -} - -int -dev_sigbus_handler_unregister(void) -{ - rte_errno = 0; - - sigbus_action_recover(); - - return rte_errno; -} - -int __rte_experimental -rte_dev_hotplug_handle_enable(void) -{ - int ret = 0; - - ret = dev_sigbus_handler_register(); - if (ret < 0) - RTE_LOG(ERR, EAL, - "fail to register sigbus handler for devices.\n"); - - hotplug_handle = true; - - return ret; -} - -int __rte_experimental -rte_dev_hotplug_handle_disable(void) -{ - int ret = 0; - - ret = dev_sigbus_handler_unregister(); - if (ret < 0) - RTE_LOG(ERR, EAL, - "fail to unregister sigbus handler 
for devices.\n"); - - hotplug_handle = false; - - return ret; -} diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c deleted file mode 100644 index 0eab1cf719..0000000000 --- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c +++ /dev/null @@ -1,526 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2014 Intel Corporation - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include /* for hugetlb-related flags */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "rte_string_fns.h" -#include "eal_internal_cfg.h" -#include "eal_hugepages.h" -#include "eal_filesystem.h" - -static const char sys_dir_path[] = "/sys/kernel/mm/hugepages"; -static const char sys_pages_numa_dir_path[] = "/sys/devices/system/node"; - -/* - * Uses mmap to create a shared memory area for storage of data - * Used in this file to store the hugepage file map on disk - */ -static void * -map_shared_memory(const char *filename, const size_t mem_size, int flags) -{ - void *retval; - int fd = open(filename, flags, 0666); - if (fd < 0) - return NULL; - if (ftruncate(fd, mem_size) < 0) { - close(fd); - return NULL; - } - retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, - MAP_SHARED, fd, 0); - close(fd); - return retval; -} - -static void * -open_shared_memory(const char *filename, const size_t mem_size) -{ - return map_shared_memory(filename, mem_size, O_RDWR); -} - -static void * -create_shared_memory(const char *filename, const size_t mem_size) -{ - return map_shared_memory(filename, mem_size, O_RDWR | O_CREAT); -} - -/* this function is only called from eal_hugepage_info_init which itself - * is only called from a primary process */ -static uint32_t -get_num_hugepages(const char *subdir) -{ - char path[PATH_MAX]; - long unsigned resv_pages, num_pages = 0; - 
const char *nr_hp_file = "free_hugepages"; - const char *nr_rsvd_file = "resv_hugepages"; - - /* first, check how many reserved pages kernel reports */ - snprintf(path, sizeof(path), "%s/%s/%s", - sys_dir_path, subdir, nr_rsvd_file); - if (eal_parse_sysfs_value(path, &resv_pages) < 0) - return 0; - - snprintf(path, sizeof(path), "%s/%s/%s", - sys_dir_path, subdir, nr_hp_file); - if (eal_parse_sysfs_value(path, &num_pages) < 0) - return 0; - - if (num_pages == 0) - RTE_LOG(WARNING, EAL, "No free hugepages reported in %s\n", - subdir); - - /* adjust num_pages */ - if (num_pages >= resv_pages) - num_pages -= resv_pages; - else if (resv_pages) - num_pages = 0; - - /* we want to return a uint32_t and more than this looks suspicious - * anyway ... */ - if (num_pages > UINT32_MAX) - num_pages = UINT32_MAX; - - return num_pages; -} - -static uint32_t -get_num_hugepages_on_node(const char *subdir, unsigned int socket) -{ - char path[PATH_MAX], socketpath[PATH_MAX]; - DIR *socketdir; - unsigned long num_pages = 0; - const char *nr_hp_file = "free_hugepages"; - - snprintf(socketpath, sizeof(socketpath), "%s/node%u/hugepages", - sys_pages_numa_dir_path, socket); - - socketdir = opendir(socketpath); - if (socketdir) { - /* Keep calm and carry on */ - closedir(socketdir); - } else { - /* Can't find socket dir, so ignore it */ - return 0; - } - - snprintf(path, sizeof(path), "%s/%s/%s", - socketpath, subdir, nr_hp_file); - if (eal_parse_sysfs_value(path, &num_pages) < 0) - return 0; - - if (num_pages == 0) - RTE_LOG(WARNING, EAL, "No free hugepages reported in %s\n", - subdir); - - /* - * we want to return a uint32_t and more than this looks suspicious - * anyway ... 
- */ - if (num_pages > UINT32_MAX) - num_pages = UINT32_MAX; - - return num_pages; -} - -static uint64_t -get_default_hp_size(void) -{ - const char proc_meminfo[] = "/proc/meminfo"; - const char str_hugepagesz[] = "Hugepagesize:"; - unsigned hugepagesz_len = sizeof(str_hugepagesz) - 1; - char buffer[256]; - unsigned long long size = 0; - - FILE *fd = fopen(proc_meminfo, "r"); - if (fd == NULL) - rte_panic("Cannot open %s\n", proc_meminfo); - while(fgets(buffer, sizeof(buffer), fd)){ - if (strncmp(buffer, str_hugepagesz, hugepagesz_len) == 0){ - size = rte_str_to_size(&buffer[hugepagesz_len]); - break; - } - } - fclose(fd); - if (size == 0) - rte_panic("Cannot get default hugepage size from %s\n", proc_meminfo); - return size; -} - -static int -get_hugepage_dir(uint64_t hugepage_sz, char *hugedir, int len) -{ - enum proc_mount_fieldnames { - DEVICE = 0, - MOUNTPT, - FSTYPE, - OPTIONS, - _FIELDNAME_MAX - }; - static uint64_t default_size = 0; - const char proc_mounts[] = "/proc/mounts"; - const char hugetlbfs_str[] = "hugetlbfs"; - const size_t htlbfs_str_len = sizeof(hugetlbfs_str) - 1; - const char pagesize_opt[] = "pagesize="; - const size_t pagesize_opt_len = sizeof(pagesize_opt) - 1; - const char split_tok = ' '; - char *splitstr[_FIELDNAME_MAX]; - char buf[BUFSIZ]; - int retval = -1; - - FILE *fd = fopen(proc_mounts, "r"); - if (fd == NULL) - rte_panic("Cannot open %s\n", proc_mounts); - - if (default_size == 0) - default_size = get_default_hp_size(); - - while (fgets(buf, sizeof(buf), fd)){ - if (rte_strsplit(buf, sizeof(buf), splitstr, _FIELDNAME_MAX, - split_tok) != _FIELDNAME_MAX) { - RTE_LOG(ERR, EAL, "Error parsing %s\n", proc_mounts); - break; /* return NULL */ - } - - /* we have a specified --huge-dir option, only examine that dir */ - if (internal_config.hugepage_dir != NULL && - strcmp(splitstr[MOUNTPT], internal_config.hugepage_dir) != 0) - continue; - - if (strncmp(splitstr[FSTYPE], hugetlbfs_str, htlbfs_str_len) == 0){ - const char *pagesz_str = 
strstr(splitstr[OPTIONS], pagesize_opt); - - /* if no explicit page size, the default page size is compared */ - if (pagesz_str == NULL){ - if (hugepage_sz == default_size){ - strlcpy(hugedir, splitstr[MOUNTPT], len); - retval = 0; - break; - } - } - /* there is an explicit page size, so check it */ - else { - uint64_t pagesz = rte_str_to_size(&pagesz_str[pagesize_opt_len]); - if (pagesz == hugepage_sz) { - strlcpy(hugedir, splitstr[MOUNTPT], len); - retval = 0; - break; - } - } - } /* end if strncmp hugetlbfs */ - } /* end while fgets */ - - fclose(fd); - return retval; -} - -/* - * Clear the hugepage directory of whatever hugepage files - * there are. Checks if the file is locked (i.e. - * if it's in use by another DPDK process). - */ -static int -clear_hugedir(const char * hugedir) -{ - DIR *dir; - struct dirent *dirent; - int dir_fd, fd, lck_result; - const char filter[] = "*map_*"; /* matches hugepage files */ - - /* open directory */ - dir = opendir(hugedir); - if (!dir) { - RTE_LOG(ERR, EAL, "Unable to open hugepage directory %s\n", - hugedir); - goto error; - } - dir_fd = dirfd(dir); - - dirent = readdir(dir); - if (!dirent) { - RTE_LOG(ERR, EAL, "Unable to read hugepage directory %s\n", - hugedir); - goto error; - } - - while(dirent != NULL){ - /* skip files that don't match the hugepage pattern */ - if (fnmatch(filter, dirent->d_name, 0) > 0) { - dirent = readdir(dir); - continue; - } - - /* try and lock the file */ - fd = openat(dir_fd, dirent->d_name, O_RDONLY); - - /* skip to next file */ - if (fd == -1) { - dirent = readdir(dir); - continue; - } - - /* non-blocking lock */ - lck_result = flock(fd, LOCK_EX | LOCK_NB); - - /* if lock succeeds, remove the file */ - if (lck_result != -1) - unlinkat(dir_fd, dirent->d_name, 0); - close (fd); - dirent = readdir(dir); - } - - closedir(dir); - return 0; - -error: - if (dir) - closedir(dir); - - RTE_LOG(ERR, EAL, "Error while clearing hugepage dir: %s\n", - strerror(errno)); - - return -1; -} - -static int 
-compare_hpi(const void *a, const void *b) -{ - const struct hugepage_info *hpi_a = a; - const struct hugepage_info *hpi_b = b; - - return hpi_b->hugepage_sz - hpi_a->hugepage_sz; -} - -static void -calc_num_pages(struct hugepage_info *hpi, struct dirent *dirent) -{ - uint64_t total_pages = 0; - unsigned int i; - - /* - * first, try to put all hugepages into relevant sockets, but - * if first attempts fails, fall back to collecting all pages - * in one socket and sorting them later - */ - total_pages = 0; - /* we also don't want to do this for legacy init */ - if (!internal_config.legacy_mem) - for (i = 0; i < rte_socket_count(); i++) { - int socket = rte_socket_id_by_idx(i); - unsigned int num_pages = - get_num_hugepages_on_node( - dirent->d_name, socket); - hpi->num_pages[socket] = num_pages; - total_pages += num_pages; - } - /* - * we failed to sort memory from the get go, so fall - * back to old way - */ - if (total_pages == 0) { - hpi->num_pages[0] = get_num_hugepages(dirent->d_name); - -#ifndef RTE_ARCH_64 - /* for 32-bit systems, limit number of hugepages to - * 1GB per page size */ - hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0], - RTE_PGSIZE_1G / hpi->hugepage_sz); -#endif - } -} - -static int -hugepage_info_init(void) -{ const char dirent_start_text[] = "hugepages-"; - const size_t dirent_start_len = sizeof(dirent_start_text) - 1; - unsigned int i, num_sizes = 0; - DIR *dir; - struct dirent *dirent; - - dir = opendir(sys_dir_path); - if (dir == NULL) { - RTE_LOG(ERR, EAL, - "Cannot open directory %s to read system hugepage info\n", - sys_dir_path); - return -1; - } - - for (dirent = readdir(dir); dirent != NULL; dirent = readdir(dir)) { - struct hugepage_info *hpi; - - if (strncmp(dirent->d_name, dirent_start_text, - dirent_start_len) != 0) - continue; - - if (num_sizes >= MAX_HUGEPAGE_SIZES) - break; - - hpi = &internal_config.hugepage_info[num_sizes]; - hpi->hugepage_sz = - rte_str_to_size(&dirent->d_name[dirent_start_len]); - - /* first, check if we 
have a mountpoint */ - if (get_hugepage_dir(hpi->hugepage_sz, - hpi->hugedir, sizeof(hpi->hugedir)) < 0) { - uint32_t num_pages; - - num_pages = get_num_hugepages(dirent->d_name); - if (num_pages > 0) - RTE_LOG(NOTICE, EAL, - "%" PRIu32 " hugepages of size " - "%" PRIu64 " reserved, but no mounted " - "hugetlbfs found for that size\n", - num_pages, hpi->hugepage_sz); - /* if we have kernel support for reserving hugepages - * through mmap, and we're in in-memory mode, treat this - * page size as valid. we cannot be in legacy mode at - * this point because we've checked this earlier in the - * init process. - */ -#ifdef MAP_HUGE_SHIFT - if (internal_config.in_memory) { - RTE_LOG(DEBUG, EAL, "In-memory mode enabled, " - "hugepages of size %" PRIu64 " bytes " - "will be allocated anonymously\n", - hpi->hugepage_sz); - calc_num_pages(hpi, dirent); - num_sizes++; - } -#endif - continue; - } - - /* try to obtain a writelock */ - hpi->lock_descriptor = open(hpi->hugedir, O_RDONLY); - - /* if blocking lock failed */ - if (flock(hpi->lock_descriptor, LOCK_EX) == -1) { - RTE_LOG(CRIT, EAL, - "Failed to lock hugepage directory!\n"); - break; - } - /* clear out the hugepages dir from unused pages */ - if (clear_hugedir(hpi->hugedir) == -1) - break; - - calc_num_pages(hpi, dirent); - - num_sizes++; - } - closedir(dir); - - /* something went wrong, and we broke from the for loop above */ - if (dirent != NULL) - return -1; - - internal_config.num_hugepage_sizes = num_sizes; - - /* sort the page directory entries by size, largest to smallest */ - qsort(&internal_config.hugepage_info[0], num_sizes, - sizeof(internal_config.hugepage_info[0]), compare_hpi); - - /* now we have all info, check we have at least one valid size */ - for (i = 0; i < num_sizes; i++) { - /* pages may no longer all be on socket 0, so check all */ - unsigned int j, num_pages = 0; - struct hugepage_info *hpi = &internal_config.hugepage_info[i]; - - for (j = 0; j < RTE_MAX_NUMA_NODES; j++) - num_pages += 
hpi->num_pages[j]; - if (num_pages > 0) - return 0; - } - - /* no valid hugepage mounts available, return error */ - return -1; -} - -/* - * when we initialize the hugepage info, everything goes - * to socket 0 by default. it will later get sorted by memory - * initialization procedure. - */ -int -eal_hugepage_info_init(void) -{ - struct hugepage_info *hpi, *tmp_hpi; - unsigned int i; - - if (hugepage_info_init() < 0) - return -1; - - /* for no shared files mode, we're done */ - if (internal_config.no_shconf) - return 0; - - hpi = &internal_config.hugepage_info[0]; - - tmp_hpi = create_shared_memory(eal_hugepage_info_path(), - sizeof(internal_config.hugepage_info)); - if (tmp_hpi == NULL) { - RTE_LOG(ERR, EAL, "Failed to create shared memory!\n"); - return -1; - } - - memcpy(tmp_hpi, hpi, sizeof(internal_config.hugepage_info)); - - /* we've copied file descriptors along with everything else, but they - * will be invalid in secondary process, so overwrite them - */ - for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) { - struct hugepage_info *tmp = &tmp_hpi[i]; - tmp->lock_descriptor = -1; - } - - if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) { - RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n"); - return -1; - } - return 0; -} - -int eal_hugepage_info_read(void) -{ - struct hugepage_info *hpi = &internal_config.hugepage_info[0]; - struct hugepage_info *tmp_hpi; - - tmp_hpi = open_shared_memory(eal_hugepage_info_path(), - sizeof(internal_config.hugepage_info)); - if (tmp_hpi == NULL) { - RTE_LOG(ERR, EAL, "Failed to open shared memory!\n"); - return -1; - } - - memcpy(hpi, tmp_hpi, sizeof(internal_config.hugepage_info)); - - if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) { - RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n"); - return -1; - } - return 0; -} diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c deleted file mode 100644 index cbac451e11..0000000000 
--- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c +++ /dev/null @@ -1,1326 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2014 Intel Corporation - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "eal_private.h" -#include "eal_vfio.h" -#include "eal_thread.h" - -#define EAL_INTR_EPOLL_WAIT_FOREVER (-1) -#define NB_OTHER_INTR 1 - -static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */ - -/** - * union for pipe fds. - */ -union intr_pipefds{ - struct { - int pipefd[2]; - }; - struct { - int readfd; - int writefd; - }; -}; - -/** - * union buffer for reading on different devices - */ -union rte_intr_read_buffer { - int uio_intr_count; /* for uio device */ -#ifdef VFIO_PRESENT - uint64_t vfio_intr_count; /* for vfio device */ -#endif - uint64_t timerfd_num; /* for timerfd */ - char charbuf[16]; /* for others */ -}; - -TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback); -TAILQ_HEAD(rte_intr_source_list, rte_intr_source); - -struct rte_intr_callback { - TAILQ_ENTRY(rte_intr_callback) next; - rte_intr_callback_fn cb_fn; /**< callback address */ - void *cb_arg; /**< parameter for callback */ -}; - -struct rte_intr_source { - TAILQ_ENTRY(rte_intr_source) next; - struct rte_intr_handle intr_handle; /**< interrupt handle */ - struct rte_intr_cb_list callbacks; /**< user callbacks */ - uint32_t active; -}; - -/* global spinlock for interrupt data operation */ -static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER; - -/* union buffer for pipe read/write */ -static union intr_pipefds intr_pipe; - -/* interrupt sources list */ -static struct rte_intr_source_list intr_sources; - -/* interrupt handling thread */ -static pthread_t intr_thread; - -/* VFIO 
interrupts */ -#ifdef VFIO_PRESENT - -#define IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + sizeof(int)) -/* irq set buffer length for queue interrupts and LSC interrupt */ -#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \ - sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1)) - -/* enable legacy (INTx) interrupts */ -static int -vfio_enable_intx(const struct rte_intr_handle *intr_handle) { - struct vfio_irq_set *irq_set; - char irq_set_buf[IRQ_SET_BUF_LEN]; - int len, ret; - int *fd_ptr; - - len = sizeof(irq_set_buf); - - /* enable INTx */ - irq_set = (struct vfio_irq_set *) irq_set_buf; - irq_set->argsz = len; - irq_set->count = 1; - irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; - irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; - irq_set->start = 0; - fd_ptr = (int *) &irq_set->data; - *fd_ptr = intr_handle->fd; - - ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); - - if (ret) { - RTE_LOG(ERR, EAL, "Error enabling INTx interrupts for fd %d\n", - intr_handle->fd); - return -1; - } - - /* unmask INTx after enabling */ - memset(irq_set, 0, len); - len = sizeof(struct vfio_irq_set); - irq_set->argsz = len; - irq_set->count = 1; - irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK; - irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; - irq_set->start = 0; - - ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); - - if (ret) { - RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n", - intr_handle->fd); - return -1; - } - return 0; -} - -/* disable legacy (INTx) interrupts */ -static int -vfio_disable_intx(const struct rte_intr_handle *intr_handle) { - struct vfio_irq_set *irq_set; - char irq_set_buf[IRQ_SET_BUF_LEN]; - int len, ret; - - len = sizeof(struct vfio_irq_set); - - /* mask interrupts before disabling */ - irq_set = (struct vfio_irq_set *) irq_set_buf; - irq_set->argsz = len; - irq_set->count = 1; - irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK; - 
irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; - irq_set->start = 0; - - ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); - - if (ret) { - RTE_LOG(ERR, EAL, "Error masking INTx interrupts for fd %d\n", - intr_handle->fd); - return -1; - } - - /* disable INTx*/ - memset(irq_set, 0, len); - irq_set->argsz = len; - irq_set->count = 0; - irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; - irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; - irq_set->start = 0; - - ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); - - if (ret) { - RTE_LOG(ERR, EAL, - "Error disabling INTx interrupts for fd %d\n", intr_handle->fd); - return -1; - } - return 0; -} - -/* enable MSI interrupts */ -static int -vfio_enable_msi(const struct rte_intr_handle *intr_handle) { - int len, ret; - char irq_set_buf[IRQ_SET_BUF_LEN]; - struct vfio_irq_set *irq_set; - int *fd_ptr; - - len = sizeof(irq_set_buf); - - irq_set = (struct vfio_irq_set *) irq_set_buf; - irq_set->argsz = len; - irq_set->count = 1; - irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; - irq_set->index = VFIO_PCI_MSI_IRQ_INDEX; - irq_set->start = 0; - fd_ptr = (int *) &irq_set->data; - *fd_ptr = intr_handle->fd; - - ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); - - if (ret) { - RTE_LOG(ERR, EAL, "Error enabling MSI interrupts for fd %d\n", - intr_handle->fd); - return -1; - } - return 0; -} - -/* disable MSI interrupts */ -static int -vfio_disable_msi(const struct rte_intr_handle *intr_handle) { - struct vfio_irq_set *irq_set; - char irq_set_buf[IRQ_SET_BUF_LEN]; - int len, ret; - - len = sizeof(struct vfio_irq_set); - - irq_set = (struct vfio_irq_set *) irq_set_buf; - irq_set->argsz = len; - irq_set->count = 0; - irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; - irq_set->index = VFIO_PCI_MSI_IRQ_INDEX; - irq_set->start = 0; - - ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); - - if (ret) - 
RTE_LOG(ERR, EAL, - "Error disabling MSI interrupts for fd %d\n", intr_handle->fd); - - return ret; -} - -/* enable MSI-X interrupts */ -static int -vfio_enable_msix(const struct rte_intr_handle *intr_handle) { - int len, ret; - char irq_set_buf[MSIX_IRQ_SET_BUF_LEN]; - struct vfio_irq_set *irq_set; - int *fd_ptr; - - len = sizeof(irq_set_buf); - - irq_set = (struct vfio_irq_set *) irq_set_buf; - irq_set->argsz = len; - /* 0 < irq_set->count < RTE_MAX_RXTX_INTR_VEC_ID + 1 */ - irq_set->count = intr_handle->max_intr ? - (intr_handle->max_intr > RTE_MAX_RXTX_INTR_VEC_ID + 1 ? - RTE_MAX_RXTX_INTR_VEC_ID + 1 : intr_handle->max_intr) : 1; - irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; - irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; - irq_set->start = 0; - fd_ptr = (int *) &irq_set->data; - /* INTR vector offset 0 reserve for non-efds mapping */ - fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = intr_handle->fd; - memcpy(&fd_ptr[RTE_INTR_VEC_RXTX_OFFSET], intr_handle->efds, - sizeof(*intr_handle->efds) * intr_handle->nb_efd); - - ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); - - if (ret) { - RTE_LOG(ERR, EAL, "Error enabling MSI-X interrupts for fd %d\n", - intr_handle->fd); - return -1; - } - - return 0; -} - -/* disable MSI-X interrupts */ -static int -vfio_disable_msix(const struct rte_intr_handle *intr_handle) { - struct vfio_irq_set *irq_set; - char irq_set_buf[MSIX_IRQ_SET_BUF_LEN]; - int len, ret; - - len = sizeof(struct vfio_irq_set); - - irq_set = (struct vfio_irq_set *) irq_set_buf; - irq_set->argsz = len; - irq_set->count = 0; - irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; - irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; - irq_set->start = 0; - - ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); - - if (ret) - RTE_LOG(ERR, EAL, - "Error disabling MSI-X interrupts for fd %d\n", intr_handle->fd); - - return ret; -} - -#ifdef HAVE_VFIO_DEV_REQ_INTERFACE -/* enable req notifier */ -static 
int -vfio_enable_req(const struct rte_intr_handle *intr_handle) -{ - int len, ret; - char irq_set_buf[IRQ_SET_BUF_LEN]; - struct vfio_irq_set *irq_set; - int *fd_ptr; - - len = sizeof(irq_set_buf); - - irq_set = (struct vfio_irq_set *) irq_set_buf; - irq_set->argsz = len; - irq_set->count = 1; - irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | - VFIO_IRQ_SET_ACTION_TRIGGER; - irq_set->index = VFIO_PCI_REQ_IRQ_INDEX; - irq_set->start = 0; - fd_ptr = (int *) &irq_set->data; - *fd_ptr = intr_handle->fd; - - ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); - - if (ret) { - RTE_LOG(ERR, EAL, "Error enabling req interrupts for fd %d\n", - intr_handle->fd); - return -1; - } - - return 0; -} - -/* disable req notifier */ -static int -vfio_disable_req(const struct rte_intr_handle *intr_handle) -{ - struct vfio_irq_set *irq_set; - char irq_set_buf[IRQ_SET_BUF_LEN]; - int len, ret; - - len = sizeof(struct vfio_irq_set); - - irq_set = (struct vfio_irq_set *) irq_set_buf; - irq_set->argsz = len; - irq_set->count = 0; - irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; - irq_set->index = VFIO_PCI_REQ_IRQ_INDEX; - irq_set->start = 0; - - ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); - - if (ret) - RTE_LOG(ERR, EAL, "Error disabling req interrupts for fd %d\n", - intr_handle->fd); - - return ret; -} -#endif -#endif - -static int -uio_intx_intr_disable(const struct rte_intr_handle *intr_handle) -{ - unsigned char command_high; - - /* use UIO config file descriptor for uio_pci_generic */ - if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) { - RTE_LOG(ERR, EAL, - "Error reading interrupts status for fd %d\n", - intr_handle->uio_cfg_fd); - return -1; - } - /* disable interrupts */ - command_high |= 0x4; - if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) { - RTE_LOG(ERR, EAL, - "Error disabling interrupts for fd %d\n", - intr_handle->uio_cfg_fd); - return -1; - } - - return 0; -} - -static int 
-uio_intx_intr_enable(const struct rte_intr_handle *intr_handle) -{ - unsigned char command_high; - - /* use UIO config file descriptor for uio_pci_generic */ - if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) { - RTE_LOG(ERR, EAL, - "Error reading interrupts status for fd %d\n", - intr_handle->uio_cfg_fd); - return -1; - } - /* enable interrupts */ - command_high &= ~0x4; - if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) { - RTE_LOG(ERR, EAL, - "Error enabling interrupts for fd %d\n", - intr_handle->uio_cfg_fd); - return -1; - } - - return 0; -} - -static int -uio_intr_disable(const struct rte_intr_handle *intr_handle) -{ - const int value = 0; - - if (write(intr_handle->fd, &value, sizeof(value)) < 0) { - RTE_LOG(ERR, EAL, - "Error disabling interrupts for fd %d (%s)\n", - intr_handle->fd, strerror(errno)); - return -1; - } - return 0; -} - -static int -uio_intr_enable(const struct rte_intr_handle *intr_handle) -{ - const int value = 1; - - if (write(intr_handle->fd, &value, sizeof(value)) < 0) { - RTE_LOG(ERR, EAL, - "Error enabling interrupts for fd %d (%s)\n", - intr_handle->fd, strerror(errno)); - return -1; - } - return 0; -} - -int -rte_intr_callback_register(const struct rte_intr_handle *intr_handle, - rte_intr_callback_fn cb, void *cb_arg) -{ - int ret, wake_thread; - struct rte_intr_source *src; - struct rte_intr_callback *callback; - - wake_thread = 0; - - /* first do parameter checking */ - if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) { - RTE_LOG(ERR, EAL, - "Registering with invalid input parameter\n"); - return -EINVAL; - } - - /* allocate a new interrupt callback entity */ - callback = calloc(1, sizeof(*callback)); - if (callback == NULL) { - RTE_LOG(ERR, EAL, "Can not allocate memory\n"); - return -ENOMEM; - } - callback->cb_fn = cb; - callback->cb_arg = cb_arg; - - rte_spinlock_lock(&intr_lock); - - /* check if there is at least one callback registered for the fd */ - TAILQ_FOREACH(src, &intr_sources, 
next) { - if (src->intr_handle.fd == intr_handle->fd) { - /* we had no interrupts for this */ - if (TAILQ_EMPTY(&src->callbacks)) - wake_thread = 1; - - TAILQ_INSERT_TAIL(&(src->callbacks), callback, next); - ret = 0; - break; - } - } - - /* no existing callbacks for this - add new source */ - if (src == NULL) { - src = calloc(1, sizeof(*src)); - if (src == NULL) { - RTE_LOG(ERR, EAL, "Can not allocate memory\n"); - free(callback); - ret = -ENOMEM; - } else { - src->intr_handle = *intr_handle; - TAILQ_INIT(&src->callbacks); - TAILQ_INSERT_TAIL(&(src->callbacks), callback, next); - TAILQ_INSERT_TAIL(&intr_sources, src, next); - wake_thread = 1; - ret = 0; - } - } - - rte_spinlock_unlock(&intr_lock); - - /** - * check if need to notify the pipe fd waited by epoll_wait to - * rebuild the wait list. - */ - if (wake_thread) - if (write(intr_pipe.writefd, "1", 1) < 0) - return -EPIPE; - - return ret; -} - -int -rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle, - rte_intr_callback_fn cb_fn, void *cb_arg) -{ - int ret; - struct rte_intr_source *src; - struct rte_intr_callback *cb, *next; - - /* do parameter checking first */ - if (intr_handle == NULL || intr_handle->fd < 0) { - RTE_LOG(ERR, EAL, - "Unregistering with invalid input parameter\n"); - return -EINVAL; - } - - rte_spinlock_lock(&intr_lock); - - /* check if the insterrupt source for the fd is existent */ - TAILQ_FOREACH(src, &intr_sources, next) - if (src->intr_handle.fd == intr_handle->fd) - break; - - /* No interrupt source registered for the fd */ - if (src == NULL) { - ret = -ENOENT; - - /* interrupt source has some active callbacks right now. */ - } else if (src->active != 0) { - ret = -EAGAIN; - - /* ok to remove. */ - } else { - ret = 0; - - /*walk through the callbacks and remove all that match. 
*/ - for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) { - - next = TAILQ_NEXT(cb, next); - - if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 || - cb->cb_arg == cb_arg)) { - TAILQ_REMOVE(&src->callbacks, cb, next); - free(cb); - ret++; - } - } - - /* all callbacks for that source are removed. */ - if (TAILQ_EMPTY(&src->callbacks)) { - TAILQ_REMOVE(&intr_sources, src, next); - free(src); - } - } - - rte_spinlock_unlock(&intr_lock); - - /* notify the pipe fd waited by epoll_wait to rebuild the wait list */ - if (ret >= 0 && write(intr_pipe.writefd, "1", 1) < 0) { - ret = -EPIPE; - } - - return ret; -} - -int -rte_intr_enable(const struct rte_intr_handle *intr_handle) -{ - if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV) - return 0; - - if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) - return -1; - - switch (intr_handle->type){ - /* write to the uio fd to enable the interrupt */ - case RTE_INTR_HANDLE_UIO: - if (uio_intr_enable(intr_handle)) - return -1; - break; - case RTE_INTR_HANDLE_UIO_INTX: - if (uio_intx_intr_enable(intr_handle)) - return -1; - break; - /* not used at this moment */ - case RTE_INTR_HANDLE_ALARM: - return -1; -#ifdef VFIO_PRESENT - case RTE_INTR_HANDLE_VFIO_MSIX: - if (vfio_enable_msix(intr_handle)) - return -1; - break; - case RTE_INTR_HANDLE_VFIO_MSI: - if (vfio_enable_msi(intr_handle)) - return -1; - break; - case RTE_INTR_HANDLE_VFIO_LEGACY: - if (vfio_enable_intx(intr_handle)) - return -1; - break; -#ifdef HAVE_VFIO_DEV_REQ_INTERFACE - case RTE_INTR_HANDLE_VFIO_REQ: - if (vfio_enable_req(intr_handle)) - return -1; - break; -#endif -#endif - /* not used at this moment */ - case RTE_INTR_HANDLE_DEV_EVENT: - return -1; - /* unknown handle type */ - default: - RTE_LOG(ERR, EAL, - "Unknown handle type of fd %d\n", - intr_handle->fd); - return -1; - } - - return 0; -} - -int -rte_intr_disable(const struct rte_intr_handle *intr_handle) -{ - if (intr_handle && intr_handle->type == 
RTE_INTR_HANDLE_VDEV) - return 0; - - if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) - return -1; - - switch (intr_handle->type){ - /* write to the uio fd to disable the interrupt */ - case RTE_INTR_HANDLE_UIO: - if (uio_intr_disable(intr_handle)) - return -1; - break; - case RTE_INTR_HANDLE_UIO_INTX: - if (uio_intx_intr_disable(intr_handle)) - return -1; - break; - /* not used at this moment */ - case RTE_INTR_HANDLE_ALARM: - return -1; -#ifdef VFIO_PRESENT - case RTE_INTR_HANDLE_VFIO_MSIX: - if (vfio_disable_msix(intr_handle)) - return -1; - break; - case RTE_INTR_HANDLE_VFIO_MSI: - if (vfio_disable_msi(intr_handle)) - return -1; - break; - case RTE_INTR_HANDLE_VFIO_LEGACY: - if (vfio_disable_intx(intr_handle)) - return -1; - break; -#ifdef HAVE_VFIO_DEV_REQ_INTERFACE - case RTE_INTR_HANDLE_VFIO_REQ: - if (vfio_disable_req(intr_handle)) - return -1; - break; -#endif -#endif - /* not used at this moment */ - case RTE_INTR_HANDLE_DEV_EVENT: - return -1; - /* unknown handle type */ - default: - RTE_LOG(ERR, EAL, - "Unknown handle type of fd %d\n", - intr_handle->fd); - return -1; - } - - return 0; -} - -static int -eal_intr_process_interrupts(struct epoll_event *events, int nfds) -{ - bool call = false; - int n, bytes_read; - struct rte_intr_source *src; - struct rte_intr_callback *cb, *next; - union rte_intr_read_buffer buf; - struct rte_intr_callback active_cb; - - for (n = 0; n < nfds; n++) { - - /** - * if the pipe fd is ready to read, return out to - * rebuild the wait list. - */ - if (events[n].data.fd == intr_pipe.readfd){ - int r = read(intr_pipe.readfd, buf.charbuf, - sizeof(buf.charbuf)); - RTE_SET_USED(r); - return -1; - } - rte_spinlock_lock(&intr_lock); - TAILQ_FOREACH(src, &intr_sources, next) - if (src->intr_handle.fd == - events[n].data.fd) - break; - if (src == NULL){ - rte_spinlock_unlock(&intr_lock); - continue; - } - - /* mark this interrupt source as active and release the lock. 
*/ - src->active = 1; - rte_spinlock_unlock(&intr_lock); - - /* set the length to be read dor different handle type */ - switch (src->intr_handle.type) { - case RTE_INTR_HANDLE_UIO: - case RTE_INTR_HANDLE_UIO_INTX: - bytes_read = sizeof(buf.uio_intr_count); - break; - case RTE_INTR_HANDLE_ALARM: - bytes_read = sizeof(buf.timerfd_num); - break; -#ifdef VFIO_PRESENT - case RTE_INTR_HANDLE_VFIO_MSIX: - case RTE_INTR_HANDLE_VFIO_MSI: - case RTE_INTR_HANDLE_VFIO_LEGACY: - bytes_read = sizeof(buf.vfio_intr_count); - break; -#ifdef HAVE_VFIO_DEV_REQ_INTERFACE - case RTE_INTR_HANDLE_VFIO_REQ: - bytes_read = 0; - call = true; - break; -#endif -#endif - case RTE_INTR_HANDLE_VDEV: - case RTE_INTR_HANDLE_EXT: - bytes_read = 0; - call = true; - break; - case RTE_INTR_HANDLE_DEV_EVENT: - bytes_read = 0; - call = true; - break; - default: - bytes_read = 1; - break; - } - - if (bytes_read > 0) { - /** - * read out to clear the ready-to-be-read flag - * for epoll_wait. - */ - bytes_read = read(events[n].data.fd, &buf, bytes_read); - if (bytes_read < 0) { - if (errno == EINTR || errno == EWOULDBLOCK) - continue; - - RTE_LOG(ERR, EAL, "Error reading from file " - "descriptor %d: %s\n", - events[n].data.fd, - strerror(errno)); - /* - * The device is unplugged or buggy, remove - * it as an interrupt source and return to - * force the wait list to be rebuilt. - */ - rte_spinlock_lock(&intr_lock); - TAILQ_REMOVE(&intr_sources, src, next); - rte_spinlock_unlock(&intr_lock); - - for (cb = TAILQ_FIRST(&src->callbacks); cb; - cb = next) { - next = TAILQ_NEXT(cb, next); - TAILQ_REMOVE(&src->callbacks, cb, next); - free(cb); - } - free(src); - return -1; - } else if (bytes_read == 0) - RTE_LOG(ERR, EAL, "Read nothing from file " - "descriptor %d\n", events[n].data.fd); - else - call = true; - } - - /* grab a lock, again to call callbacks and update status. */ - rte_spinlock_lock(&intr_lock); - - if (call) { - - /* Finally, call all callbacks. 
*/ - TAILQ_FOREACH(cb, &src->callbacks, next) { - - /* make a copy and unlock. */ - active_cb = *cb; - rte_spinlock_unlock(&intr_lock); - - /* call the actual callback */ - active_cb.cb_fn(active_cb.cb_arg); - - /*get the lock back. */ - rte_spinlock_lock(&intr_lock); - } - } - - /* we done with that interrupt source, release it. */ - src->active = 0; - rte_spinlock_unlock(&intr_lock); - } - - return 0; -} - -/** - * It handles all the interrupts. - * - * @param pfd - * epoll file descriptor. - * @param totalfds - * The number of file descriptors added in epoll. - * - * @return - * void - */ -static void -eal_intr_handle_interrupts(int pfd, unsigned totalfds) -{ - struct epoll_event events[totalfds]; - int nfds = 0; - - for(;;) { - nfds = epoll_wait(pfd, events, totalfds, - EAL_INTR_EPOLL_WAIT_FOREVER); - /* epoll_wait fail */ - if (nfds < 0) { - if (errno == EINTR) - continue; - RTE_LOG(ERR, EAL, - "epoll_wait returns with fail\n"); - return; - } - /* epoll_wait timeout, will never happens here */ - else if (nfds == 0) - continue; - /* epoll_wait has at least one fd ready to read */ - if (eal_intr_process_interrupts(events, nfds) < 0) - return; - } -} - -/** - * It builds/rebuilds up the epoll file descriptor with all the - * file descriptors being waited on. Then handles the interrupts. - * - * @param arg - * pointer. 
(unused) - * - * @return - * never return; - */ -static __attribute__((noreturn)) void * -eal_intr_thread_main(__rte_unused void *arg) -{ - struct epoll_event ev; - - /* host thread, never break out */ - for (;;) { - /* build up the epoll fd with all descriptors we are to - * wait on then pass it to the handle_interrupts function - */ - static struct epoll_event pipe_event = { - .events = EPOLLIN | EPOLLPRI, - }; - struct rte_intr_source *src; - unsigned numfds = 0; - - /* create epoll fd */ - int pfd = epoll_create(1); - if (pfd < 0) - rte_panic("Cannot create epoll instance\n"); - - pipe_event.data.fd = intr_pipe.readfd; - /** - * add pipe fd into wait list, this pipe is used to - * rebuild the wait list. - */ - if (epoll_ctl(pfd, EPOLL_CTL_ADD, intr_pipe.readfd, - &pipe_event) < 0) { - rte_panic("Error adding fd to %d epoll_ctl, %s\n", - intr_pipe.readfd, strerror(errno)); - } - numfds++; - - rte_spinlock_lock(&intr_lock); - - TAILQ_FOREACH(src, &intr_sources, next) { - if (src->callbacks.tqh_first == NULL) - continue; /* skip those with no callbacks */ - ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP; - ev.data.fd = src->intr_handle.fd; - - /** - * add all the uio device file descriptor - * into wait list. - */ - if (epoll_ctl(pfd, EPOLL_CTL_ADD, - src->intr_handle.fd, &ev) < 0){ - rte_panic("Error adding fd %d epoll_ctl, %s\n", - src->intr_handle.fd, strerror(errno)); - } - else - numfds++; - } - rte_spinlock_unlock(&intr_lock); - /* serve the interrupt */ - eal_intr_handle_interrupts(pfd, numfds); - - /** - * when we return, we need to rebuild the - * list of fds to monitor. - */ - close(pfd); - } -} - -int -rte_eal_intr_init(void) -{ - int ret = 0; - - /* init the global interrupt source head */ - TAILQ_INIT(&intr_sources); - - /** - * create a pipe which will be waited by epoll and notified to - * rebuild the wait list of epoll. 
- */ - if (pipe(intr_pipe.pipefd) < 0) { - rte_errno = errno; - return -1; - } - - /* create the host thread to wait/handle the interrupt */ - ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL, - eal_intr_thread_main, NULL); - if (ret != 0) { - rte_errno = -ret; - RTE_LOG(ERR, EAL, - "Failed to create thread for interrupt handling\n"); - } - - return ret; -} - -static void -eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle) -{ - union rte_intr_read_buffer buf; - int bytes_read = 0; - int nbytes; - - switch (intr_handle->type) { - case RTE_INTR_HANDLE_UIO: - case RTE_INTR_HANDLE_UIO_INTX: - bytes_read = sizeof(buf.uio_intr_count); - break; -#ifdef VFIO_PRESENT - case RTE_INTR_HANDLE_VFIO_MSIX: - case RTE_INTR_HANDLE_VFIO_MSI: - case RTE_INTR_HANDLE_VFIO_LEGACY: - bytes_read = sizeof(buf.vfio_intr_count); - break; -#endif - case RTE_INTR_HANDLE_VDEV: - bytes_read = intr_handle->efd_counter_size; - /* For vdev, number of bytes to read is set by driver */ - break; - case RTE_INTR_HANDLE_EXT: - return; - default: - bytes_read = 1; - RTE_LOG(INFO, EAL, "unexpected intr type\n"); - break; - } - - /** - * read out to clear the ready-to-be-read flag - * for epoll_wait. 
- */ - if (bytes_read == 0) - return; - do { - nbytes = read(fd, &buf, bytes_read); - if (nbytes < 0) { - if (errno == EINTR || errno == EWOULDBLOCK || - errno == EAGAIN) - continue; - RTE_LOG(ERR, EAL, - "Error reading from fd %d: %s\n", - fd, strerror(errno)); - } else if (nbytes == 0) - RTE_LOG(ERR, EAL, "Read nothing from fd %d\n", fd); - return; - } while (1); -} - -static int -eal_epoll_process_event(struct epoll_event *evs, unsigned int n, - struct rte_epoll_event *events) -{ - unsigned int i, count = 0; - struct rte_epoll_event *rev; - - for (i = 0; i < n; i++) { - rev = evs[i].data.ptr; - if (!rev || !rte_atomic32_cmpset(&rev->status, RTE_EPOLL_VALID, - RTE_EPOLL_EXEC)) - continue; - - events[count].status = RTE_EPOLL_VALID; - events[count].fd = rev->fd; - events[count].epfd = rev->epfd; - events[count].epdata.event = rev->epdata.event; - events[count].epdata.data = rev->epdata.data; - if (rev->epdata.cb_fun) - rev->epdata.cb_fun(rev->fd, - rev->epdata.cb_arg); - - rte_compiler_barrier(); - rev->status = RTE_EPOLL_VALID; - count++; - } - return count; -} - -static inline int -eal_init_tls_epfd(void) -{ - int pfd = epoll_create(255); - - if (pfd < 0) { - RTE_LOG(ERR, EAL, - "Cannot create epoll instance\n"); - return -1; - } - return pfd; -} - -int -rte_intr_tls_epfd(void) -{ - if (RTE_PER_LCORE(_epfd) == -1) - RTE_PER_LCORE(_epfd) = eal_init_tls_epfd(); - - return RTE_PER_LCORE(_epfd); -} - -int -rte_epoll_wait(int epfd, struct rte_epoll_event *events, - int maxevents, int timeout) -{ - struct epoll_event evs[maxevents]; - int rc; - - if (!events) { - RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n"); - return -1; - } - - /* using per thread epoll fd */ - if (epfd == RTE_EPOLL_PER_THREAD) - epfd = rte_intr_tls_epfd(); - - while (1) { - rc = epoll_wait(epfd, evs, maxevents, timeout); - if (likely(rc > 0)) { - /* epoll_wait has at least one fd ready to read */ - rc = eal_epoll_process_event(evs, rc, events); - break; - } else if (rc < 0) { - if (errno == 
EINTR) - continue; - /* epoll_wait fail */ - RTE_LOG(ERR, EAL, "epoll_wait returns with fail %s\n", - strerror(errno)); - rc = -1; - break; - } else { - /* rc == 0, epoll_wait timed out */ - break; - } - } - - return rc; -} - -static inline void -eal_epoll_data_safe_free(struct rte_epoll_event *ev) -{ - while (!rte_atomic32_cmpset(&ev->status, RTE_EPOLL_VALID, - RTE_EPOLL_INVALID)) - while (ev->status != RTE_EPOLL_VALID) - rte_pause(); - memset(&ev->epdata, 0, sizeof(ev->epdata)); - ev->fd = -1; - ev->epfd = -1; -} - -int -rte_epoll_ctl(int epfd, int op, int fd, - struct rte_epoll_event *event) -{ - struct epoll_event ev; - - if (!event) { - RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n"); - return -1; - } - - /* using per thread epoll fd */ - if (epfd == RTE_EPOLL_PER_THREAD) - epfd = rte_intr_tls_epfd(); - - if (op == EPOLL_CTL_ADD) { - event->status = RTE_EPOLL_VALID; - event->fd = fd; /* ignore fd in event */ - event->epfd = epfd; - ev.data.ptr = (void *)event; - } - - ev.events = event->epdata.event; - if (epoll_ctl(epfd, op, fd, &ev) < 0) { - RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n", - op, fd, strerror(errno)); - if (op == EPOLL_CTL_ADD) - /* rollback status when CTL_ADD fail */ - event->status = RTE_EPOLL_INVALID; - return -1; - } - - if (op == EPOLL_CTL_DEL && event->status != RTE_EPOLL_INVALID) - eal_epoll_data_safe_free(event); - - return 0; -} - -int -rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd, - int op, unsigned int vec, void *data) -{ - struct rte_epoll_event *rev; - struct rte_epoll_data *epdata; - int epfd_op; - unsigned int efd_idx; - int rc = 0; - - efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ? 
- (vec - RTE_INTR_VEC_RXTX_OFFSET) : vec; - - if (!intr_handle || intr_handle->nb_efd == 0 || - efd_idx >= intr_handle->nb_efd) { - RTE_LOG(ERR, EAL, "Wrong intr vector number.\n"); - return -EPERM; - } - - switch (op) { - case RTE_INTR_EVENT_ADD: - epfd_op = EPOLL_CTL_ADD; - rev = &intr_handle->elist[efd_idx]; - if (rev->status != RTE_EPOLL_INVALID) { - RTE_LOG(INFO, EAL, "Event already been added.\n"); - return -EEXIST; - } - - /* attach to intr vector fd */ - epdata = &rev->epdata; - epdata->event = EPOLLIN | EPOLLPRI | EPOLLET; - epdata->data = data; - epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr; - epdata->cb_arg = (void *)intr_handle; - rc = rte_epoll_ctl(epfd, epfd_op, - intr_handle->efds[efd_idx], rev); - if (!rc) - RTE_LOG(DEBUG, EAL, - "efd %d associated with vec %d added on epfd %d" - "\n", rev->fd, vec, epfd); - else - rc = -EPERM; - break; - case RTE_INTR_EVENT_DEL: - epfd_op = EPOLL_CTL_DEL; - rev = &intr_handle->elist[efd_idx]; - if (rev->status == RTE_EPOLL_INVALID) { - RTE_LOG(INFO, EAL, "Event does not exist.\n"); - return -EPERM; - } - - rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev); - if (rc) - rc = -EPERM; - break; - default: - RTE_LOG(ERR, EAL, "event op type mismatch\n"); - rc = -EPERM; - } - - return rc; -} - -void -rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle) -{ - uint32_t i; - struct rte_epoll_event *rev; - - for (i = 0; i < intr_handle->nb_efd; i++) { - rev = &intr_handle->elist[i]; - if (rev->status == RTE_EPOLL_INVALID) - continue; - if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) { - /* force free if the entry valid */ - eal_epoll_data_safe_free(rev); - rev->status = RTE_EPOLL_INVALID; - } - } -} - -int -rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd) -{ - uint32_t i; - int fd; - uint32_t n = RTE_MIN(nb_efd, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID); - - assert(nb_efd != 0); - - if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX) { - for (i = 0; i < n; i++) { - fd = 
eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); - if (fd < 0) { - RTE_LOG(ERR, EAL, - "can't setup eventfd, error %i (%s)\n", - errno, strerror(errno)); - return -errno; - } - intr_handle->efds[i] = fd; - } - intr_handle->nb_efd = n; - intr_handle->max_intr = NB_OTHER_INTR + n; - } else if (intr_handle->type == RTE_INTR_HANDLE_VDEV) { - /* only check, initialization would be done in vdev driver.*/ - if (intr_handle->efd_counter_size > - sizeof(union rte_intr_read_buffer)) { - RTE_LOG(ERR, EAL, "the efd_counter_size is oversized"); - return -EINVAL; - } - } else { - intr_handle->efds[0] = intr_handle->fd; - intr_handle->nb_efd = RTE_MIN(nb_efd, 1U); - intr_handle->max_intr = NB_OTHER_INTR; - } - - return 0; -} - -void -rte_intr_efd_disable(struct rte_intr_handle *intr_handle) -{ - uint32_t i; - - rte_intr_free_epoll_fd(intr_handle); - if (intr_handle->max_intr > intr_handle->nb_efd) { - for (i = 0; i < intr_handle->nb_efd; i++) - close(intr_handle->efds[i]); - } - intr_handle->nb_efd = 0; - intr_handle->max_intr = 0; -} - -int -rte_intr_dp_is_en(struct rte_intr_handle *intr_handle) -{ - return !(!intr_handle->nb_efd); -} - -int -rte_intr_allow_others(struct rte_intr_handle *intr_handle) -{ - if (!rte_intr_dp_is_en(intr_handle)) - return 1; - else - return !!(intr_handle->max_intr - intr_handle->nb_efd); -} - -int -rte_intr_cap_multiple(struct rte_intr_handle *intr_handle) -{ - if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX) - return 1; - - if (intr_handle->type == RTE_INTR_HANDLE_VDEV) - return 1; - - return 0; -} diff --git a/lib/librte_eal/linuxapp/eal/eal_lcore.c b/lib/librte_eal/linuxapp/eal/eal_lcore.c deleted file mode 100644 index bc8965844c..0000000000 --- a/lib/librte_eal/linuxapp/eal/eal_lcore.c +++ /dev/null @@ -1,81 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2014 Intel Corporation - */ - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "eal_private.h" -#include 
"eal_filesystem.h" -#include "eal_thread.h" - -#define SYS_CPU_DIR "/sys/devices/system/cpu/cpu%u" -#define CORE_ID_FILE "topology/core_id" -#define NUMA_NODE_PATH "/sys/devices/system/node" - -/* Check if a cpu is present by the presence of the cpu information for it */ -int -eal_cpu_detected(unsigned lcore_id) -{ - char path[PATH_MAX]; - int len = snprintf(path, sizeof(path), SYS_CPU_DIR - "/"CORE_ID_FILE, lcore_id); - if (len <= 0 || (unsigned)len >= sizeof(path)) - return 0; - if (access(path, F_OK) != 0) - return 0; - - return 1; -} - -/* - * Get CPU socket id (NUMA node) for a logical core. - * - * This searches each nodeX directories in /sys for the symlink for the given - * lcore_id and returns the numa node where the lcore is found. If lcore is not - * found on any numa node, returns zero. - */ -unsigned -eal_cpu_socket_id(unsigned lcore_id) -{ - unsigned socket; - - for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) { - char path[PATH_MAX]; - - snprintf(path, sizeof(path), "%s/node%u/cpu%u", NUMA_NODE_PATH, - socket, lcore_id); - if (access(path, F_OK) == 0) - return socket; - } - return 0; -} - -/* Get the cpu core id value from the /sys/.../cpuX core_id value */ -unsigned -eal_cpu_core_id(unsigned lcore_id) -{ - char path[PATH_MAX]; - unsigned long id; - - int len = snprintf(path, sizeof(path), SYS_CPU_DIR "/%s", lcore_id, CORE_ID_FILE); - if (len <= 0 || (unsigned)len >= sizeof(path)) - goto err; - if (eal_parse_sysfs_value(path, &id) != 0) - goto err; - return (unsigned)id; - -err: - RTE_LOG(ERR, EAL, "Error reading core id value from %s " - "for lcore %u - assuming core 0\n", SYS_CPU_DIR, lcore_id); - return 0; -} diff --git a/lib/librte_eal/linuxapp/eal/eal_log.c b/lib/librte_eal/linuxapp/eal/eal_log.c deleted file mode 100644 index 9d02dddbed..0000000000 --- a/lib/librte_eal/linuxapp/eal/eal_log.c +++ /dev/null @@ -1,62 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2014 Intel Corporation - */ - -#include -#include 
-#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "eal_private.h" - -/* - * default log function - */ -static ssize_t -console_log_write(__attribute__((unused)) void *c, const char *buf, size_t size) -{ - ssize_t ret; - - /* write on stdout */ - ret = fwrite(buf, 1, size, stdout); - fflush(stdout); - - /* Syslog error levels are from 0 to 7, so subtract 1 to convert */ - syslog(rte_log_cur_msg_loglevel() - 1, "%.*s", (int)size, buf); - - return ret; -} - -static cookie_io_functions_t console_log_func = { - .write = console_log_write, -}; - -/* - * set the log to default function, called during eal init process, - * once memzones are available. - */ -int -rte_eal_log_init(const char *id, int facility) -{ - FILE *log_stream; - - log_stream = fopencookie(NULL, "w+", console_log_func); - if (log_stream == NULL) - return -1; - - openlog(id, LOG_NDELAY | LOG_PID, facility); - - eal_log_set_default(log_stream); - - return 0; -} diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c deleted file mode 100644 index b6fb183db4..0000000000 --- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c +++ /dev/null @@ -1,1685 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2017-2018 Intel Corporation - */ - -#define _FILE_OFFSET_BITS 64 -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */ -#include -#define MEMFD_SUPPORTED -#endif -#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES -#include -#include -#endif -#include -#include /* for hugetlb-related mmap flags */ - -#include -#include -#include -#include -#include -#include -#include - -#include "eal_filesystem.h" -#include "eal_internal_cfg.h" -#include "eal_memalloc.h" -#include "eal_private.h" - -const 
int anonymous_hugepages_supported = -#ifdef MAP_HUGE_SHIFT - 1; -#define RTE_MAP_HUGE_SHIFT MAP_HUGE_SHIFT -#else - 0; -#define RTE_MAP_HUGE_SHIFT 26 -#endif - -/* - * we've already checked memfd support at compile-time, but we also need to - * check if we can create hugepage files with memfd. - * - * also, this is not a constant, because while we may be *compiled* with memfd - * hugetlbfs support, we might not be *running* on a system that supports memfd - * and/or memfd with hugetlbfs, so we need to be able to adjust this flag at - * runtime, and fall back to anonymous memory. - */ -static int memfd_create_supported = -#ifdef MFD_HUGETLB - 1; -#define RTE_MFD_HUGETLB MFD_HUGETLB -#else - 0; -#define RTE_MFD_HUGETLB 4U -#endif - -/* - * not all kernel version support fallocate on hugetlbfs, so fall back to - * ftruncate and disallow deallocation if fallocate is not supported. - */ -static int fallocate_supported = -1; /* unknown */ - -/* - * we have two modes - single file segments, and file-per-page mode. - * - * for single-file segments, we need some kind of mechanism to keep track of - * which hugepages can be freed back to the system, and which cannot. we cannot - * use flock() because they don't allow locking parts of a file, and we cannot - * use fcntl() due to issues with their semantics, so we will have to rely on a - * bunch of lockfiles for each page. so, we will use 'fds' array to keep track - * of per-page lockfiles. we will store the actual segment list fd in the - * 'memseg_list_fd' field. - * - * for file-per-page mode, each page will have its own fd, so 'memseg_list_fd' - * will be invalid (set to -1), and we'll use 'fds' to keep track of page fd's. - * - * we cannot know how many pages a system will have in advance, but we do know - * that they come in lists, and we know lengths of these lists. so, simply store - * a malloc'd array of fd's indexed by list and segment index. 
- * - * they will be initialized at startup, and filled as we allocate/deallocate - * segments. - */ -static struct { - int *fds; /**< dynamically allocated array of segment lock fd's */ - int memseg_list_fd; /**< memseg list fd */ - int len; /**< total length of the array */ - int count; /**< entries used in an array */ -} fd_list[RTE_MAX_MEMSEG_LISTS]; - -/** local copy of a memory map, used to synchronize memory hotplug in MP */ -static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS]; - -static sigjmp_buf huge_jmpenv; - -static void __rte_unused huge_sigbus_handler(int signo __rte_unused) -{ - siglongjmp(huge_jmpenv, 1); -} - -/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile, - * non-static local variable in the stack frame calling sigsetjmp might be - * clobbered by a call to longjmp. - */ -static int __rte_unused huge_wrap_sigsetjmp(void) -{ - return sigsetjmp(huge_jmpenv, 1); -} - -static struct sigaction huge_action_old; -static int huge_need_recover; - -static void __rte_unused -huge_register_sigbus(void) -{ - sigset_t mask; - struct sigaction action; - - sigemptyset(&mask); - sigaddset(&mask, SIGBUS); - action.sa_flags = 0; - action.sa_mask = mask; - action.sa_handler = huge_sigbus_handler; - - huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old); -} - -static void __rte_unused -huge_recover_sigbus(void) -{ - if (huge_need_recover) { - sigaction(SIGBUS, &huge_action_old, NULL); - huge_need_recover = 0; - } -} - -#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES -static bool -check_numa(void) -{ - bool ret = true; - /* Check if kernel supports NUMA. 
*/ - if (numa_available() != 0) { - RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n"); - ret = false; - } - return ret; -} - -static void -prepare_numa(int *oldpolicy, struct bitmask *oldmask, int socket_id) -{ - RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n"); - if (get_mempolicy(oldpolicy, oldmask->maskp, - oldmask->size + 1, 0, 0) < 0) { - RTE_LOG(ERR, EAL, - "Failed to get current mempolicy: %s. " - "Assuming MPOL_DEFAULT.\n", strerror(errno)); - *oldpolicy = MPOL_DEFAULT; - } - RTE_LOG(DEBUG, EAL, - "Setting policy MPOL_PREFERRED for socket %d\n", - socket_id); - numa_set_preferred(socket_id); -} - -static void -restore_numa(int *oldpolicy, struct bitmask *oldmask) -{ - RTE_LOG(DEBUG, EAL, - "Restoring previous memory policy: %d\n", *oldpolicy); - if (*oldpolicy == MPOL_DEFAULT) { - numa_set_localalloc(); - } else if (set_mempolicy(*oldpolicy, oldmask->maskp, - oldmask->size + 1) < 0) { - RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n", - strerror(errno)); - numa_set_localalloc(); - } - numa_free_cpumask(oldmask); -} -#endif - -/* - * uses fstat to report the size of a file on disk - */ -static off_t -get_file_size(int fd) -{ - struct stat st; - if (fstat(fd, &st) < 0) - return 0; - return st.st_size; -} - -static int -pagesz_flags(uint64_t page_sz) -{ - /* as per mmap() manpage, all page sizes are log2 of page size - * shifted by MAP_HUGE_SHIFT - */ - int log2 = rte_log2_u64(page_sz); - return log2 << RTE_MAP_HUGE_SHIFT; -} - -/* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */ -static int lock(int fd, int type) -{ - int ret; - - /* flock may be interrupted */ - do { - ret = flock(fd, type | LOCK_NB); - } while (ret && errno == EINTR); - - if (ret && errno == EWOULDBLOCK) { - /* couldn't lock */ - return 0; - } else if (ret) { - RTE_LOG(ERR, EAL, "%s(): error calling flock(): %s\n", - __func__, strerror(errno)); - return -1; - } - /* lock was successful */ - return 1; -} - -static int get_segment_lock_fd(int list_idx, 
int seg_idx) -{ - char path[PATH_MAX] = {0}; - int fd; - - if (list_idx < 0 || list_idx >= (int)RTE_DIM(fd_list)) - return -1; - if (seg_idx < 0 || seg_idx >= fd_list[list_idx].len) - return -1; - - fd = fd_list[list_idx].fds[seg_idx]; - /* does this lock already exist? */ - if (fd >= 0) - return fd; - - eal_get_hugefile_lock_path(path, sizeof(path), - list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx); - - fd = open(path, O_CREAT | O_RDWR, 0660); - if (fd < 0) { - RTE_LOG(ERR, EAL, "%s(): error creating lockfile '%s': %s\n", - __func__, path, strerror(errno)); - return -1; - } - /* take out a read lock */ - if (lock(fd, LOCK_SH) != 1) { - RTE_LOG(ERR, EAL, "%s(): failed to take out a readlock on '%s': %s\n", - __func__, path, strerror(errno)); - close(fd); - return -1; - } - /* store it for future reference */ - fd_list[list_idx].fds[seg_idx] = fd; - fd_list[list_idx].count++; - return fd; -} - -static int unlock_segment(int list_idx, int seg_idx) -{ - int fd, ret; - - if (list_idx < 0 || list_idx >= (int)RTE_DIM(fd_list)) - return -1; - if (seg_idx < 0 || seg_idx >= fd_list[list_idx].len) - return -1; - - fd = fd_list[list_idx].fds[seg_idx]; - - /* upgrade lock to exclusive to see if we can remove the lockfile */ - ret = lock(fd, LOCK_EX); - if (ret == 1) { - /* we've succeeded in taking exclusive lock, this lockfile may - * be removed. - */ - char path[PATH_MAX] = {0}; - eal_get_hugefile_lock_path(path, sizeof(path), - list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx); - if (unlink(path)) { - RTE_LOG(ERR, EAL, "%s(): error removing lockfile '%s': %s\n", - __func__, path, strerror(errno)); - } - } - /* we don't want to leak the fd, so even if we fail to lock, close fd - * and remove it from list anyway. 
- */ - close(fd); - fd_list[list_idx].fds[seg_idx] = -1; - fd_list[list_idx].count--; - - if (ret < 0) - return -1; - return 0; -} - -static int -get_seg_memfd(struct hugepage_info *hi __rte_unused, - unsigned int list_idx __rte_unused, - unsigned int seg_idx __rte_unused) -{ -#ifdef MEMFD_SUPPORTED - int fd; - char segname[250]; /* as per manpage, limit is 249 bytes plus null */ - - int flags = RTE_MFD_HUGETLB | pagesz_flags(hi->hugepage_sz); - - if (internal_config.single_file_segments) { - fd = fd_list[list_idx].memseg_list_fd; - - if (fd < 0) { - snprintf(segname, sizeof(segname), "seg_%i", list_idx); - fd = memfd_create(segname, flags); - if (fd < 0) { - RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n", - __func__, strerror(errno)); - return -1; - } - fd_list[list_idx].memseg_list_fd = fd; - } - } else { - fd = fd_list[list_idx].fds[seg_idx]; - - if (fd < 0) { - snprintf(segname, sizeof(segname), "seg_%i-%i", - list_idx, seg_idx); - fd = memfd_create(segname, flags); - if (fd < 0) { - RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n", - __func__, strerror(errno)); - return -1; - } - fd_list[list_idx].fds[seg_idx] = fd; - } - } - return fd; -#endif - return -1; -} - -static int -get_seg_fd(char *path, int buflen, struct hugepage_info *hi, - unsigned int list_idx, unsigned int seg_idx) -{ - int fd; - - /* for in-memory mode, we only make it here when we're sure we support - * memfd, and this is a special case. 
- */ - if (internal_config.in_memory) - return get_seg_memfd(hi, list_idx, seg_idx); - - if (internal_config.single_file_segments) { - /* create a hugepage file path */ - eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx); - - fd = fd_list[list_idx].memseg_list_fd; - - if (fd < 0) { - fd = open(path, O_CREAT | O_RDWR, 0600); - if (fd < 0) { - RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", - __func__, strerror(errno)); - return -1; - } - /* take out a read lock and keep it indefinitely */ - if (lock(fd, LOCK_SH) < 0) { - RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n", - __func__, strerror(errno)); - close(fd); - return -1; - } - fd_list[list_idx].memseg_list_fd = fd; - } - } else { - /* create a hugepage file path */ - eal_get_hugefile_path(path, buflen, hi->hugedir, - list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx); - - fd = fd_list[list_idx].fds[seg_idx]; - - if (fd < 0) { - fd = open(path, O_CREAT | O_RDWR, 0600); - if (fd < 0) { - RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", - __func__, strerror(errno)); - return -1; - } - /* take out a read lock */ - if (lock(fd, LOCK_SH) < 0) { - RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n", - __func__, strerror(errno)); - close(fd); - return -1; - } - fd_list[list_idx].fds[seg_idx] = fd; - } - } - return fd; -} - -static int -resize_hugefile(int fd, char *path, int list_idx, int seg_idx, - uint64_t fa_offset, uint64_t page_sz, bool grow) -{ - bool again = false; - - /* in-memory mode is a special case, because we don't need to perform - * any locking, and we can be sure that fallocate() is supported. - */ - if (internal_config.in_memory) { - int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE | - FALLOC_FL_KEEP_SIZE; - int ret; - - /* grow or shrink the file */ - ret = fallocate(fd, flags, fa_offset, page_sz); - - if (ret < 0) { - RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n", - __func__, - strerror(errno)); - return -1; - } - /* increase/decrease total segment count */ - fd_list[list_idx].count += (grow ? 
1 : -1); - if (!grow && fd_list[list_idx].count == 0) { - close(fd_list[list_idx].memseg_list_fd); - fd_list[list_idx].memseg_list_fd = -1; - } - return 0; - } - - do { - if (fallocate_supported == 0) { - /* we cannot deallocate memory if fallocate() is not - * supported, and hugepage file is already locked at - * creation, so no further synchronization needed. - */ - - if (!grow) { - RTE_LOG(DEBUG, EAL, "%s(): fallocate not supported, not freeing page back to the system\n", - __func__); - return -1; - } - uint64_t new_size = fa_offset + page_sz; - uint64_t cur_size = get_file_size(fd); - - /* fallocate isn't supported, fall back to ftruncate */ - if (new_size > cur_size && - ftruncate(fd, new_size) < 0) { - RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n", - __func__, strerror(errno)); - return -1; - } - } else { - int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE | - FALLOC_FL_KEEP_SIZE; - int ret, lock_fd; - - /* if fallocate() is supported, we need to take out a - * read lock on allocate (to prevent other processes - * from deallocating this page), and take out a write - * lock on deallocate (to ensure nobody else is using - * this page). - * - * read locks on page itself are already taken out at - * file creation, in get_seg_fd(). - * - * we cannot rely on simple use of flock() call, because - * we need to be able to lock a section of the file, - * and we cannot use fcntl() locks, because of numerous - * problems with their semantics, so we will use - * deterministically named lock files for each section - * of the file. - * - * if we're shrinking the file, we want to upgrade our - * lock from shared to exclusive. - * - * lock_fd is an fd for a lockfile, not for the segment - * list. - */ - lock_fd = get_segment_lock_fd(list_idx, seg_idx); - - if (!grow) { - /* we are using this lockfile to determine - * whether this particular page is locked, as we - * are in single file segments mode and thus - * cannot use regular flock() to get this info. 
- * - * we want to try and take out an exclusive lock - * on the lock file to determine if we're the - * last ones using this page, and if not, we - * won't be shrinking it, and will instead exit - * prematurely. - */ - ret = lock(lock_fd, LOCK_EX); - - /* drop the lock on the lockfile, so that even - * if we couldn't shrink the file ourselves, we - * are signalling to other processes that we're - * no longer using this page. - */ - if (unlock_segment(list_idx, seg_idx)) - RTE_LOG(ERR, EAL, "Could not unlock segment\n"); - - /* additionally, if this was the last lock on - * this segment list, we can safely close the - * page file fd, so that one of the processes - * could then delete the file after shrinking. - */ - if (ret < 1 && fd_list[list_idx].count == 0) { - close(fd); - fd_list[list_idx].memseg_list_fd = -1; - } - - if (ret < 0) { - RTE_LOG(ERR, EAL, "Could not lock segment\n"); - return -1; - } - if (ret == 0) - /* failed to lock, not an error. */ - return 0; - } - - /* grow or shrink the file */ - ret = fallocate(fd, flags, fa_offset, page_sz); - - if (ret < 0) { - if (fallocate_supported == -1 && - errno == ENOTSUP) { - RTE_LOG(ERR, EAL, "%s(): fallocate() not supported, hugepage deallocation will be disabled\n", - __func__); - again = true; - fallocate_supported = 0; - } else { - RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n", - __func__, - strerror(errno)); - return -1; - } - } else { - fallocate_supported = 1; - - /* we've grew/shrunk the file, and we hold an - * exclusive lock now. check if there are no - * more segments active in this segment list, - * and remove the file if there aren't. 
- */ - if (fd_list[list_idx].count == 0) { - if (unlink(path)) - RTE_LOG(ERR, EAL, "%s(): unlinking '%s' failed: %s\n", - __func__, path, - strerror(errno)); - close(fd); - fd_list[list_idx].memseg_list_fd = -1; - } - } - } - } while (again); - return 0; -} - -static int -alloc_seg(struct rte_memseg *ms, void *addr, int socket_id, - struct hugepage_info *hi, unsigned int list_idx, - unsigned int seg_idx) -{ -#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES - int cur_socket_id = 0; -#endif - uint64_t map_offset; - rte_iova_t iova; - void *va; - char path[PATH_MAX]; - int ret = 0; - int fd; - size_t alloc_sz; - int flags; - void *new_addr; - - alloc_sz = hi->hugepage_sz; - - /* these are checked at init, but code analyzers don't know that */ - if (internal_config.in_memory && !anonymous_hugepages_supported) { - RTE_LOG(ERR, EAL, "Anonymous hugepages not supported, in-memory mode cannot allocate memory\n"); - return -1; - } - if (internal_config.in_memory && !memfd_create_supported && - internal_config.single_file_segments) { - RTE_LOG(ERR, EAL, "Single-file segments are not supported without memfd support\n"); - return -1; - } - - /* in-memory without memfd is a special case */ - int mmap_flags; - - if (internal_config.in_memory && !memfd_create_supported) { - const int in_memory_flags = MAP_HUGETLB | MAP_FIXED | - MAP_PRIVATE | MAP_ANONYMOUS; - int pagesz_flag; - - pagesz_flag = pagesz_flags(alloc_sz); - fd = -1; - mmap_flags = in_memory_flags | pagesz_flag; - - /* single-file segments codepath will never be active - * here because in-memory mode is incompatible with the - * fallback path, and it's stopped at EAL initialization - * stage. 
- */ - map_offset = 0; - } else { - /* takes out a read lock on segment or segment list */ - fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx); - if (fd < 0) { - RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n"); - return -1; - } - - if (internal_config.single_file_segments) { - map_offset = seg_idx * alloc_sz; - ret = resize_hugefile(fd, path, list_idx, seg_idx, - map_offset, alloc_sz, true); - if (ret < 0) - goto resized; - } else { - map_offset = 0; - if (ftruncate(fd, alloc_sz) < 0) { - RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n", - __func__, strerror(errno)); - goto resized; - } - if (internal_config.hugepage_unlink && - !internal_config.in_memory) { - if (unlink(path)) { - RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n", - __func__, strerror(errno)); - goto resized; - } - } - } - mmap_flags = MAP_SHARED | MAP_POPULATE | MAP_FIXED; - } - - /* - * map the segment, and populate page tables, the kernel fills - * this segment with zeros if it's a new page. - */ - va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, mmap_flags, fd, - map_offset); - - if (va == MAP_FAILED) { - RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__, - strerror(errno)); - /* mmap failed, but the previous region might have been - * unmapped anyway. try to remap it - */ - goto unmapped; - } - if (va != addr) { - RTE_LOG(DEBUG, EAL, "%s(): wrong mmap() address\n", __func__); - munmap(va, alloc_sz); - goto resized; - } - - /* In linux, hugetlb limitations, like cgroup, are - * enforced at fault time instead of mmap(), even - * with the option of MAP_POPULATE. Kernel will send - * a SIGBUS signal. To avoid to be killed, save stack - * environment here, if SIGBUS happens, we can jump - * back here. 
- */ - if (huge_wrap_sigsetjmp()) { - RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more hugepages of size %uMB\n", - (unsigned int)(alloc_sz >> 20)); - goto mapped; - } - - /* we need to trigger a write to the page to enforce page fault and - * ensure that page is accessible to us, but we can't overwrite value - * that is already there, so read the old value, and write itback. - * kernel populates the page with zeroes initially. - */ - *(volatile int *)addr = *(volatile int *)addr; - - iova = rte_mem_virt2iova(addr); - if (iova == RTE_BAD_PHYS_ADDR) { - RTE_LOG(DEBUG, EAL, "%s(): can't get IOVA addr\n", - __func__); - goto mapped; - } - -#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES - move_pages(getpid(), 1, &addr, NULL, &cur_socket_id, 0); - - if (cur_socket_id != socket_id) { - RTE_LOG(DEBUG, EAL, - "%s(): allocation happened on wrong socket (wanted %d, got %d)\n", - __func__, socket_id, cur_socket_id); - goto mapped; - } -#endif - - ms->addr = addr; - ms->hugepage_sz = alloc_sz; - ms->len = alloc_sz; - ms->nchannel = rte_memory_get_nchannel(); - ms->nrank = rte_memory_get_nrank(); - ms->iova = iova; - ms->socket_id = socket_id; - - return 0; - -mapped: - munmap(addr, alloc_sz); -unmapped: - flags = MAP_FIXED; - new_addr = eal_get_virtual_area(addr, &alloc_sz, alloc_sz, 0, flags); - if (new_addr != addr) { - if (new_addr != NULL) - munmap(new_addr, alloc_sz); - /* we're leaving a hole in our virtual address space. if - * somebody else maps this hole now, we could accidentally - * override it in the future. 
- */ - RTE_LOG(CRIT, EAL, "Can't mmap holes in our virtual address space\n"); - } -resized: - /* some codepaths will return negative fd, so exit early */ - if (fd < 0) - return -1; - - if (internal_config.single_file_segments) { - resize_hugefile(fd, path, list_idx, seg_idx, map_offset, - alloc_sz, false); - /* ignore failure, can't make it any worse */ - } else { - /* only remove file if we can take out a write lock */ - if (internal_config.hugepage_unlink == 0 && - internal_config.in_memory == 0 && - lock(fd, LOCK_EX) == 1) - unlink(path); - close(fd); - fd_list[list_idx].fds[seg_idx] = -1; - } - return -1; -} - -static int -free_seg(struct rte_memseg *ms, struct hugepage_info *hi, - unsigned int list_idx, unsigned int seg_idx) -{ - uint64_t map_offset; - char path[PATH_MAX]; - int fd, ret = 0; - bool exit_early; - - /* erase page data */ - memset(ms->addr, 0, ms->len); - - if (mmap(ms->addr, ms->len, PROT_READ, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) == - MAP_FAILED) { - RTE_LOG(DEBUG, EAL, "couldn't unmap page\n"); - return -1; - } - - exit_early = false; - - /* if we're using anonymous hugepages, nothing to be done */ - if (internal_config.in_memory && !memfd_create_supported) - exit_early = true; - - /* if we've already unlinked the page, nothing needs to be done */ - if (!internal_config.in_memory && internal_config.hugepage_unlink) - exit_early = true; - - if (exit_early) { - memset(ms, 0, sizeof(*ms)); - return 0; - } - - /* if we are not in single file segments mode, we're going to unmap the - * segment and thus drop the lock on original fd, but hugepage dir is - * now locked so we can take out another one without races. 
- */ - fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx); - if (fd < 0) - return -1; - - if (internal_config.single_file_segments) { - map_offset = seg_idx * ms->len; - if (resize_hugefile(fd, path, list_idx, seg_idx, map_offset, - ms->len, false)) - return -1; - ret = 0; - } else { - /* if we're able to take out a write lock, we're the last one - * holding onto this page. - */ - if (!internal_config.in_memory) { - ret = lock(fd, LOCK_EX); - if (ret >= 0) { - /* no one else is using this page */ - if (ret == 1) - unlink(path); - } - } - /* closing fd will drop the lock */ - close(fd); - fd_list[list_idx].fds[seg_idx] = -1; - } - - memset(ms, 0, sizeof(*ms)); - - return ret < 0 ? -1 : 0; -} - -struct alloc_walk_param { - struct hugepage_info *hi; - struct rte_memseg **ms; - size_t page_sz; - unsigned int segs_allocated; - unsigned int n_segs; - int socket; - bool exact; -}; -static int -alloc_seg_walk(const struct rte_memseg_list *msl, void *arg) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - struct alloc_walk_param *wa = arg; - struct rte_memseg_list *cur_msl; - size_t page_sz; - int cur_idx, start_idx, j, dir_fd = -1; - unsigned int msl_idx, need, i; - - if (msl->page_sz != wa->page_sz) - return 0; - if (msl->socket_id != wa->socket) - return 0; - - page_sz = (size_t)msl->page_sz; - - msl_idx = msl - mcfg->memsegs; - cur_msl = &mcfg->memsegs[msl_idx]; - - need = wa->n_segs; - - /* try finding space in memseg list */ - cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0, need); - if (cur_idx < 0) - return 0; - start_idx = cur_idx; - - /* do not allow any page allocations during the time we're allocating, - * because file creation and locking operations are not atomic, - * and we might be the first or the last ones to use a particular page, - * so we need to ensure atomicity of every operation. - * - * during init, we already hold a write lock, so don't try to take out - * another one. 
- */ - if (wa->hi->lock_descriptor == -1 && !internal_config.in_memory) { - dir_fd = open(wa->hi->hugedir, O_RDONLY); - if (dir_fd < 0) { - RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", - __func__, wa->hi->hugedir, strerror(errno)); - return -1; - } - /* blocking writelock */ - if (flock(dir_fd, LOCK_EX)) { - RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", - __func__, wa->hi->hugedir, strerror(errno)); - close(dir_fd); - return -1; - } - } - - for (i = 0; i < need; i++, cur_idx++) { - struct rte_memseg *cur; - void *map_addr; - - cur = rte_fbarray_get(&cur_msl->memseg_arr, cur_idx); - map_addr = RTE_PTR_ADD(cur_msl->base_va, - cur_idx * page_sz); - - if (alloc_seg(cur, map_addr, wa->socket, wa->hi, - msl_idx, cur_idx)) { - RTE_LOG(DEBUG, EAL, "attempted to allocate %i segments, but only %i were allocated\n", - need, i); - - /* if exact number wasn't requested, stop */ - if (!wa->exact) - goto out; - - /* clean up */ - for (j = start_idx; j < cur_idx; j++) { - struct rte_memseg *tmp; - struct rte_fbarray *arr = - &cur_msl->memseg_arr; - - tmp = rte_fbarray_get(arr, j); - rte_fbarray_set_free(arr, j); - - /* free_seg may attempt to create a file, which - * may fail. 
- */ - if (free_seg(tmp, wa->hi, msl_idx, j)) - RTE_LOG(DEBUG, EAL, "Cannot free page\n"); - } - /* clear the list */ - if (wa->ms) - memset(wa->ms, 0, sizeof(*wa->ms) * wa->n_segs); - - if (dir_fd >= 0) - close(dir_fd); - return -1; - } - if (wa->ms) - wa->ms[i] = cur; - - rte_fbarray_set_used(&cur_msl->memseg_arr, cur_idx); - } -out: - wa->segs_allocated = i; - if (i > 0) - cur_msl->version++; - if (dir_fd >= 0) - close(dir_fd); - return 1; -} - -struct free_walk_param { - struct hugepage_info *hi; - struct rte_memseg *ms; -}; -static int -free_seg_walk(const struct rte_memseg_list *msl, void *arg) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - struct rte_memseg_list *found_msl; - struct free_walk_param *wa = arg; - uintptr_t start_addr, end_addr; - int msl_idx, seg_idx, ret, dir_fd = -1; - - start_addr = (uintptr_t) msl->base_va; - end_addr = start_addr + msl->len; - - if ((uintptr_t)wa->ms->addr < start_addr || - (uintptr_t)wa->ms->addr >= end_addr) - return 0; - - msl_idx = msl - mcfg->memsegs; - seg_idx = RTE_PTR_DIFF(wa->ms->addr, start_addr) / msl->page_sz; - - /* msl is const */ - found_msl = &mcfg->memsegs[msl_idx]; - - /* do not allow any page allocations during the time we're freeing, - * because file creation and locking operations are not atomic, - * and we might be the first or the last ones to use a particular page, - * so we need to ensure atomicity of every operation. - * - * during init, we already hold a write lock, so don't try to take out - * another one. 
- */ - if (wa->hi->lock_descriptor == -1 && !internal_config.in_memory) { - dir_fd = open(wa->hi->hugedir, O_RDONLY); - if (dir_fd < 0) { - RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", - __func__, wa->hi->hugedir, strerror(errno)); - return -1; - } - /* blocking writelock */ - if (flock(dir_fd, LOCK_EX)) { - RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", - __func__, wa->hi->hugedir, strerror(errno)); - close(dir_fd); - return -1; - } - } - - found_msl->version++; - - rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx); - - ret = free_seg(wa->ms, wa->hi, msl_idx, seg_idx); - - if (dir_fd >= 0) - close(dir_fd); - - if (ret < 0) - return -1; - - return 1; -} - -int -eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz, - int socket, bool exact) -{ - int i, ret = -1; -#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES - bool have_numa = false; - int oldpolicy; - struct bitmask *oldmask; -#endif - struct alloc_walk_param wa; - struct hugepage_info *hi = NULL; - - memset(&wa, 0, sizeof(wa)); - - /* dynamic allocation not supported in legacy mode */ - if (internal_config.legacy_mem) - return -1; - - for (i = 0; i < (int) RTE_DIM(internal_config.hugepage_info); i++) { - if (page_sz == - internal_config.hugepage_info[i].hugepage_sz) { - hi = &internal_config.hugepage_info[i]; - break; - } - } - if (!hi) { - RTE_LOG(ERR, EAL, "%s(): can't find relevant hugepage_info entry\n", - __func__); - return -1; - } - -#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES - if (check_numa()) { - oldmask = numa_allocate_nodemask(); - prepare_numa(&oldpolicy, oldmask, socket); - have_numa = true; - } -#endif - - wa.exact = exact; - wa.hi = hi; - wa.ms = ms; - wa.n_segs = n_segs; - wa.page_sz = page_sz; - wa.socket = socket; - wa.segs_allocated = 0; - - /* memalloc is locked, so it's safe to use thread-unsafe version */ - ret = rte_memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa); - if (ret == 0) { - RTE_LOG(ERR, EAL, "%s(): couldn't find suitable memseg_list\n", - __func__); - ret 
= -1; - } else if (ret > 0) { - ret = (int)wa.segs_allocated; - } - -#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES - if (have_numa) - restore_numa(&oldpolicy, oldmask); -#endif - return ret; -} - -struct rte_memseg * -eal_memalloc_alloc_seg(size_t page_sz, int socket) -{ - struct rte_memseg *ms; - if (eal_memalloc_alloc_seg_bulk(&ms, 1, page_sz, socket, true) < 0) - return NULL; - /* return pointer to newly allocated memseg */ - return ms; -} - -int -eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs) -{ - int seg, ret = 0; - - /* dynamic free not supported in legacy mode */ - if (internal_config.legacy_mem) - return -1; - - for (seg = 0; seg < n_segs; seg++) { - struct rte_memseg *cur = ms[seg]; - struct hugepage_info *hi = NULL; - struct free_walk_param wa; - int i, walk_res; - - /* if this page is marked as unfreeable, fail */ - if (cur->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) { - RTE_LOG(DEBUG, EAL, "Page is not allowed to be freed\n"); - ret = -1; - continue; - } - - memset(&wa, 0, sizeof(wa)); - - for (i = 0; i < (int)RTE_DIM(internal_config.hugepage_info); - i++) { - hi = &internal_config.hugepage_info[i]; - if (cur->hugepage_sz == hi->hugepage_sz) - break; - } - if (i == (int)RTE_DIM(internal_config.hugepage_info)) { - RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n"); - ret = -1; - continue; - } - - wa.ms = cur; - wa.hi = hi; - - /* memalloc is locked, so it's safe to use thread-unsafe version - */ - walk_res = rte_memseg_list_walk_thread_unsafe(free_seg_walk, - &wa); - if (walk_res == 1) - continue; - if (walk_res == 0) - RTE_LOG(ERR, EAL, "Couldn't find memseg list\n"); - ret = -1; - } - return ret; -} - -int -eal_memalloc_free_seg(struct rte_memseg *ms) -{ - /* dynamic free not supported in legacy mode */ - if (internal_config.legacy_mem) - return -1; - - return eal_memalloc_free_seg_bulk(&ms, 1); -} - -static int -sync_chunk(struct rte_memseg_list *primary_msl, - struct rte_memseg_list *local_msl, struct hugepage_info *hi, - unsigned int 
msl_idx, bool used, int start, int end) -{ - struct rte_fbarray *l_arr, *p_arr; - int i, ret, chunk_len, diff_len; - - l_arr = &local_msl->memseg_arr; - p_arr = &primary_msl->memseg_arr; - - /* we need to aggregate allocations/deallocations into bigger chunks, - * as we don't want to spam the user with per-page callbacks. - * - * to avoid any potential issues, we also want to trigger - * deallocation callbacks *before* we actually deallocate - * memory, so that the user application could wrap up its use - * before it goes away. - */ - - chunk_len = end - start; - - /* find how many contiguous pages we can map/unmap for this chunk */ - diff_len = used ? - rte_fbarray_find_contig_free(l_arr, start) : - rte_fbarray_find_contig_used(l_arr, start); - - /* has to be at least one page */ - if (diff_len < 1) - return -1; - - diff_len = RTE_MIN(chunk_len, diff_len); - - /* if we are freeing memory, notify the application */ - if (!used) { - struct rte_memseg *ms; - void *start_va; - size_t len, page_sz; - - ms = rte_fbarray_get(l_arr, start); - start_va = ms->addr; - page_sz = (size_t)primary_msl->page_sz; - len = page_sz * diff_len; - - eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, - start_va, len); - } - - for (i = 0; i < diff_len; i++) { - struct rte_memseg *p_ms, *l_ms; - int seg_idx = start + i; - - l_ms = rte_fbarray_get(l_arr, seg_idx); - p_ms = rte_fbarray_get(p_arr, seg_idx); - - if (l_ms == NULL || p_ms == NULL) - return -1; - - if (used) { - ret = alloc_seg(l_ms, p_ms->addr, - p_ms->socket_id, hi, - msl_idx, seg_idx); - if (ret < 0) - return -1; - rte_fbarray_set_used(l_arr, seg_idx); - } else { - ret = free_seg(l_ms, hi, msl_idx, seg_idx); - rte_fbarray_set_free(l_arr, seg_idx); - if (ret < 0) - return -1; - } - } - - /* if we just allocated memory, notify the application */ - if (used) { - struct rte_memseg *ms; - void *start_va; - size_t len, page_sz; - - ms = rte_fbarray_get(l_arr, start); - start_va = ms->addr; - page_sz = (size_t)primary_msl->page_sz; - 
len = page_sz * diff_len; - - eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC, - start_va, len); - } - - /* calculate how much we can advance until next chunk */ - diff_len = used ? - rte_fbarray_find_contig_used(l_arr, start) : - rte_fbarray_find_contig_free(l_arr, start); - ret = RTE_MIN(chunk_len, diff_len); - - return ret; -} - -static int -sync_status(struct rte_memseg_list *primary_msl, - struct rte_memseg_list *local_msl, struct hugepage_info *hi, - unsigned int msl_idx, bool used) -{ - struct rte_fbarray *l_arr, *p_arr; - int p_idx, l_chunk_len, p_chunk_len, ret; - int start, end; - - /* this is a little bit tricky, but the basic idea is - walk both lists - * and spot any places where there are discrepancies. walking both lists - * and noting discrepancies in a single go is a hard problem, so we do - * it in two passes - first we spot any places where allocated segments - * mismatch (i.e. ensure that everything that's allocated in the primary - * is also allocated in the secondary), and then we do it by looking at - * free segments instead. - * - * we also need to aggregate changes into chunks, as we have to call - * callbacks per allocation, not per page. 
- */ - l_arr = &local_msl->memseg_arr; - p_arr = &primary_msl->memseg_arr; - - if (used) - p_idx = rte_fbarray_find_next_used(p_arr, 0); - else - p_idx = rte_fbarray_find_next_free(p_arr, 0); - - while (p_idx >= 0) { - int next_chunk_search_idx; - - if (used) { - p_chunk_len = rte_fbarray_find_contig_used(p_arr, - p_idx); - l_chunk_len = rte_fbarray_find_contig_used(l_arr, - p_idx); - } else { - p_chunk_len = rte_fbarray_find_contig_free(p_arr, - p_idx); - l_chunk_len = rte_fbarray_find_contig_free(l_arr, - p_idx); - } - /* best case scenario - no differences (or bigger, which will be - * fixed during next iteration), look for next chunk - */ - if (l_chunk_len >= p_chunk_len) { - next_chunk_search_idx = p_idx + p_chunk_len; - goto next_chunk; - } - - /* if both chunks start at the same point, skip parts we know - * are identical, and sync the rest. each call to sync_chunk - * will only sync contiguous segments, so we need to call this - * until we are sure there are no more differences in this - * chunk. 
- */ - start = p_idx + l_chunk_len; - end = p_idx + p_chunk_len; - do { - ret = sync_chunk(primary_msl, local_msl, hi, msl_idx, - used, start, end); - start += ret; - } while (start < end && ret >= 0); - /* if ret is negative, something went wrong */ - if (ret < 0) - return -1; - - next_chunk_search_idx = p_idx + p_chunk_len; -next_chunk: - /* skip to end of this chunk */ - if (used) { - p_idx = rte_fbarray_find_next_used(p_arr, - next_chunk_search_idx); - } else { - p_idx = rte_fbarray_find_next_free(p_arr, - next_chunk_search_idx); - } - } - return 0; -} - -static int -sync_existing(struct rte_memseg_list *primary_msl, - struct rte_memseg_list *local_msl, struct hugepage_info *hi, - unsigned int msl_idx) -{ - int ret, dir_fd; - - /* do not allow any page allocations during the time we're allocating, - * because file creation and locking operations are not atomic, - * and we might be the first or the last ones to use a particular page, - * so we need to ensure atomicity of every operation. 
- */ - dir_fd = open(hi->hugedir, O_RDONLY); - if (dir_fd < 0) { - RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", __func__, - hi->hugedir, strerror(errno)); - return -1; - } - /* blocking writelock */ - if (flock(dir_fd, LOCK_EX)) { - RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", __func__, - hi->hugedir, strerror(errno)); - close(dir_fd); - return -1; - } - - /* ensure all allocated space is the same in both lists */ - ret = sync_status(primary_msl, local_msl, hi, msl_idx, true); - if (ret < 0) - goto fail; - - /* ensure all unallocated space is the same in both lists */ - ret = sync_status(primary_msl, local_msl, hi, msl_idx, false); - if (ret < 0) - goto fail; - - /* update version number */ - local_msl->version = primary_msl->version; - - close(dir_fd); - - return 0; -fail: - close(dir_fd); - return -1; -} - -static int -sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - struct rte_memseg_list *primary_msl, *local_msl; - struct hugepage_info *hi = NULL; - unsigned int i; - int msl_idx; - - if (msl->external) - return 0; - - msl_idx = msl - mcfg->memsegs; - primary_msl = &mcfg->memsegs[msl_idx]; - local_msl = &local_memsegs[msl_idx]; - - for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) { - uint64_t cur_sz = - internal_config.hugepage_info[i].hugepage_sz; - uint64_t msl_sz = primary_msl->page_sz; - if (msl_sz == cur_sz) { - hi = &internal_config.hugepage_info[i]; - break; - } - } - if (!hi) { - RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n"); - return -1; - } - - /* if versions don't match, synchronize everything */ - if (local_msl->version != primary_msl->version && - sync_existing(primary_msl, local_msl, hi, msl_idx)) - return -1; - return 0; -} - - -int -eal_memalloc_sync_with_primary(void) -{ - /* nothing to be done in primary */ - if (rte_eal_process_type() == RTE_PROC_PRIMARY) - return 0; - - /* memalloc is locked, so it's safe to 
call thread-unsafe version */ - if (rte_memseg_list_walk_thread_unsafe(sync_walk, NULL)) - return -1; - return 0; -} - -static int -secondary_msl_create_walk(const struct rte_memseg_list *msl, - void *arg __rte_unused) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - struct rte_memseg_list *primary_msl, *local_msl; - char name[PATH_MAX]; - int msl_idx, ret; - - if (msl->external) - return 0; - - msl_idx = msl - mcfg->memsegs; - primary_msl = &mcfg->memsegs[msl_idx]; - local_msl = &local_memsegs[msl_idx]; - - /* create distinct fbarrays for each secondary */ - snprintf(name, RTE_FBARRAY_NAME_LEN, "%s_%i", - primary_msl->memseg_arr.name, getpid()); - - ret = rte_fbarray_init(&local_msl->memseg_arr, name, - primary_msl->memseg_arr.len, - primary_msl->memseg_arr.elt_sz); - if (ret < 0) { - RTE_LOG(ERR, EAL, "Cannot initialize local memory map\n"); - return -1; - } - local_msl->base_va = primary_msl->base_va; - local_msl->len = primary_msl->len; - - return 0; -} - -static int -alloc_list(int list_idx, int len) -{ - int *data; - int i; - - /* ensure we have space to store fd per each possible segment */ - data = malloc(sizeof(int) * len); - if (data == NULL) { - RTE_LOG(ERR, EAL, "Unable to allocate space for file descriptors\n"); - return -1; - } - /* set all fd's as invalid */ - for (i = 0; i < len; i++) - data[i] = -1; - - fd_list[list_idx].fds = data; - fd_list[list_idx].len = len; - fd_list[list_idx].count = 0; - fd_list[list_idx].memseg_list_fd = -1; - - return 0; -} - -static int -fd_list_create_walk(const struct rte_memseg_list *msl, - void *arg __rte_unused) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - unsigned int len; - int msl_idx; - - if (msl->external) - return 0; - - msl_idx = msl - mcfg->memsegs; - len = msl->memseg_arr.len; - - return alloc_list(msl_idx, len); -} - -int -eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd) -{ - struct rte_mem_config *mcfg = 
rte_eal_get_configuration()->mem_config; - - /* single file segments mode doesn't support individual segment fd's */ - if (internal_config.single_file_segments) - return -ENOTSUP; - - /* if list is not allocated, allocate it */ - if (fd_list[list_idx].len == 0) { - int len = mcfg->memsegs[list_idx].memseg_arr.len; - - if (alloc_list(list_idx, len) < 0) - return -ENOMEM; - } - fd_list[list_idx].fds[seg_idx] = fd; - - return 0; -} - -int -eal_memalloc_set_seg_list_fd(int list_idx, int fd) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - - /* non-single file segment mode doesn't support segment list fd's */ - if (!internal_config.single_file_segments) - return -ENOTSUP; - - /* if list is not allocated, allocate it */ - if (fd_list[list_idx].len == 0) { - int len = mcfg->memsegs[list_idx].memseg_arr.len; - - if (alloc_list(list_idx, len) < 0) - return -ENOMEM; - } - - fd_list[list_idx].memseg_list_fd = fd; - - return 0; -} - -int -eal_memalloc_get_seg_fd(int list_idx, int seg_idx) -{ - int fd; - - if (internal_config.in_memory || internal_config.no_hugetlbfs) { -#ifndef MEMFD_SUPPORTED - /* in in-memory or no-huge mode, we rely on memfd support */ - return -ENOTSUP; -#endif - /* memfd supported, but hugetlbfs memfd may not be */ - if (!internal_config.no_hugetlbfs && !memfd_create_supported) - return -ENOTSUP; - } - - if (internal_config.single_file_segments) { - fd = fd_list[list_idx].memseg_list_fd; - } else if (fd_list[list_idx].len == 0) { - /* list not initialized */ - fd = -1; - } else { - fd = fd_list[list_idx].fds[seg_idx]; - } - if (fd < 0) - return -ENODEV; - return fd; -} - -static int -test_memfd_create(void) -{ -#ifdef MEMFD_SUPPORTED - unsigned int i; - for (i = 0; i < internal_config.num_hugepage_sizes; i++) { - uint64_t pagesz = internal_config.hugepage_info[i].hugepage_sz; - int pagesz_flag = pagesz_flags(pagesz); - int flags; - - flags = pagesz_flag | RTE_MFD_HUGETLB; - int fd = memfd_create("test", flags); - if (fd < 0) { 
- /* we failed - let memalloc know this isn't working */ - if (errno == EINVAL) { - memfd_create_supported = 0; - return 0; /* not supported */ - } - - /* we got other error - something's wrong */ - return -1; /* error */ - } - close(fd); - return 1; /* supported */ - } -#endif - return 0; /* not supported */ -} - -int -eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - - if (internal_config.in_memory || internal_config.no_hugetlbfs) { -#ifndef MEMFD_SUPPORTED - /* in in-memory or no-huge mode, we rely on memfd support */ - return -ENOTSUP; -#endif - /* memfd supported, but hugetlbfs memfd may not be */ - if (!internal_config.no_hugetlbfs && !memfd_create_supported) - return -ENOTSUP; - } - - /* fd_list not initialized? */ - if (fd_list[list_idx].len == 0) - return -ENODEV; - if (internal_config.single_file_segments) { - size_t pgsz = mcfg->memsegs[list_idx].page_sz; - - /* segment not active? */ - if (fd_list[list_idx].memseg_list_fd < 0) - return -ENOENT; - *offset = pgsz * seg_idx; - } else { - /* segment not active? */ - if (fd_list[list_idx].fds[seg_idx] < 0) - return -ENOENT; - *offset = 0; - } - return 0; -} - -int -eal_memalloc_init(void) -{ - if (rte_eal_process_type() == RTE_PROC_SECONDARY) - if (rte_memseg_list_walk(secondary_msl_create_walk, NULL) < 0) - return -1; - if (rte_eal_process_type() == RTE_PROC_PRIMARY && - internal_config.in_memory) { - int mfd_res = test_memfd_create(); - - if (mfd_res < 0) { - RTE_LOG(ERR, EAL, "Unable to check if memfd is supported\n"); - return -1; - } - if (mfd_res == 1) - RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n"); - else - RTE_LOG(INFO, EAL, "Using memfd is not supported, falling back to anonymous hugepages\n"); - - /* we only support single-file segments mode with in-memory mode - * if we support hugetlbfs with memfd_create. this code will - * test if we do. 
- */ - if (internal_config.single_file_segments && - mfd_res != 1) { - RTE_LOG(ERR, EAL, "Single-file segments mode cannot be used without memfd support\n"); - return -1; - } - /* this cannot ever happen but better safe than sorry */ - if (!anonymous_hugepages_supported) { - RTE_LOG(ERR, EAL, "Using anonymous memory is not supported\n"); - return -1; - } - } - - /* initialize all of the fd lists */ - if (rte_memseg_list_walk(fd_list_create_walk, NULL)) - return -1; - return 0; -} diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c deleted file mode 100644 index 1b96b576e0..0000000000 --- a/lib/librte_eal/linuxapp/eal/eal_memory.c +++ /dev/null @@ -1,2439 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2014 Intel Corporation. - * Copyright(c) 2013 6WIND S.A. - */ - -#define _FILE_OFFSET_BITS 64 -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */ -#include -#define MEMFD_SUPPORTED -#endif -#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES -#include -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "eal_private.h" -#include "eal_memalloc.h" -#include "eal_internal_cfg.h" -#include "eal_filesystem.h" -#include "eal_hugepages.h" - -#define PFN_MASK_SIZE 8 - -/** - * @file - * Huge page mapping under linux - * - * To reserve a big contiguous amount of memory, we use the hugepage - * feature of linux. For that, we need to have hugetlbfs mounted. This - * code will create many files in this directory (one per page) and - * map them in virtual memory. For each page, we will retrieve its - * physical address and remap it in order to have a virtual contiguous - * zone as well as a physical contiguous zone. 
- */ - -static bool phys_addrs_available = true; - -#define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space" - -static void -test_phys_addrs_available(void) -{ - uint64_t tmp = 0; - phys_addr_t physaddr; - - if (!rte_eal_has_hugepages()) { - RTE_LOG(ERR, EAL, - "Started without hugepages support, physical addresses not available\n"); - phys_addrs_available = false; - return; - } - - physaddr = rte_mem_virt2phy(&tmp); - if (physaddr == RTE_BAD_PHYS_ADDR) { - if (rte_eal_iova_mode() == RTE_IOVA_PA) - RTE_LOG(ERR, EAL, - "Cannot obtain physical addresses: %s. " - "Only vfio will function.\n", - strerror(errno)); - phys_addrs_available = false; - } -} - -/* - * Get physical address of any mapped virtual address in the current process. - */ -phys_addr_t -rte_mem_virt2phy(const void *virtaddr) -{ - int fd, retval; - uint64_t page, physaddr; - unsigned long virt_pfn; - int page_size; - off_t offset; - - /* Cannot parse /proc/self/pagemap, no need to log errors everywhere */ - if (!phys_addrs_available) - return RTE_BAD_IOVA; - - /* standard page size */ - page_size = getpagesize(); - - fd = open("/proc/self/pagemap", O_RDONLY); - if (fd < 0) { - RTE_LOG(ERR, EAL, "%s(): cannot open /proc/self/pagemap: %s\n", - __func__, strerror(errno)); - return RTE_BAD_IOVA; - } - - virt_pfn = (unsigned long)virtaddr / page_size; - offset = sizeof(uint64_t) * virt_pfn; - if (lseek(fd, offset, SEEK_SET) == (off_t) -1) { - RTE_LOG(ERR, EAL, "%s(): seek error in /proc/self/pagemap: %s\n", - __func__, strerror(errno)); - close(fd); - return RTE_BAD_IOVA; - } - - retval = read(fd, &page, PFN_MASK_SIZE); - close(fd); - if (retval < 0) { - RTE_LOG(ERR, EAL, "%s(): cannot read /proc/self/pagemap: %s\n", - __func__, strerror(errno)); - return RTE_BAD_IOVA; - } else if (retval != PFN_MASK_SIZE) { - RTE_LOG(ERR, EAL, "%s(): read %d bytes from /proc/self/pagemap " - "but expected %d:\n", - __func__, retval, PFN_MASK_SIZE); - return RTE_BAD_IOVA; - } - - /* - * the pfn (page frame 
number) are bits 0-54 (see - * pagemap.txt in linux Documentation) - */ - if ((page & 0x7fffffffffffffULL) == 0) - return RTE_BAD_IOVA; - - physaddr = ((page & 0x7fffffffffffffULL) * page_size) - + ((unsigned long)virtaddr % page_size); - - return physaddr; -} - -rte_iova_t -rte_mem_virt2iova(const void *virtaddr) -{ - if (rte_eal_iova_mode() == RTE_IOVA_VA) - return (uintptr_t)virtaddr; - return rte_mem_virt2phy(virtaddr); -} - -/* - * For each hugepage in hugepg_tbl, fill the physaddr value. We find - * it by browsing the /proc/self/pagemap special file. - */ -static int -find_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) -{ - unsigned int i; - phys_addr_t addr; - - for (i = 0; i < hpi->num_pages[0]; i++) { - addr = rte_mem_virt2phy(hugepg_tbl[i].orig_va); - if (addr == RTE_BAD_PHYS_ADDR) - return -1; - hugepg_tbl[i].physaddr = addr; - } - return 0; -} - -/* - * For each hugepage in hugepg_tbl, fill the physaddr value sequentially. - */ -static int -set_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) -{ - unsigned int i; - static phys_addr_t addr; - - for (i = 0; i < hpi->num_pages[0]; i++) { - hugepg_tbl[i].physaddr = addr; - addr += hugepg_tbl[i].size; - } - return 0; -} - -/* - * Check whether address-space layout randomization is enabled in - * the kernel. 
This is important for multi-process as it can prevent - * two processes mapping data to the same virtual address - * Returns: - * 0 - address space randomization disabled - * 1/2 - address space randomization enabled - * negative error code on error - */ -static int -aslr_enabled(void) -{ - char c; - int retval, fd = open(RANDOMIZE_VA_SPACE_FILE, O_RDONLY); - if (fd < 0) - return -errno; - retval = read(fd, &c, 1); - close(fd); - if (retval < 0) - return -errno; - if (retval == 0) - return -EIO; - switch (c) { - case '0' : return 0; - case '1' : return 1; - case '2' : return 2; - default: return -EINVAL; - } -} - -static sigjmp_buf huge_jmpenv; - -static void huge_sigbus_handler(int signo __rte_unused) -{ - siglongjmp(huge_jmpenv, 1); -} - -/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile, - * non-static local variable in the stack frame calling sigsetjmp might be - * clobbered by a call to longjmp. - */ -static int huge_wrap_sigsetjmp(void) -{ - return sigsetjmp(huge_jmpenv, 1); -} - -#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES -/* Callback for numa library. */ -void numa_error(char *where) -{ - RTE_LOG(ERR, EAL, "%s failed: %s\n", where, strerror(errno)); -} -#endif - -/* - * Mmap all hugepages of hugepage table: it first open a file in - * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the - * virtual address is stored in hugepg_tbl[i].orig_va, else it is stored - * in hugepg_tbl[i].final_va. The second mapping (when orig is 0) tries to - * map contiguous physical blocks in contiguous virtual blocks. - */ -static unsigned -map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi, - uint64_t *essential_memory __rte_unused) -{ - int fd; - unsigned i; - void *virtaddr; -#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES - int node_id = -1; - int essential_prev = 0; - int oldpolicy; - struct bitmask *oldmask = NULL; - bool have_numa = true; - unsigned long maxnode = 0; - - /* Check if kernel supports NUMA. 
*/ - if (numa_available() != 0) { - RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n"); - have_numa = false; - } - - if (have_numa) { - RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n"); - oldmask = numa_allocate_nodemask(); - if (get_mempolicy(&oldpolicy, oldmask->maskp, - oldmask->size + 1, 0, 0) < 0) { - RTE_LOG(ERR, EAL, - "Failed to get current mempolicy: %s. " - "Assuming MPOL_DEFAULT.\n", strerror(errno)); - oldpolicy = MPOL_DEFAULT; - } - for (i = 0; i < RTE_MAX_NUMA_NODES; i++) - if (internal_config.socket_mem[i]) - maxnode = i + 1; - } -#endif - - for (i = 0; i < hpi->num_pages[0]; i++) { - struct hugepage_file *hf = &hugepg_tbl[i]; - uint64_t hugepage_sz = hpi->hugepage_sz; - -#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES - if (maxnode) { - unsigned int j; - - for (j = 0; j < maxnode; j++) - if (essential_memory[j]) - break; - - if (j == maxnode) { - node_id = (node_id + 1) % maxnode; - while (!internal_config.socket_mem[node_id]) { - node_id++; - node_id %= maxnode; - } - essential_prev = 0; - } else { - node_id = j; - essential_prev = essential_memory[j]; - - if (essential_memory[j] < hugepage_sz) - essential_memory[j] = 0; - else - essential_memory[j] -= hugepage_sz; - } - - RTE_LOG(DEBUG, EAL, - "Setting policy MPOL_PREFERRED for socket %d\n", - node_id); - numa_set_preferred(node_id); - } -#endif - - hf->file_id = i; - hf->size = hugepage_sz; - eal_get_hugefile_path(hf->filepath, sizeof(hf->filepath), - hpi->hugedir, hf->file_id); - hf->filepath[sizeof(hf->filepath) - 1] = '\0'; - - /* try to create hugepage file */ - fd = open(hf->filepath, O_CREAT | O_RDWR, 0600); - if (fd < 0) { - RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__, - strerror(errno)); - goto out; - } - - /* map the segment, and populate page tables, - * the kernel fills this segment with zeros. we don't care where - * this gets mapped - we already have contiguous memory areas - * ready for us to map into. 
- */ - virtaddr = mmap(NULL, hugepage_sz, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE, fd, 0); - if (virtaddr == MAP_FAILED) { - RTE_LOG(DEBUG, EAL, "%s(): mmap failed: %s\n", __func__, - strerror(errno)); - close(fd); - goto out; - } - - hf->orig_va = virtaddr; - - /* In linux, hugetlb limitations, like cgroup, are - * enforced at fault time instead of mmap(), even - * with the option of MAP_POPULATE. Kernel will send - * a SIGBUS signal. To avoid to be killed, save stack - * environment here, if SIGBUS happens, we can jump - * back here. - */ - if (huge_wrap_sigsetjmp()) { - RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more " - "hugepages of size %u MB\n", - (unsigned int)(hugepage_sz / 0x100000)); - munmap(virtaddr, hugepage_sz); - close(fd); - unlink(hugepg_tbl[i].filepath); -#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES - if (maxnode) - essential_memory[node_id] = - essential_prev; -#endif - goto out; - } - *(int *)virtaddr = 0; - - /* set shared lock on the file. */ - if (flock(fd, LOCK_SH) < 0) { - RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n", - __func__, strerror(errno)); - close(fd); - goto out; - } - - close(fd); - } - -out: -#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES - if (maxnode) { - RTE_LOG(DEBUG, EAL, - "Restoring previous memory policy: %d\n", oldpolicy); - if (oldpolicy == MPOL_DEFAULT) { - numa_set_localalloc(); - } else if (set_mempolicy(oldpolicy, oldmask->maskp, - oldmask->size + 1) < 0) { - RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n", - strerror(errno)); - numa_set_localalloc(); - } - } - if (oldmask != NULL) - numa_free_cpumask(oldmask); -#endif - return i; -} - -/* - * Parse /proc/self/numa_maps to get the NUMA socket ID for each huge - * page. 
- */ -static int -find_numasocket(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) -{ - int socket_id; - char *end, *nodestr; - unsigned i, hp_count = 0; - uint64_t virt_addr; - char buf[BUFSIZ]; - char hugedir_str[PATH_MAX]; - FILE *f; - - f = fopen("/proc/self/numa_maps", "r"); - if (f == NULL) { - RTE_LOG(NOTICE, EAL, "NUMA support not available" - " consider that all memory is in socket_id 0\n"); - return 0; - } - - snprintf(hugedir_str, sizeof(hugedir_str), - "%s/%s", hpi->hugedir, eal_get_hugefile_prefix()); - - /* parse numa map */ - while (fgets(buf, sizeof(buf), f) != NULL) { - - /* ignore non huge page */ - if (strstr(buf, " huge ") == NULL && - strstr(buf, hugedir_str) == NULL) - continue; - - /* get zone addr */ - virt_addr = strtoull(buf, &end, 16); - if (virt_addr == 0 || end == buf) { - RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__); - goto error; - } - - /* get node id (socket id) */ - nodestr = strstr(buf, " N"); - if (nodestr == NULL) { - RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__); - goto error; - } - nodestr += 2; - end = strstr(nodestr, "="); - if (end == NULL) { - RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__); - goto error; - } - end[0] = '\0'; - end = NULL; - - socket_id = strtoul(nodestr, &end, 0); - if ((nodestr[0] == '\0') || (end == NULL) || (*end != '\0')) { - RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__); - goto error; - } - - /* if we find this page in our mappings, set socket_id */ - for (i = 0; i < hpi->num_pages[0]; i++) { - void *va = (void *)(unsigned long)virt_addr; - if (hugepg_tbl[i].orig_va == va) { - hugepg_tbl[i].socket_id = socket_id; - hp_count++; -#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES - RTE_LOG(DEBUG, EAL, - "Hugepage %s is on socket %d\n", - hugepg_tbl[i].filepath, socket_id); -#endif - } - } - } - - if (hp_count < hpi->num_pages[0]) - goto error; - - fclose(f); - return 0; - -error: - fclose(f); - return -1; -} - -static int 
-cmp_physaddr(const void *a, const void *b) -{ -#ifndef RTE_ARCH_PPC_64 - const struct hugepage_file *p1 = a; - const struct hugepage_file *p2 = b; -#else - /* PowerPC needs memory sorted in reverse order from x86 */ - const struct hugepage_file *p1 = b; - const struct hugepage_file *p2 = a; -#endif - if (p1->physaddr < p2->physaddr) - return -1; - else if (p1->physaddr > p2->physaddr) - return 1; - else - return 0; -} - -/* - * Uses mmap to create a shared memory area for storage of data - * Used in this file to store the hugepage file map on disk - */ -static void * -create_shared_memory(const char *filename, const size_t mem_size) -{ - void *retval; - int fd; - - /* if no shared files mode is used, create anonymous memory instead */ - if (internal_config.no_shconf) { - retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (retval == MAP_FAILED) - return NULL; - return retval; - } - - fd = open(filename, O_CREAT | O_RDWR, 0666); - if (fd < 0) - return NULL; - if (ftruncate(fd, mem_size) < 0) { - close(fd); - return NULL; - } - retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - close(fd); - if (retval == MAP_FAILED) - return NULL; - return retval; -} - -/* - * this copies *active* hugepages from one hugepage table to another. - * destination is typically the shared memory. 
- */ -static int -copy_hugepages_to_shared_mem(struct hugepage_file * dst, int dest_size, - const struct hugepage_file * src, int src_size) -{ - int src_pos, dst_pos = 0; - - for (src_pos = 0; src_pos < src_size; src_pos++) { - if (src[src_pos].orig_va != NULL) { - /* error on overflow attempt */ - if (dst_pos == dest_size) - return -1; - memcpy(&dst[dst_pos], &src[src_pos], sizeof(struct hugepage_file)); - dst_pos++; - } - } - return 0; -} - -static int -unlink_hugepage_files(struct hugepage_file *hugepg_tbl, - unsigned num_hp_info) -{ - unsigned socket, size; - int page, nrpages = 0; - - /* get total number of hugepages */ - for (size = 0; size < num_hp_info; size++) - for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) - nrpages += - internal_config.hugepage_info[size].num_pages[socket]; - - for (page = 0; page < nrpages; page++) { - struct hugepage_file *hp = &hugepg_tbl[page]; - - if (hp->orig_va != NULL && unlink(hp->filepath)) { - RTE_LOG(WARNING, EAL, "%s(): Removing %s failed: %s\n", - __func__, hp->filepath, strerror(errno)); - } - } - return 0; -} - -/* - * unmaps hugepages that are not going to be used. since we originally allocate - * ALL hugepages (not just those we need), additional unmapping needs to be done. 
- */ -static int -unmap_unneeded_hugepages(struct hugepage_file *hugepg_tbl, - struct hugepage_info *hpi, - unsigned num_hp_info) -{ - unsigned socket, size; - int page, nrpages = 0; - - /* get total number of hugepages */ - for (size = 0; size < num_hp_info; size++) - for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) - nrpages += internal_config.hugepage_info[size].num_pages[socket]; - - for (size = 0; size < num_hp_info; size++) { - for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) { - unsigned pages_found = 0; - - /* traverse until we have unmapped all the unused pages */ - for (page = 0; page < nrpages; page++) { - struct hugepage_file *hp = &hugepg_tbl[page]; - - /* find a page that matches the criteria */ - if ((hp->size == hpi[size].hugepage_sz) && - (hp->socket_id == (int) socket)) { - - /* if we skipped enough pages, unmap the rest */ - if (pages_found == hpi[size].num_pages[socket]) { - uint64_t unmap_len; - - unmap_len = hp->size; - - /* get start addr and len of the remaining segment */ - munmap(hp->orig_va, - (size_t)unmap_len); - - hp->orig_va = NULL; - if (unlink(hp->filepath) == -1) { - RTE_LOG(ERR, EAL, "%s(): Removing %s failed: %s\n", - __func__, hp->filepath, strerror(errno)); - return -1; - } - } else { - /* lock the page and skip */ - pages_found++; - } - - } /* match page */ - } /* foreach page */ - } /* foreach socket */ - } /* foreach pagesize */ - - return 0; -} - -static int -remap_segment(struct hugepage_file *hugepages, int seg_start, int seg_end) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - struct rte_memseg_list *msl; - struct rte_fbarray *arr; - int cur_page, seg_len; - unsigned int msl_idx; - int ms_idx; - uint64_t page_sz; - size_t memseg_len; - int socket_id; - - page_sz = hugepages[seg_start].size; - socket_id = hugepages[seg_start].socket_id; - seg_len = seg_end - seg_start; - - RTE_LOG(DEBUG, EAL, "Attempting to map %" PRIu64 "M on socket %i\n", - (seg_len * page_sz) >> 20ULL, 
socket_id); - - /* find free space in memseg lists */ - for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) { - bool empty; - msl = &mcfg->memsegs[msl_idx]; - arr = &msl->memseg_arr; - - if (msl->page_sz != page_sz) - continue; - if (msl->socket_id != socket_id) - continue; - - /* leave space for a hole if array is not empty */ - empty = arr->count == 0; - ms_idx = rte_fbarray_find_next_n_free(arr, 0, - seg_len + (empty ? 0 : 1)); - - /* memseg list is full? */ - if (ms_idx < 0) - continue; - - /* leave some space between memsegs, they are not IOVA - * contiguous, so they shouldn't be VA contiguous either. - */ - if (!empty) - ms_idx++; - break; - } - if (msl_idx == RTE_MAX_MEMSEG_LISTS) { - RTE_LOG(ERR, EAL, "Could not find space for memseg. Please increase %s and/or %s in configuration.\n", - RTE_STR(CONFIG_RTE_MAX_MEMSEG_PER_TYPE), - RTE_STR(CONFIG_RTE_MAX_MEM_PER_TYPE)); - return -1; - } - -#ifdef RTE_ARCH_PPC64 - /* for PPC64 we go through the list backwards */ - for (cur_page = seg_end - 1; cur_page >= seg_start; - cur_page--, ms_idx++) { -#else - for (cur_page = seg_start; cur_page < seg_end; cur_page++, ms_idx++) { -#endif - struct hugepage_file *hfile = &hugepages[cur_page]; - struct rte_memseg *ms = rte_fbarray_get(arr, ms_idx); - void *addr; - int fd; - - fd = open(hfile->filepath, O_RDWR); - if (fd < 0) { - RTE_LOG(ERR, EAL, "Could not open '%s': %s\n", - hfile->filepath, strerror(errno)); - return -1; - } - /* set shared lock on the file. 
*/ - if (flock(fd, LOCK_SH) < 0) { - RTE_LOG(DEBUG, EAL, "Could not lock '%s': %s\n", - hfile->filepath, strerror(errno)); - close(fd); - return -1; - } - memseg_len = (size_t)page_sz; - addr = RTE_PTR_ADD(msl->base_va, ms_idx * memseg_len); - - /* we know this address is already mmapped by memseg list, so - * using MAP_FIXED here is safe - */ - addr = mmap(addr, page_sz, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, 0); - if (addr == MAP_FAILED) { - RTE_LOG(ERR, EAL, "Couldn't remap '%s': %s\n", - hfile->filepath, strerror(errno)); - close(fd); - return -1; - } - - /* we have a new address, so unmap previous one */ -#ifndef RTE_ARCH_64 - /* in 32-bit legacy mode, we have already unmapped the page */ - if (!internal_config.legacy_mem) - munmap(hfile->orig_va, page_sz); -#else - munmap(hfile->orig_va, page_sz); -#endif - - hfile->orig_va = NULL; - hfile->final_va = addr; - - /* rewrite physical addresses in IOVA as VA mode */ - if (rte_eal_iova_mode() == RTE_IOVA_VA) - hfile->physaddr = (uintptr_t)addr; - - /* set up memseg data */ - ms->addr = addr; - ms->hugepage_sz = page_sz; - ms->len = memseg_len; - ms->iova = hfile->physaddr; - ms->socket_id = hfile->socket_id; - ms->nchannel = rte_memory_get_nchannel(); - ms->nrank = rte_memory_get_nrank(); - - rte_fbarray_set_used(arr, ms_idx); - - /* store segment fd internally */ - if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0) - RTE_LOG(ERR, EAL, "Could not store segment fd: %s\n", - rte_strerror(rte_errno)); - } - RTE_LOG(DEBUG, EAL, "Allocated %" PRIu64 "M on socket %i\n", - (seg_len * page_sz) >> 20, socket_id); - return 0; -} - -static uint64_t -get_mem_amount(uint64_t page_sz, uint64_t max_mem) -{ - uint64_t area_sz, max_pages; - - /* limit to RTE_MAX_MEMSEG_PER_LIST pages or RTE_MAX_MEM_MB_PER_LIST */ - max_pages = RTE_MAX_MEMSEG_PER_LIST; - max_mem = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20, max_mem); - - area_sz = RTE_MIN(page_sz * max_pages, max_mem); - - /* make sure the list 
isn't smaller than the page size */ - area_sz = RTE_MAX(area_sz, page_sz); - - return RTE_ALIGN(area_sz, page_sz); -} - -static int -free_memseg_list(struct rte_memseg_list *msl) -{ - if (rte_fbarray_destroy(&msl->memseg_arr)) { - RTE_LOG(ERR, EAL, "Cannot destroy memseg list\n"); - return -1; - } - memset(msl, 0, sizeof(*msl)); - return 0; -} - -#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i" -static int -alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz, - int n_segs, int socket_id, int type_msl_idx) -{ - char name[RTE_FBARRAY_NAME_LEN]; - - snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id, - type_msl_idx); - if (rte_fbarray_init(&msl->memseg_arr, name, n_segs, - sizeof(struct rte_memseg))) { - RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n", - rte_strerror(rte_errno)); - return -1; - } - - msl->page_sz = page_sz; - msl->socket_id = socket_id; - msl->base_va = NULL; - - RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n", - (size_t)page_sz >> 10, socket_id); - - return 0; -} - -static int -alloc_va_space(struct rte_memseg_list *msl) -{ - uint64_t page_sz; - size_t mem_sz; - void *addr; - int flags = 0; - - page_sz = msl->page_sz; - mem_sz = page_sz * msl->memseg_arr.len; - - addr = eal_get_virtual_area(msl->base_va, &mem_sz, page_sz, 0, flags); - if (addr == NULL) { - if (rte_errno == EADDRNOTAVAIL) - RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - please use '--base-virtaddr' option\n", - (unsigned long long)mem_sz, msl->base_va); - else - RTE_LOG(ERR, EAL, "Cannot reserve memory\n"); - return -1; - } - msl->base_va = addr; - msl->len = mem_sz; - - return 0; -} - -/* - * Our VA space is not preallocated yet, so preallocate it here. We need to know - * how many segments there are in order to map all pages into one address space, - * and leave appropriate holes between segments so that rte_malloc does not - * concatenate them into one big segment. 
- * - * we also need to unmap original pages to free up address space. - */ -static int __rte_unused -prealloc_segments(struct hugepage_file *hugepages, int n_pages) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - int cur_page, seg_start_page, end_seg, new_memseg; - unsigned int hpi_idx, socket, i; - int n_contig_segs, n_segs; - int msl_idx; - - /* before we preallocate segments, we need to free up our VA space. - * we're not removing files, and we already have information about - * PA-contiguousness, so it is safe to unmap everything. - */ - for (cur_page = 0; cur_page < n_pages; cur_page++) { - struct hugepage_file *hpi = &hugepages[cur_page]; - munmap(hpi->orig_va, hpi->size); - hpi->orig_va = NULL; - } - - /* we cannot know how many page sizes and sockets we have discovered, so - * loop over all of them - */ - for (hpi_idx = 0; hpi_idx < internal_config.num_hugepage_sizes; - hpi_idx++) { - uint64_t page_sz = - internal_config.hugepage_info[hpi_idx].hugepage_sz; - - for (i = 0; i < rte_socket_count(); i++) { - struct rte_memseg_list *msl; - - socket = rte_socket_id_by_idx(i); - n_contig_segs = 0; - n_segs = 0; - seg_start_page = -1; - - for (cur_page = 0; cur_page < n_pages; cur_page++) { - struct hugepage_file *prev, *cur; - int prev_seg_start_page = -1; - - cur = &hugepages[cur_page]; - prev = cur_page == 0 ? NULL : - &hugepages[cur_page - 1]; - - new_memseg = 0; - end_seg = 0; - - if (cur->size == 0) - end_seg = 1; - else if (cur->socket_id != (int) socket) - end_seg = 1; - else if (cur->size != page_sz) - end_seg = 1; - else if (cur_page == 0) - new_memseg = 1; -#ifdef RTE_ARCH_PPC_64 - /* On PPC64 architecture, the mmap always start - * from higher address to lower address. Here, - * physical addresses are in descending order. 
- */ - else if ((prev->physaddr - cur->physaddr) != - cur->size) - new_memseg = 1; -#else - else if ((cur->physaddr - prev->physaddr) != - cur->size) - new_memseg = 1; -#endif - if (new_memseg) { - /* if we're already inside a segment, - * new segment means end of current one - */ - if (seg_start_page != -1) { - end_seg = 1; - prev_seg_start_page = - seg_start_page; - } - seg_start_page = cur_page; - } - - if (end_seg) { - if (prev_seg_start_page != -1) { - /* we've found a new segment */ - n_contig_segs++; - n_segs += cur_page - - prev_seg_start_page; - } else if (seg_start_page != -1) { - /* we didn't find new segment, - * but did end current one - */ - n_contig_segs++; - n_segs += cur_page - - seg_start_page; - seg_start_page = -1; - continue; - } else { - /* we're skipping this page */ - continue; - } - } - /* segment continues */ - } - /* check if we missed last segment */ - if (seg_start_page != -1) { - n_contig_segs++; - n_segs += cur_page - seg_start_page; - } - - /* if no segments were found, do not preallocate */ - if (n_segs == 0) - continue; - - /* we now have total number of pages that we will - * allocate for this segment list. add separator pages - * to the total count, and preallocate VA space. 
- */ - n_segs += n_contig_segs - 1; - - /* now, preallocate VA space for these segments */ - - /* first, find suitable memseg list for this */ - for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; - msl_idx++) { - msl = &mcfg->memsegs[msl_idx]; - - if (msl->base_va != NULL) - continue; - break; - } - if (msl_idx == RTE_MAX_MEMSEG_LISTS) { - RTE_LOG(ERR, EAL, "Not enough space in memseg lists, please increase %s\n", - RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); - return -1; - } - - /* now, allocate fbarray itself */ - if (alloc_memseg_list(msl, page_sz, n_segs, socket, - msl_idx) < 0) - return -1; - - /* finally, allocate VA space */ - if (alloc_va_space(msl) < 0) - return -1; - } - } - return 0; -} - -/* - * We cannot reallocate memseg lists on the fly because PPC64 stores pages - * backwards, therefore we have to process the entire memseg first before - * remapping it into memseg list VA space. - */ -static int -remap_needed_hugepages(struct hugepage_file *hugepages, int n_pages) -{ - int cur_page, seg_start_page, new_memseg, ret; - - seg_start_page = 0; - for (cur_page = 0; cur_page < n_pages; cur_page++) { - struct hugepage_file *prev, *cur; - - new_memseg = 0; - - cur = &hugepages[cur_page]; - prev = cur_page == 0 ? NULL : &hugepages[cur_page - 1]; - - /* if size is zero, no more pages left */ - if (cur->size == 0) - break; - - if (cur_page == 0) - new_memseg = 1; - else if (cur->socket_id != prev->socket_id) - new_memseg = 1; - else if (cur->size != prev->size) - new_memseg = 1; -#ifdef RTE_ARCH_PPC_64 - /* On PPC64 architecture, the mmap always start from higher - * address to lower address. Here, physical addresses are in - * descending order. 
- */ - else if ((prev->physaddr - cur->physaddr) != cur->size) - new_memseg = 1; -#else - else if ((cur->physaddr - prev->physaddr) != cur->size) - new_memseg = 1; -#endif - - if (new_memseg) { - /* if this isn't the first time, remap segment */ - if (cur_page != 0) { - ret = remap_segment(hugepages, seg_start_page, - cur_page); - if (ret != 0) - return -1; - } - /* remember where we started */ - seg_start_page = cur_page; - } - /* continuation of previous memseg */ - } - /* we were stopped, but we didn't remap the last segment, do it now */ - if (cur_page != 0) { - ret = remap_segment(hugepages, seg_start_page, - cur_page); - if (ret != 0) - return -1; - } - return 0; -} - -static inline uint64_t -get_socket_mem_size(int socket) -{ - uint64_t size = 0; - unsigned i; - - for (i = 0; i < internal_config.num_hugepage_sizes; i++){ - struct hugepage_info *hpi = &internal_config.hugepage_info[i]; - size += hpi->hugepage_sz * hpi->num_pages[socket]; - } - - return size; -} - -/* - * This function is a NUMA-aware equivalent of calc_num_pages. 
- * It takes in the list of hugepage sizes and the - * number of pages thereof, and calculates the best number of - * pages of each size to fulfill the request for ram - */ -static int -calc_num_pages_per_socket(uint64_t * memory, - struct hugepage_info *hp_info, - struct hugepage_info *hp_used, - unsigned num_hp_info) -{ - unsigned socket, j, i = 0; - unsigned requested, available; - int total_num_pages = 0; - uint64_t remaining_mem, cur_mem; - uint64_t total_mem = internal_config.memory; - - if (num_hp_info == 0) - return -1; - - /* if specific memory amounts per socket weren't requested */ - if (internal_config.force_sockets == 0) { - size_t total_size; -#ifdef RTE_ARCH_64 - int cpu_per_socket[RTE_MAX_NUMA_NODES]; - size_t default_size; - unsigned lcore_id; - - /* Compute number of cores per socket */ - memset(cpu_per_socket, 0, sizeof(cpu_per_socket)); - RTE_LCORE_FOREACH(lcore_id) { - cpu_per_socket[rte_lcore_to_socket_id(lcore_id)]++; - } - - /* - * Automatically spread requested memory amongst detected sockets according - * to number of cores from cpu mask present on each socket - */ - total_size = internal_config.memory; - for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) { - - /* Set memory amount per socket */ - default_size = (internal_config.memory * cpu_per_socket[socket]) - / rte_lcore_count(); - - /* Limit to maximum available memory on socket */ - default_size = RTE_MIN(default_size, get_socket_mem_size(socket)); - - /* Update sizes */ - memory[socket] = default_size; - total_size -= default_size; - } - - /* - * If some memory is remaining, try to allocate it by getting all - * available memory from sockets, one after the other - */ - for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) { - /* take whatever is available */ - default_size = RTE_MIN(get_socket_mem_size(socket) - memory[socket], - total_size); - - /* Update sizes */ - memory[socket] += default_size; - total_size -= default_size; - } -#else 
- /* in 32-bit mode, allocate all of the memory only on master - * lcore socket - */ - total_size = internal_config.memory; - for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; - socket++) { - struct rte_config *cfg = rte_eal_get_configuration(); - unsigned int master_lcore_socket; - - master_lcore_socket = - rte_lcore_to_socket_id(cfg->master_lcore); - - if (master_lcore_socket != socket) - continue; - - /* Update sizes */ - memory[socket] = total_size; - break; - } -#endif - } - - for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) { - /* skips if the memory on specific socket wasn't requested */ - for (i = 0; i < num_hp_info && memory[socket] != 0; i++){ - strlcpy(hp_used[i].hugedir, hp_info[i].hugedir, - sizeof(hp_used[i].hugedir)); - hp_used[i].num_pages[socket] = RTE_MIN( - memory[socket] / hp_info[i].hugepage_sz, - hp_info[i].num_pages[socket]); - - cur_mem = hp_used[i].num_pages[socket] * - hp_used[i].hugepage_sz; - - memory[socket] -= cur_mem; - total_mem -= cur_mem; - - total_num_pages += hp_used[i].num_pages[socket]; - - /* check if we have met all memory requests */ - if (memory[socket] == 0) - break; - - /* check if we have any more pages left at this size, if so - * move on to next size */ - if (hp_used[i].num_pages[socket] == hp_info[i].num_pages[socket]) - continue; - /* At this point we know that there are more pages available that are - * bigger than the memory we want, so lets see if we can get enough - * from other page sizes. 
- */ - remaining_mem = 0; - for (j = i+1; j < num_hp_info; j++) - remaining_mem += hp_info[j].hugepage_sz * - hp_info[j].num_pages[socket]; - - /* is there enough other memory, if not allocate another page and quit */ - if (remaining_mem < memory[socket]){ - cur_mem = RTE_MIN(memory[socket], - hp_info[i].hugepage_sz); - memory[socket] -= cur_mem; - total_mem -= cur_mem; - hp_used[i].num_pages[socket]++; - total_num_pages++; - break; /* we are done with this socket*/ - } - } - /* if we didn't satisfy all memory requirements per socket */ - if (memory[socket] > 0 && - internal_config.socket_mem[socket] != 0) { - /* to prevent icc errors */ - requested = (unsigned) (internal_config.socket_mem[socket] / - 0x100000); - available = requested - - ((unsigned) (memory[socket] / 0x100000)); - RTE_LOG(ERR, EAL, "Not enough memory available on socket %u! " - "Requested: %uMB, available: %uMB\n", socket, - requested, available); - return -1; - } - } - - /* if we didn't satisfy total memory requirements */ - if (total_mem > 0) { - requested = (unsigned) (internal_config.memory / 0x100000); - available = requested - (unsigned) (total_mem / 0x100000); - RTE_LOG(ERR, EAL, "Not enough memory available! Requested: %uMB," - " available: %uMB\n", requested, available); - return -1; - } - return total_num_pages; -} - -static inline size_t -eal_get_hugepage_mem_size(void) -{ - uint64_t size = 0; - unsigned i, j; - - for (i = 0; i < internal_config.num_hugepage_sizes; i++) { - struct hugepage_info *hpi = &internal_config.hugepage_info[i]; - if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0) { - for (j = 0; j < RTE_MAX_NUMA_NODES; j++) { - size += hpi->hugepage_sz * hpi->num_pages[j]; - } - } - } - - return (size < SIZE_MAX) ? 
(size_t)(size) : SIZE_MAX; -} - -static struct sigaction huge_action_old; -static int huge_need_recover; - -static void -huge_register_sigbus(void) -{ - sigset_t mask; - struct sigaction action; - - sigemptyset(&mask); - sigaddset(&mask, SIGBUS); - action.sa_flags = 0; - action.sa_mask = mask; - action.sa_handler = huge_sigbus_handler; - - huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old); -} - -static void -huge_recover_sigbus(void) -{ - if (huge_need_recover) { - sigaction(SIGBUS, &huge_action_old, NULL); - huge_need_recover = 0; - } -} - -/* - * Prepare physical memory mapping: fill configuration structure with - * these infos, return 0 on success. - * 1. map N huge pages in separate files in hugetlbfs - * 2. find associated physical addr - * 3. find associated NUMA socket ID - * 4. sort all huge pages by physical address - * 5. remap these N huge pages in the correct order - * 6. unmap the first mapping - * 7. fill memsegs in configuration with contiguous zones - */ -static int -eal_legacy_hugepage_init(void) -{ - struct rte_mem_config *mcfg; - struct hugepage_file *hugepage = NULL, *tmp_hp = NULL; - struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES]; - struct rte_fbarray *arr; - struct rte_memseg *ms; - - uint64_t memory[RTE_MAX_NUMA_NODES]; - - unsigned hp_offset; - int i, j; - int nr_hugefiles, nr_hugepages = 0; - void *addr; - - test_phys_addrs_available(); - - memset(used_hp, 0, sizeof(used_hp)); - - /* get pointer to global configuration */ - mcfg = rte_eal_get_configuration()->mem_config; - - /* hugetlbfs can be disabled */ - if (internal_config.no_hugetlbfs) { - struct rte_memseg_list *msl; - int n_segs, cur_seg, fd, flags; -#ifdef MEMFD_SUPPORTED - int memfd; -#endif - uint64_t page_sz; - - /* nohuge mode is legacy mode */ - internal_config.legacy_mem = 1; - - /* nohuge mode is single-file segments mode */ - internal_config.single_file_segments = 1; - - /* create a memseg list */ - msl = &mcfg->memsegs[0]; - - page_sz = RTE_PGSIZE_4K; - 
n_segs = internal_config.memory / page_sz; - - if (rte_fbarray_init(&msl->memseg_arr, "nohugemem", n_segs, - sizeof(struct rte_memseg))) { - RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n"); - return -1; - } - - /* set up parameters for anonymous mmap */ - fd = -1; - flags = MAP_PRIVATE | MAP_ANONYMOUS; - -#ifdef MEMFD_SUPPORTED - /* create a memfd and store it in the segment fd table */ - memfd = memfd_create("nohuge", 0); - if (memfd < 0) { - RTE_LOG(DEBUG, EAL, "Cannot create memfd: %s\n", - strerror(errno)); - RTE_LOG(DEBUG, EAL, "Falling back to anonymous map\n"); - } else { - /* we got an fd - now resize it */ - if (ftruncate(memfd, internal_config.memory) < 0) { - RTE_LOG(ERR, EAL, "Cannot resize memfd: %s\n", - strerror(errno)); - RTE_LOG(ERR, EAL, "Falling back to anonymous map\n"); - close(memfd); - } else { - /* creating memfd-backed file was successful. - * we want changes to memfd to be visible to - * other processes (such as vhost backend), so - * map it as shared memory. - */ - RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n"); - fd = memfd; - flags = MAP_SHARED; - } - } -#endif - addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE, - flags, fd, 0); - if (addr == MAP_FAILED) { - RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__, - strerror(errno)); - return -1; - } - msl->base_va = addr; - msl->page_sz = page_sz; - msl->socket_id = 0; - msl->len = internal_config.memory; - - /* we're in single-file segments mode, so only the segment list - * fd needs to be set up. - */ - if (fd != -1) { - if (eal_memalloc_set_seg_list_fd(0, fd) < 0) { - RTE_LOG(ERR, EAL, "Cannot set up segment list fd\n"); - /* not a serious error, proceed */ - } - } - - /* populate memsegs. 
each memseg is one page long */ - for (cur_seg = 0; cur_seg < n_segs; cur_seg++) { - arr = &msl->memseg_arr; - - ms = rte_fbarray_get(arr, cur_seg); - if (rte_eal_iova_mode() == RTE_IOVA_VA) - ms->iova = (uintptr_t)addr; - else - ms->iova = RTE_BAD_IOVA; - ms->addr = addr; - ms->hugepage_sz = page_sz; - ms->socket_id = 0; - ms->len = page_sz; - - rte_fbarray_set_used(arr, cur_seg); - - addr = RTE_PTR_ADD(addr, (size_t)page_sz); - } - if (mcfg->dma_maskbits && - rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) { - RTE_LOG(ERR, EAL, - "%s(): couldnt allocate memory due to IOVA exceeding limits of current DMA mask.\n", - __func__); - if (rte_eal_iova_mode() == RTE_IOVA_VA && - rte_eal_using_phys_addrs()) - RTE_LOG(ERR, EAL, - "%s(): Please try initializing EAL with --iova-mode=pa parameter.\n", - __func__); - goto fail; - } - return 0; - } - - /* calculate total number of hugepages available. at this point we haven't - * yet started sorting them so they all are on socket 0 */ - for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) { - /* meanwhile, also initialize used_hp hugepage sizes in used_hp */ - used_hp[i].hugepage_sz = internal_config.hugepage_info[i].hugepage_sz; - - nr_hugepages += internal_config.hugepage_info[i].num_pages[0]; - } - - /* - * allocate a memory area for hugepage table. - * this isn't shared memory yet. due to the fact that we need some - * processing done on these pages, shared memory will be created - * at a later stage. - */ - tmp_hp = malloc(nr_hugepages * sizeof(struct hugepage_file)); - if (tmp_hp == NULL) - goto fail; - - memset(tmp_hp, 0, nr_hugepages * sizeof(struct hugepage_file)); - - hp_offset = 0; /* where we start the current page size entries */ - - huge_register_sigbus(); - - /* make a copy of socket_mem, needed for balanced allocation. 
*/ - for (i = 0; i < RTE_MAX_NUMA_NODES; i++) - memory[i] = internal_config.socket_mem[i]; - - /* map all hugepages and sort them */ - for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){ - unsigned pages_old, pages_new; - struct hugepage_info *hpi; - - /* - * we don't yet mark hugepages as used at this stage, so - * we just map all hugepages available to the system - * all hugepages are still located on socket 0 - */ - hpi = &internal_config.hugepage_info[i]; - - if (hpi->num_pages[0] == 0) - continue; - - /* map all hugepages available */ - pages_old = hpi->num_pages[0]; - pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, memory); - if (pages_new < pages_old) { - RTE_LOG(DEBUG, EAL, - "%d not %d hugepages of size %u MB allocated\n", - pages_new, pages_old, - (unsigned)(hpi->hugepage_sz / 0x100000)); - - int pages = pages_old - pages_new; - - nr_hugepages -= pages; - hpi->num_pages[0] = pages_new; - if (pages_new == 0) - continue; - } - - if (phys_addrs_available && - rte_eal_iova_mode() != RTE_IOVA_VA) { - /* find physical addresses for each hugepage */ - if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0) { - RTE_LOG(DEBUG, EAL, "Failed to find phys addr " - "for %u MB pages\n", - (unsigned int)(hpi->hugepage_sz / 0x100000)); - goto fail; - } - } else { - /* set physical addresses for each hugepage */ - if (set_physaddrs(&tmp_hp[hp_offset], hpi) < 0) { - RTE_LOG(DEBUG, EAL, "Failed to set phys addr " - "for %u MB pages\n", - (unsigned int)(hpi->hugepage_sz / 0x100000)); - goto fail; - } - } - - if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0){ - RTE_LOG(DEBUG, EAL, "Failed to find NUMA socket for %u MB pages\n", - (unsigned)(hpi->hugepage_sz / 0x100000)); - goto fail; - } - - qsort(&tmp_hp[hp_offset], hpi->num_pages[0], - sizeof(struct hugepage_file), cmp_physaddr); - - /* we have processed a num of hugepages of this size, so inc offset */ - hp_offset += hpi->num_pages[0]; - } - - huge_recover_sigbus(); - - if (internal_config.memory == 0 && 
internal_config.force_sockets == 0) - internal_config.memory = eal_get_hugepage_mem_size(); - - nr_hugefiles = nr_hugepages; - - - /* clean out the numbers of pages */ - for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) - for (j = 0; j < RTE_MAX_NUMA_NODES; j++) - internal_config.hugepage_info[i].num_pages[j] = 0; - - /* get hugepages for each socket */ - for (i = 0; i < nr_hugefiles; i++) { - int socket = tmp_hp[i].socket_id; - - /* find a hugepage info with right size and increment num_pages */ - const int nb_hpsizes = RTE_MIN(MAX_HUGEPAGE_SIZES, - (int)internal_config.num_hugepage_sizes); - for (j = 0; j < nb_hpsizes; j++) { - if (tmp_hp[i].size == - internal_config.hugepage_info[j].hugepage_sz) { - internal_config.hugepage_info[j].num_pages[socket]++; - } - } - } - - /* make a copy of socket_mem, needed for number of pages calculation */ - for (i = 0; i < RTE_MAX_NUMA_NODES; i++) - memory[i] = internal_config.socket_mem[i]; - - /* calculate final number of pages */ - nr_hugepages = calc_num_pages_per_socket(memory, - internal_config.hugepage_info, used_hp, - internal_config.num_hugepage_sizes); - - /* error if not enough memory available */ - if (nr_hugepages < 0) - goto fail; - - /* reporting in! */ - for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) { - for (j = 0; j < RTE_MAX_NUMA_NODES; j++) { - if (used_hp[i].num_pages[j] > 0) { - RTE_LOG(DEBUG, EAL, - "Requesting %u pages of size %uMB" - " from socket %i\n", - used_hp[i].num_pages[j], - (unsigned) - (used_hp[i].hugepage_sz / 0x100000), - j); - } - } - } - - /* create shared memory */ - hugepage = create_shared_memory(eal_hugepage_data_path(), - nr_hugefiles * sizeof(struct hugepage_file)); - - if (hugepage == NULL) { - RTE_LOG(ERR, EAL, "Failed to create shared memory!\n"); - goto fail; - } - memset(hugepage, 0, nr_hugefiles * sizeof(struct hugepage_file)); - - /* - * unmap pages that we won't need (looks at used_hp). - * also, sets final_va to NULL on pages that were unmapped. 
- */ - if (unmap_unneeded_hugepages(tmp_hp, used_hp, - internal_config.num_hugepage_sizes) < 0) { - RTE_LOG(ERR, EAL, "Unmapping and locking hugepages failed!\n"); - goto fail; - } - - /* - * copy stuff from malloc'd hugepage* to the actual shared memory. - * this procedure only copies those hugepages that have orig_va - * not NULL. has overflow protection. - */ - if (copy_hugepages_to_shared_mem(hugepage, nr_hugefiles, - tmp_hp, nr_hugefiles) < 0) { - RTE_LOG(ERR, EAL, "Copying tables to shared memory failed!\n"); - goto fail; - } - -#ifndef RTE_ARCH_64 - /* for legacy 32-bit mode, we did not preallocate VA space, so do it */ - if (internal_config.legacy_mem && - prealloc_segments(hugepage, nr_hugefiles)) { - RTE_LOG(ERR, EAL, "Could not preallocate VA space for hugepages\n"); - goto fail; - } -#endif - - /* remap all pages we do need into memseg list VA space, so that those - * pages become first-class citizens in DPDK memory subsystem - */ - if (remap_needed_hugepages(hugepage, nr_hugefiles)) { - RTE_LOG(ERR, EAL, "Couldn't remap hugepage files into memseg lists\n"); - goto fail; - } - - /* free the hugepage backing files */ - if (internal_config.hugepage_unlink && - unlink_hugepage_files(tmp_hp, internal_config.num_hugepage_sizes) < 0) { - RTE_LOG(ERR, EAL, "Unlinking hugepage files failed!\n"); - goto fail; - } - - /* free the temporary hugepage table */ - free(tmp_hp); - tmp_hp = NULL; - - munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file)); - hugepage = NULL; - - /* we're not going to allocate more pages, so release VA space for - * unused memseg lists - */ - for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { - struct rte_memseg_list *msl = &mcfg->memsegs[i]; - size_t mem_sz; - - /* skip inactive lists */ - if (msl->base_va == NULL) - continue; - /* skip lists where there is at least one page allocated */ - if (msl->memseg_arr.count > 0) - continue; - /* this is an unused list, deallocate it */ - mem_sz = msl->len; - munmap(msl->base_va, mem_sz); - 
msl->base_va = NULL; - - /* destroy backing fbarray */ - rte_fbarray_destroy(&msl->memseg_arr); - } - - if (mcfg->dma_maskbits && - rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) { - RTE_LOG(ERR, EAL, - "%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask.\n", - __func__); - goto fail; - } - - return 0; - -fail: - huge_recover_sigbus(); - free(tmp_hp); - if (hugepage != NULL) - munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file)); - - return -1; -} - -static int __rte_unused -hugepage_count_walk(const struct rte_memseg_list *msl, void *arg) -{ - struct hugepage_info *hpi = arg; - - if (msl->page_sz != hpi->hugepage_sz) - return 0; - - hpi->num_pages[msl->socket_id] += msl->memseg_arr.len; - return 0; -} - -static int -limits_callback(int socket_id, size_t cur_limit, size_t new_len) -{ - RTE_SET_USED(socket_id); - RTE_SET_USED(cur_limit); - RTE_SET_USED(new_len); - return -1; -} - -static int -eal_hugepage_init(void) -{ - struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES]; - uint64_t memory[RTE_MAX_NUMA_NODES]; - int hp_sz_idx, socket_id; - - test_phys_addrs_available(); - - memset(used_hp, 0, sizeof(used_hp)); - - for (hp_sz_idx = 0; - hp_sz_idx < (int) internal_config.num_hugepage_sizes; - hp_sz_idx++) { -#ifndef RTE_ARCH_64 - struct hugepage_info dummy; - unsigned int i; -#endif - /* also initialize used_hp hugepage sizes in used_hp */ - struct hugepage_info *hpi; - hpi = &internal_config.hugepage_info[hp_sz_idx]; - used_hp[hp_sz_idx].hugepage_sz = hpi->hugepage_sz; - -#ifndef RTE_ARCH_64 - /* for 32-bit, limit number of pages on socket to whatever we've - * preallocated, as we cannot allocate more. 
- */ - memset(&dummy, 0, sizeof(dummy)); - dummy.hugepage_sz = hpi->hugepage_sz; - if (rte_memseg_list_walk(hugepage_count_walk, &dummy) < 0) - return -1; - - for (i = 0; i < RTE_DIM(dummy.num_pages); i++) { - hpi->num_pages[i] = RTE_MIN(hpi->num_pages[i], - dummy.num_pages[i]); - } -#endif - } - - /* make a copy of socket_mem, needed for balanced allocation. */ - for (hp_sz_idx = 0; hp_sz_idx < RTE_MAX_NUMA_NODES; hp_sz_idx++) - memory[hp_sz_idx] = internal_config.socket_mem[hp_sz_idx]; - - /* calculate final number of pages */ - if (calc_num_pages_per_socket(memory, - internal_config.hugepage_info, used_hp, - internal_config.num_hugepage_sizes) < 0) - return -1; - - for (hp_sz_idx = 0; - hp_sz_idx < (int)internal_config.num_hugepage_sizes; - hp_sz_idx++) { - for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES; - socket_id++) { - struct rte_memseg **pages; - struct hugepage_info *hpi = &used_hp[hp_sz_idx]; - unsigned int num_pages = hpi->num_pages[socket_id]; - int num_pages_alloc, i; - - if (num_pages == 0) - continue; - - pages = malloc(sizeof(*pages) * num_pages); - - RTE_LOG(DEBUG, EAL, "Allocating %u pages of size %" PRIu64 "M on socket %i\n", - num_pages, hpi->hugepage_sz >> 20, socket_id); - - num_pages_alloc = eal_memalloc_alloc_seg_bulk(pages, - num_pages, hpi->hugepage_sz, - socket_id, true); - if (num_pages_alloc < 0) { - free(pages); - return -1; - } - - /* mark preallocated pages as unfreeable */ - for (i = 0; i < num_pages_alloc; i++) { - struct rte_memseg *ms = pages[i]; - ms->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE; - } - free(pages); - } - } - /* if socket limits were specified, set them */ - if (internal_config.force_socket_limits) { - unsigned int i; - for (i = 0; i < RTE_MAX_NUMA_NODES; i++) { - uint64_t limit = internal_config.socket_limit[i]; - if (limit == 0) - continue; - if (rte_mem_alloc_validator_register("socket-limit", - limits_callback, i, limit)) - RTE_LOG(ERR, EAL, "Failed to register socket limits validator callback\n"); - } - } - 
return 0; -} - -/* - * uses fstat to report the size of a file on disk - */ -static off_t -getFileSize(int fd) -{ - struct stat st; - if (fstat(fd, &st) < 0) - return 0; - return st.st_size; -} - -/* - * This creates the memory mappings in the secondary process to match that of - * the server process. It goes through each memory segment in the DPDK runtime - * configuration and finds the hugepages which form that segment, mapping them - * in order to form a contiguous block in the virtual memory space - */ -static int -eal_legacy_hugepage_attach(void) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - struct hugepage_file *hp = NULL; - unsigned int num_hp = 0; - unsigned int i = 0; - unsigned int cur_seg; - off_t size = 0; - int fd, fd_hugepage = -1; - - if (aslr_enabled() > 0) { - RTE_LOG(WARNING, EAL, "WARNING: Address Space Layout Randomization " - "(ASLR) is enabled in the kernel.\n"); - RTE_LOG(WARNING, EAL, " This may cause issues with mapping memory " - "into secondary processes\n"); - } - - test_phys_addrs_available(); - - fd_hugepage = open(eal_hugepage_data_path(), O_RDONLY); - if (fd_hugepage < 0) { - RTE_LOG(ERR, EAL, "Could not open %s\n", - eal_hugepage_data_path()); - goto error; - } - - size = getFileSize(fd_hugepage); - hp = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd_hugepage, 0); - if (hp == MAP_FAILED) { - RTE_LOG(ERR, EAL, "Could not mmap %s\n", - eal_hugepage_data_path()); - goto error; - } - - num_hp = size / sizeof(struct hugepage_file); - RTE_LOG(DEBUG, EAL, "Analysing %u files\n", num_hp); - - /* map all segments into memory to make sure we get the addrs. the - * segments themselves are already in memseg list (which is shared and - * has its VA space already preallocated), so we just need to map - * everything into correct addresses. 
- */ - for (i = 0; i < num_hp; i++) { - struct hugepage_file *hf = &hp[i]; - size_t map_sz = hf->size; - void *map_addr = hf->final_va; - int msl_idx, ms_idx; - struct rte_memseg_list *msl; - struct rte_memseg *ms; - - /* if size is zero, no more pages left */ - if (map_sz == 0) - break; - - fd = open(hf->filepath, O_RDWR); - if (fd < 0) { - RTE_LOG(ERR, EAL, "Could not open %s: %s\n", - hf->filepath, strerror(errno)); - goto error; - } - - map_addr = mmap(map_addr, map_sz, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, fd, 0); - if (map_addr == MAP_FAILED) { - RTE_LOG(ERR, EAL, "Could not map %s: %s\n", - hf->filepath, strerror(errno)); - goto fd_error; - } - - /* set shared lock on the file. */ - if (flock(fd, LOCK_SH) < 0) { - RTE_LOG(DEBUG, EAL, "%s(): Locking file failed: %s\n", - __func__, strerror(errno)); - goto fd_error; - } - - /* find segment data */ - msl = rte_mem_virt2memseg_list(map_addr); - if (msl == NULL) { - RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg list\n", - __func__); - goto fd_error; - } - ms = rte_mem_virt2memseg(map_addr, msl); - if (ms == NULL) { - RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg\n", - __func__); - goto fd_error; - } - - msl_idx = msl - mcfg->memsegs; - ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); - if (ms_idx < 0) { - RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg idx\n", - __func__); - goto fd_error; - } - - /* store segment fd internally */ - if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0) - RTE_LOG(ERR, EAL, "Could not store segment fd: %s\n", - rte_strerror(rte_errno)); - } - /* unmap the hugepage config file, since we are done using it */ - munmap(hp, size); - close(fd_hugepage); - return 0; - -fd_error: - close(fd); -error: - /* map all segments into memory to make sure we get the addrs */ - cur_seg = 0; - for (cur_seg = 0; cur_seg < i; cur_seg++) { - struct hugepage_file *hf = &hp[i]; - size_t map_sz = hf->size; - void *map_addr = hf->final_va; - - munmap(map_addr, map_sz); - } - if (hp != NULL && 
hp != MAP_FAILED) - munmap(hp, size); - if (fd_hugepage >= 0) - close(fd_hugepage); - return -1; -} - -static int -eal_hugepage_attach(void) -{ - if (eal_memalloc_sync_with_primary()) { - RTE_LOG(ERR, EAL, "Could not map memory from primary process\n"); - if (aslr_enabled() > 0) - RTE_LOG(ERR, EAL, "It is recommended to disable ASLR in the kernel and retry running both primary and secondary processes\n"); - return -1; - } - return 0; -} - -int -rte_eal_hugepage_init(void) -{ - return internal_config.legacy_mem ? - eal_legacy_hugepage_init() : - eal_hugepage_init(); -} - -int -rte_eal_hugepage_attach(void) -{ - return internal_config.legacy_mem ? - eal_legacy_hugepage_attach() : - eal_hugepage_attach(); -} - -int -rte_eal_using_phys_addrs(void) -{ - return phys_addrs_available; -} - -static int __rte_unused -memseg_primary_init_32(void) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - int active_sockets, hpi_idx, msl_idx = 0; - unsigned int socket_id, i; - struct rte_memseg_list *msl; - uint64_t extra_mem_per_socket, total_extra_mem, total_requested_mem; - uint64_t max_mem; - - /* no-huge does not need this at all */ - if (internal_config.no_hugetlbfs) - return 0; - - /* this is a giant hack, but desperate times call for desperate - * measures. in legacy 32-bit mode, we cannot preallocate VA space, - * because having upwards of 2 gigabytes of VA space already mapped will - * interfere with our ability to map and sort hugepages. - * - * therefore, in legacy 32-bit mode, we will be initializing memseg - * lists much later - in eal_memory.c, right after we unmap all the - * unneeded pages. this will not affect secondary processes, as those - * should be able to mmap the space without (too many) problems. - */ - if (internal_config.legacy_mem) - return 0; - - /* 32-bit mode is a very special case. we cannot know in advance where - * the user will want to allocate their memory, so we have to do some - * heuristics. 
- */ - active_sockets = 0; - total_requested_mem = 0; - if (internal_config.force_sockets) - for (i = 0; i < rte_socket_count(); i++) { - uint64_t mem; - - socket_id = rte_socket_id_by_idx(i); - mem = internal_config.socket_mem[socket_id]; - - if (mem == 0) - continue; - - active_sockets++; - total_requested_mem += mem; - } - else - total_requested_mem = internal_config.memory; - - max_mem = (uint64_t)RTE_MAX_MEM_MB << 20; - if (total_requested_mem > max_mem) { - RTE_LOG(ERR, EAL, "Invalid parameters: 32-bit process can at most use %uM of memory\n", - (unsigned int)(max_mem >> 20)); - return -1; - } - total_extra_mem = max_mem - total_requested_mem; - extra_mem_per_socket = active_sockets == 0 ? total_extra_mem : - total_extra_mem / active_sockets; - - /* the allocation logic is a little bit convoluted, but here's how it - * works, in a nutshell: - * - if user hasn't specified on which sockets to allocate memory via - * --socket-mem, we allocate all of our memory on master core socket. - * - if user has specified sockets to allocate memory on, there may be - * some "unused" memory left (e.g. if user has specified --socket-mem - * such that not all memory adds up to 2 gigabytes), so add it to all - * sockets that are in use equally. - * - * page sizes are sorted by size in descending order, so we can safely - * assume that we dispense with bigger page sizes first. 
- */ - - /* create memseg lists */ - for (i = 0; i < rte_socket_count(); i++) { - int hp_sizes = (int) internal_config.num_hugepage_sizes; - uint64_t max_socket_mem, cur_socket_mem; - unsigned int master_lcore_socket; - struct rte_config *cfg = rte_eal_get_configuration(); - bool skip; - - socket_id = rte_socket_id_by_idx(i); - -#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES - if (socket_id > 0) - break; -#endif - - /* if we didn't specifically request memory on this socket */ - skip = active_sockets != 0 && - internal_config.socket_mem[socket_id] == 0; - /* ...or if we didn't specifically request memory on *any* - * socket, and this is not master lcore - */ - master_lcore_socket = rte_lcore_to_socket_id(cfg->master_lcore); - skip |= active_sockets == 0 && socket_id != master_lcore_socket; - - if (skip) { - RTE_LOG(DEBUG, EAL, "Will not preallocate memory on socket %u\n", - socket_id); - continue; - } - - /* max amount of memory on this socket */ - max_socket_mem = (active_sockets != 0 ? - internal_config.socket_mem[socket_id] : - internal_config.memory) + - extra_mem_per_socket; - cur_socket_mem = 0; - - for (hpi_idx = 0; hpi_idx < hp_sizes; hpi_idx++) { - uint64_t max_pagesz_mem, cur_pagesz_mem = 0; - uint64_t hugepage_sz; - struct hugepage_info *hpi; - int type_msl_idx, max_segs, total_segs = 0; - - hpi = &internal_config.hugepage_info[hpi_idx]; - hugepage_sz = hpi->hugepage_sz; - - /* check if pages are actually available */ - if (hpi->num_pages[socket_id] == 0) - continue; - - max_segs = RTE_MAX_MEMSEG_PER_TYPE; - max_pagesz_mem = max_socket_mem - cur_socket_mem; - - /* make it multiple of page size */ - max_pagesz_mem = RTE_ALIGN_FLOOR(max_pagesz_mem, - hugepage_sz); - - RTE_LOG(DEBUG, EAL, "Attempting to preallocate " - "%" PRIu64 "M on socket %i\n", - max_pagesz_mem >> 20, socket_id); - - type_msl_idx = 0; - while (cur_pagesz_mem < max_pagesz_mem && - total_segs < max_segs) { - uint64_t cur_mem; - unsigned int n_segs; - - if (msl_idx >= RTE_MAX_MEMSEG_LISTS) { - 
RTE_LOG(ERR, EAL, - "No more space in memseg lists, please increase %s\n", - RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); - return -1; - } - - msl = &mcfg->memsegs[msl_idx]; - - cur_mem = get_mem_amount(hugepage_sz, - max_pagesz_mem); - n_segs = cur_mem / hugepage_sz; - - if (alloc_memseg_list(msl, hugepage_sz, n_segs, - socket_id, type_msl_idx)) { - /* failing to allocate a memseg list is - * a serious error. - */ - RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n"); - return -1; - } - - if (alloc_va_space(msl)) { - /* if we couldn't allocate VA space, we - * can try with smaller page sizes. - */ - RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list, retrying with different page size\n"); - /* deallocate memseg list */ - if (free_memseg_list(msl)) - return -1; - break; - } - - total_segs += msl->memseg_arr.len; - cur_pagesz_mem = total_segs * hugepage_sz; - type_msl_idx++; - msl_idx++; - } - cur_socket_mem += cur_pagesz_mem; - } - if (cur_socket_mem == 0) { - RTE_LOG(ERR, EAL, "Cannot allocate VA space on socket %u\n", - socket_id); - return -1; - } - } - - return 0; -} - -static int __rte_unused -memseg_primary_init(void) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - struct memtype { - uint64_t page_sz; - int socket_id; - } *memtypes = NULL; - int i, hpi_idx, msl_idx, ret = -1; /* fail unless told to succeed */ - struct rte_memseg_list *msl; - uint64_t max_mem, max_mem_per_type; - unsigned int max_seglists_per_type; - unsigned int n_memtypes, cur_type; - - /* no-huge does not need this at all */ - if (internal_config.no_hugetlbfs) - return 0; - - /* - * figuring out amount of memory we're going to have is a long and very - * involved process. the basic element we're operating with is a memory - * type, defined as a combination of NUMA node ID and page size (so that - * e.g. 2 sockets with 2 page sizes yield 4 memory types in total). 
- * - * deciding amount of memory going towards each memory type is a - * balancing act between maximum segments per type, maximum memory per - * type, and number of detected NUMA nodes. the goal is to make sure - * each memory type gets at least one memseg list. - * - * the total amount of memory is limited by RTE_MAX_MEM_MB value. - * - * the total amount of memory per type is limited by either - * RTE_MAX_MEM_MB_PER_TYPE, or by RTE_MAX_MEM_MB divided by the number - * of detected NUMA nodes. additionally, maximum number of segments per - * type is also limited by RTE_MAX_MEMSEG_PER_TYPE. this is because for - * smaller page sizes, it can take hundreds of thousands of segments to - * reach the above specified per-type memory limits. - * - * additionally, each type may have multiple memseg lists associated - * with it, each limited by either RTE_MAX_MEM_MB_PER_LIST for bigger - * page sizes, or RTE_MAX_MEMSEG_PER_LIST segments for smaller ones. - * - * the number of memseg lists per type is decided based on the above - * limits, and also taking number of detected NUMA nodes, to make sure - * that we don't run out of memseg lists before we populate all NUMA - * nodes with memory. - * - * we do this in three stages. first, we collect the number of types. - * then, we figure out memory constraints and populate the list of - * would-be memseg lists. then, we go ahead and allocate the memseg - * lists. 
- */ - - /* create space for mem types */ - n_memtypes = internal_config.num_hugepage_sizes * rte_socket_count(); - memtypes = calloc(n_memtypes, sizeof(*memtypes)); - if (memtypes == NULL) { - RTE_LOG(ERR, EAL, "Cannot allocate space for memory types\n"); - return -1; - } - - /* populate mem types */ - cur_type = 0; - for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes; - hpi_idx++) { - struct hugepage_info *hpi; - uint64_t hugepage_sz; - - hpi = &internal_config.hugepage_info[hpi_idx]; - hugepage_sz = hpi->hugepage_sz; - - for (i = 0; i < (int) rte_socket_count(); i++, cur_type++) { - int socket_id = rte_socket_id_by_idx(i); - -#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES - if (socket_id > 0) - break; -#endif - memtypes[cur_type].page_sz = hugepage_sz; - memtypes[cur_type].socket_id = socket_id; - - RTE_LOG(DEBUG, EAL, "Detected memory type: " - "socket_id:%u hugepage_sz:%" PRIu64 "\n", - socket_id, hugepage_sz); - } - } - /* number of memtypes could have been lower due to no NUMA support */ - n_memtypes = cur_type; - - /* set up limits for types */ - max_mem = (uint64_t)RTE_MAX_MEM_MB << 20; - max_mem_per_type = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20, - max_mem / n_memtypes); - /* - * limit maximum number of segment lists per type to ensure there's - * space for memseg lists for all NUMA nodes with all page sizes - */ - max_seglists_per_type = RTE_MAX_MEMSEG_LISTS / n_memtypes; - - if (max_seglists_per_type == 0) { - RTE_LOG(ERR, EAL, "Cannot accommodate all memory types, please increase %s\n", - RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); - goto out; - } - - /* go through all mem types and create segment lists */ - msl_idx = 0; - for (cur_type = 0; cur_type < n_memtypes; cur_type++) { - unsigned int cur_seglist, n_seglists, n_segs; - unsigned int max_segs_per_type, max_segs_per_list; - struct memtype *type = &memtypes[cur_type]; - uint64_t max_mem_per_list, pagesz; - int socket_id; - - pagesz = type->page_sz; - socket_id = type->socket_id; - - /* - 
* we need to create segment lists for this type. we must take - * into account the following things: - * - * 1. total amount of memory we can use for this memory type - * 2. total amount of memory per memseg list allowed - * 3. number of segments needed to fit the amount of memory - * 4. number of segments allowed per type - * 5. number of segments allowed per memseg list - * 6. number of memseg lists we are allowed to take up - */ - - /* calculate how much segments we will need in total */ - max_segs_per_type = max_mem_per_type / pagesz; - /* limit number of segments to maximum allowed per type */ - max_segs_per_type = RTE_MIN(max_segs_per_type, - (unsigned int)RTE_MAX_MEMSEG_PER_TYPE); - /* limit number of segments to maximum allowed per list */ - max_segs_per_list = RTE_MIN(max_segs_per_type, - (unsigned int)RTE_MAX_MEMSEG_PER_LIST); - - /* calculate how much memory we can have per segment list */ - max_mem_per_list = RTE_MIN(max_segs_per_list * pagesz, - (uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20); - - /* calculate how many segments each segment list will have */ - n_segs = RTE_MIN(max_segs_per_list, max_mem_per_list / pagesz); - - /* calculate how many segment lists we can have */ - n_seglists = RTE_MIN(max_segs_per_type / n_segs, - max_mem_per_type / max_mem_per_list); - - /* limit number of segment lists according to our maximum */ - n_seglists = RTE_MIN(n_seglists, max_seglists_per_type); - - RTE_LOG(DEBUG, EAL, "Creating %i segment lists: " - "n_segs:%i socket_id:%i hugepage_sz:%" PRIu64 "\n", - n_seglists, n_segs, socket_id, pagesz); - - /* create all segment lists */ - for (cur_seglist = 0; cur_seglist < n_seglists; cur_seglist++) { - if (msl_idx >= RTE_MAX_MEMSEG_LISTS) { - RTE_LOG(ERR, EAL, - "No more space in memseg lists, please increase %s\n", - RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); - goto out; - } - msl = &mcfg->memsegs[msl_idx++]; - - if (alloc_memseg_list(msl, pagesz, n_segs, - socket_id, cur_seglist)) - goto out; - - if (alloc_va_space(msl)) { - 
RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n"); - goto out; - } - } - } - /* we're successful */ - ret = 0; -out: - free(memtypes); - return ret; -} - -static int -memseg_secondary_init(void) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - int msl_idx = 0; - struct rte_memseg_list *msl; - - for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) { - - msl = &mcfg->memsegs[msl_idx]; - - /* skip empty memseg lists */ - if (msl->memseg_arr.len == 0) - continue; - - if (rte_fbarray_attach(&msl->memseg_arr)) { - RTE_LOG(ERR, EAL, "Cannot attach to primary process memseg lists\n"); - return -1; - } - - /* preallocate VA space */ - if (alloc_va_space(msl)) { - RTE_LOG(ERR, EAL, "Cannot preallocate VA space for hugepage memory\n"); - return -1; - } - } - - return 0; -} - -int -rte_eal_memseg_init(void) -{ - /* increase rlimit to maximum */ - struct rlimit lim; - - if (getrlimit(RLIMIT_NOFILE, &lim) == 0) { - /* set limit to maximum */ - lim.rlim_cur = lim.rlim_max; - - if (setrlimit(RLIMIT_NOFILE, &lim) < 0) { - RTE_LOG(DEBUG, EAL, "Setting maximum number of open files failed: %s\n", - strerror(errno)); - } else { - RTE_LOG(DEBUG, EAL, "Setting maximum number of open files to %" - PRIu64 "\n", - (uint64_t)lim.rlim_cur); - } - } else { - RTE_LOG(ERR, EAL, "Cannot get current resource limits\n"); - } - - return rte_eal_process_type() == RTE_PROC_PRIMARY ? 
-#ifndef RTE_ARCH_64 - memseg_primary_init_32() : -#else - memseg_primary_init() : -#endif - memseg_secondary_init(); -} diff --git a/lib/librte_eal/linuxapp/eal/eal_thread.c b/lib/librte_eal/linuxapp/eal/eal_thread.c deleted file mode 100644 index 379773b683..0000000000 --- a/lib/librte_eal/linuxapp/eal/eal_thread.c +++ /dev/null @@ -1,188 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2014 Intel Corporation - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "eal_private.h" -#include "eal_thread.h" - -RTE_DEFINE_PER_LCORE(unsigned, _lcore_id) = LCORE_ID_ANY; -RTE_DEFINE_PER_LCORE(unsigned, _socket_id) = (unsigned)SOCKET_ID_ANY; -RTE_DEFINE_PER_LCORE(rte_cpuset_t, _cpuset); - -/* - * Send a message to a slave lcore identified by slave_id to call a - * function f with argument arg. Once the execution is done, the - * remote lcore switch in FINISHED state. 
- */ -int -rte_eal_remote_launch(int (*f)(void *), void *arg, unsigned slave_id) -{ - int n; - char c = 0; - int m2s = lcore_config[slave_id].pipe_master2slave[1]; - int s2m = lcore_config[slave_id].pipe_slave2master[0]; - - if (lcore_config[slave_id].state != WAIT) - return -EBUSY; - - lcore_config[slave_id].f = f; - lcore_config[slave_id].arg = arg; - - /* send message */ - n = 0; - while (n == 0 || (n < 0 && errno == EINTR)) - n = write(m2s, &c, 1); - if (n < 0) - rte_panic("cannot write on configuration pipe\n"); - - /* wait ack */ - do { - n = read(s2m, &c, 1); - } while (n < 0 && errno == EINTR); - - if (n <= 0) - rte_panic("cannot read on configuration pipe\n"); - - return 0; -} - -/* set affinity for current EAL thread */ -static int -eal_thread_set_affinity(void) -{ - unsigned lcore_id = rte_lcore_id(); - - /* acquire system unique id */ - rte_gettid(); - - /* update EAL thread core affinity */ - return rte_thread_set_affinity(&lcore_config[lcore_id].cpuset); -} - -void eal_thread_init_master(unsigned lcore_id) -{ - /* set the lcore ID in per-lcore memory area */ - RTE_PER_LCORE(_lcore_id) = lcore_id; - - /* set CPU affinity */ - if (eal_thread_set_affinity() < 0) - rte_panic("cannot set affinity\n"); -} - -/* main loop of threads */ -__attribute__((noreturn)) void * -eal_thread_loop(__attribute__((unused)) void *arg) -{ - char c; - int n, ret; - unsigned lcore_id; - pthread_t thread_id; - int m2s, s2m; - char cpuset[RTE_CPU_AFFINITY_STR_LEN]; - - thread_id = pthread_self(); - - /* retrieve our lcore_id from the configuration structure */ - RTE_LCORE_FOREACH_SLAVE(lcore_id) { - if (thread_id == lcore_config[lcore_id].thread_id) - break; - } - if (lcore_id == RTE_MAX_LCORE) - rte_panic("cannot retrieve lcore id\n"); - - m2s = lcore_config[lcore_id].pipe_master2slave[0]; - s2m = lcore_config[lcore_id].pipe_slave2master[1]; - - /* set the lcore ID in per-lcore memory area */ - RTE_PER_LCORE(_lcore_id) = lcore_id; - - /* set CPU affinity */ - if 
(eal_thread_set_affinity() < 0) - rte_panic("cannot set affinity\n"); - - ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset)); - - RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%zx;cpuset=[%s%s])\n", - lcore_id, (uintptr_t)thread_id, cpuset, ret == 0 ? "" : "..."); - - /* read on our pipe to get commands */ - while (1) { - void *fct_arg; - - /* wait command */ - do { - n = read(m2s, &c, 1); - } while (n < 0 && errno == EINTR); - - if (n <= 0) - rte_panic("cannot read on configuration pipe\n"); - - lcore_config[lcore_id].state = RUNNING; - - /* send ack */ - n = 0; - while (n == 0 || (n < 0 && errno == EINTR)) - n = write(s2m, &c, 1); - if (n < 0) - rte_panic("cannot write on configuration pipe\n"); - - if (lcore_config[lcore_id].f == NULL) - rte_panic("NULL function pointer\n"); - - /* call the function and store the return value */ - fct_arg = lcore_config[lcore_id].arg; - ret = lcore_config[lcore_id].f(fct_arg); - lcore_config[lcore_id].ret = ret; - rte_wmb(); - - /* when a service core returns, it should go directly to WAIT - * state, because the application will not lcore_wait() for it. 
- */ - if (lcore_config[lcore_id].core_role == ROLE_SERVICE) - lcore_config[lcore_id].state = WAIT; - else - lcore_config[lcore_id].state = FINISHED; - } - - /* never reached */ - /* pthread_exit(NULL); */ - /* return NULL; */ -} - -/* require calling thread tid by gettid() */ -int rte_sys_gettid(void) -{ - return (int)syscall(SYS_gettid); -} - -int rte_thread_setname(pthread_t id, const char *name) -{ - int ret = ENOSYS; -#if defined(__GLIBC__) && defined(__GLIBC_PREREQ) -#if __GLIBC_PREREQ(2, 12) - ret = pthread_setname_np(id, name); -#endif -#endif - RTE_SET_USED(id); - RTE_SET_USED(name); - return -ret; -} diff --git a/lib/librte_eal/linuxapp/eal/eal_timer.c b/lib/librte_eal/linuxapp/eal/eal_timer.c deleted file mode 100644 index bc8f051990..0000000000 --- a/lib/librte_eal/linuxapp/eal/eal_timer.c +++ /dev/null @@ -1,266 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2014 Intel Corporation. - * Copyright(c) 2012-2013 6WIND S.A. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "eal_private.h" -#include "eal_internal_cfg.h" - -enum timer_source eal_timer_source = EAL_TIMER_HPET; - -#ifdef RTE_LIBEAL_USE_HPET - -#define DEV_HPET "/dev/hpet" - -/* Maximum number of counters. */ -#define HPET_TIMER_NUM 3 - -/* General capabilities register */ -#define CLK_PERIOD_SHIFT 32 /* Clock period shift. */ -#define CLK_PERIOD_MASK 0xffffffff00000000ULL /* Clock period mask. */ - -/** - * HPET timer registers. From the Intel IA-PC HPET (High Precision Event - * Timers) Specification. - */ -struct eal_hpet_regs { - /* Memory-mapped, software visible registers */ - uint64_t capabilities; /**< RO General Capabilities Register. */ - uint64_t reserved0; /**< Reserved for future use. */ - uint64_t config; /**< RW General Configuration Register. */ - uint64_t reserved1; /**< Reserved for future use. 
*/ - uint64_t isr; /**< RW Clear General Interrupt Status. */ - uint64_t reserved2[25]; /**< Reserved for future use. */ - union { - uint64_t counter; /**< RW Main Counter Value Register. */ - struct { - uint32_t counter_l; /**< RW Main Counter Low. */ - uint32_t counter_h; /**< RW Main Counter High. */ - }; - }; - uint64_t reserved3; /**< Reserved for future use. */ - struct { - uint64_t config; /**< RW Timer Config and Capability Reg. */ - uint64_t comp; /**< RW Timer Comparator Value Register. */ - uint64_t fsb; /**< RW FSB Interrupt Route Register. */ - uint64_t reserved4; /**< Reserved for future use. */ - } timers[HPET_TIMER_NUM]; /**< Set of HPET timers. */ -}; - -/* Mmap'd hpet registers */ -static volatile struct eal_hpet_regs *eal_hpet = NULL; - -/* Period at which the HPET counter increments in - * femtoseconds (10^-15 seconds). */ -static uint32_t eal_hpet_resolution_fs = 0; - -/* Frequency of the HPET counter in Hz */ -static uint64_t eal_hpet_resolution_hz = 0; - -/* Incremented 4 times during one 32bits hpet full count */ -static uint32_t eal_hpet_msb; - -static pthread_t msb_inc_thread_id; - -/* - * This function runs on a specific thread to update a global variable - * containing used to process MSB of the HPET (unfortunately, we need - * this because hpet is 32 bits by default under linux). 
- */ -static void * -hpet_msb_inc(__attribute__((unused)) void *arg) -{ - uint32_t t; - - while (1) { - t = (eal_hpet->counter_l >> 30); - if (t != (eal_hpet_msb & 3)) - eal_hpet_msb ++; - sleep(10); - } - return NULL; -} - -uint64_t -rte_get_hpet_hz(void) -{ - if(internal_config.no_hpet) - rte_panic("Error, HPET called, but no HPET present\n"); - - return eal_hpet_resolution_hz; -} - -uint64_t -rte_get_hpet_cycles(void) -{ - uint32_t t, msb; - uint64_t ret; - - if(internal_config.no_hpet) - rte_panic("Error, HPET called, but no HPET present\n"); - - t = eal_hpet->counter_l; - msb = eal_hpet_msb; - ret = (msb + 2 - (t >> 30)) / 4; - ret <<= 32; - ret += t; - return ret; -} - -#endif - -#ifdef RTE_LIBEAL_USE_HPET -/* - * Open and mmap /dev/hpet (high precision event timer) that will - * provide our time reference. - */ -int -rte_eal_hpet_init(int make_default) -{ - int fd, ret; - - if (internal_config.no_hpet) { - RTE_LOG(NOTICE, EAL, "HPET is disabled\n"); - return -1; - } - - fd = open(DEV_HPET, O_RDONLY); - if (fd < 0) { - RTE_LOG(ERR, EAL, "ERROR: Cannot open "DEV_HPET": %s!\n", - strerror(errno)); - internal_config.no_hpet = 1; - return -1; - } - eal_hpet = mmap(NULL, 1024, PROT_READ, MAP_SHARED, fd, 0); - if (eal_hpet == MAP_FAILED) { - RTE_LOG(ERR, EAL, "ERROR: Cannot mmap "DEV_HPET"!\n" - "Please enable CONFIG_HPET_MMAP in your kernel configuration " - "to allow HPET support.\n" - "To run without using HPET, set CONFIG_RTE_LIBEAL_USE_HPET=n " - "in your build configuration or use '--no-hpet' EAL flag.\n"); - close(fd); - internal_config.no_hpet = 1; - return -1; - } - close(fd); - - eal_hpet_resolution_fs = (uint32_t)((eal_hpet->capabilities & - CLK_PERIOD_MASK) >> - CLK_PERIOD_SHIFT); - - eal_hpet_resolution_hz = (1000ULL*1000ULL*1000ULL*1000ULL*1000ULL) / - (uint64_t)eal_hpet_resolution_fs; - - RTE_LOG(INFO, EAL, "HPET frequency is ~%"PRIu64" kHz\n", - eal_hpet_resolution_hz/1000); - - eal_hpet_msb = (eal_hpet->counter_l >> 30); - - /* create a thread that 
will increment a global variable for - * msb (hpet is 32 bits by default under linux) */ - ret = rte_ctrl_thread_create(&msb_inc_thread_id, "hpet-msb-inc", NULL, - hpet_msb_inc, NULL); - if (ret != 0) { - RTE_LOG(ERR, EAL, "ERROR: Cannot create HPET timer thread!\n"); - internal_config.no_hpet = 1; - return -1; - } - - if (make_default) - eal_timer_source = EAL_TIMER_HPET; - return 0; -} -#endif - -static void -check_tsc_flags(void) -{ - char line[512]; - FILE *stream; - - stream = fopen("/proc/cpuinfo", "r"); - if (!stream) { - RTE_LOG(WARNING, EAL, "WARNING: Unable to open /proc/cpuinfo\n"); - return; - } - - while (fgets(line, sizeof line, stream)) { - char *constant_tsc; - char *nonstop_tsc; - - if (strncmp(line, "flags", 5) != 0) - continue; - - constant_tsc = strstr(line, "constant_tsc"); - nonstop_tsc = strstr(line, "nonstop_tsc"); - if (!constant_tsc || !nonstop_tsc) - RTE_LOG(WARNING, EAL, - "WARNING: cpu flags " - "constant_tsc=%s " - "nonstop_tsc=%s " - "-> using unreliable clock cycles !\n", - constant_tsc ? "yes":"no", - nonstop_tsc ? 
"yes":"no"); - break; - } - - fclose(stream); -} - -uint64_t -get_tsc_freq(void) -{ -#ifdef CLOCK_MONOTONIC_RAW -#define NS_PER_SEC 1E9 - - struct timespec sleeptime = {.tv_nsec = NS_PER_SEC / 10 }; /* 1/10 second */ - - struct timespec t_start, t_end; - uint64_t tsc_hz; - - if (clock_gettime(CLOCK_MONOTONIC_RAW, &t_start) == 0) { - uint64_t ns, end, start = rte_rdtsc(); - nanosleep(&sleeptime,NULL); - clock_gettime(CLOCK_MONOTONIC_RAW, &t_end); - end = rte_rdtsc(); - ns = ((t_end.tv_sec - t_start.tv_sec) * NS_PER_SEC); - ns += (t_end.tv_nsec - t_start.tv_nsec); - - double secs = (double)ns/NS_PER_SEC; - tsc_hz = (uint64_t)((end - start)/secs); - return tsc_hz; - } -#endif - return 0; -} - -int -rte_eal_timer_init(void) -{ - - eal_timer_source = EAL_TIMER_TSC; - - set_tsc_freq(); - check_tsc_flags(); - return 0; -} diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c deleted file mode 100644 index c821e83826..0000000000 --- a/lib/librte_eal/linuxapp/eal/eal_vfio.c +++ /dev/null @@ -1,2049 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2018 Intel Corporation - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "eal_filesystem.h" -#include "eal_vfio.h" -#include "eal_private.h" - -#ifdef VFIO_PRESENT - -#define VFIO_MEM_EVENT_CLB_NAME "vfio_mem_event_clb" - -/* hot plug/unplug of VFIO groups may cause all DMA maps to be dropped. we can - * recreate the mappings for DPDK segments, but we cannot do so for memory that - * was registered by the user themselves, so we need to store the user mappings - * somewhere, to recreate them later. 
- */ -#define VFIO_MAX_USER_MEM_MAPS 256 -struct user_mem_map { - uint64_t addr; - uint64_t iova; - uint64_t len; -}; - -struct user_mem_maps { - rte_spinlock_recursive_t lock; - int n_maps; - struct user_mem_map maps[VFIO_MAX_USER_MEM_MAPS]; -}; - -struct vfio_config { - int vfio_enabled; - int vfio_container_fd; - int vfio_active_groups; - const struct vfio_iommu_type *vfio_iommu_type; - struct vfio_group vfio_groups[VFIO_MAX_GROUPS]; - struct user_mem_maps mem_maps; -}; - -/* per-process VFIO config */ -static struct vfio_config vfio_cfgs[VFIO_MAX_CONTAINERS]; -static struct vfio_config *default_vfio_cfg = &vfio_cfgs[0]; - -static int vfio_type1_dma_map(int); -static int vfio_type1_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int); -static int vfio_spapr_dma_map(int); -static int vfio_spapr_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int); -static int vfio_noiommu_dma_map(int); -static int vfio_noiommu_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int); -static int vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr, - uint64_t iova, uint64_t len, int do_map); - -/* IOMMU types we support */ -static const struct vfio_iommu_type iommu_types[] = { - /* x86 IOMMU, otherwise known as type 1 */ - { - .type_id = RTE_VFIO_TYPE1, - .name = "Type 1", - .dma_map_func = &vfio_type1_dma_map, - .dma_user_map_func = &vfio_type1_dma_mem_map - }, - /* ppc64 IOMMU, otherwise known as spapr */ - { - .type_id = RTE_VFIO_SPAPR, - .name = "sPAPR", - .dma_map_func = &vfio_spapr_dma_map, - .dma_user_map_func = &vfio_spapr_dma_mem_map - }, - /* IOMMU-less mode */ - { - .type_id = RTE_VFIO_NOIOMMU, - .name = "No-IOMMU", - .dma_map_func = &vfio_noiommu_dma_map, - .dma_user_map_func = &vfio_noiommu_dma_mem_map - }, -}; - -static int -is_null_map(const struct user_mem_map *map) -{ - return map->addr == 0 && map->iova == 0 && map->len == 0; -} - -/* we may need to merge user mem maps together in case of user mapping/unmapping - * chunks of memory, so we'll need a comparator 
function to sort segments. - */ -static int -user_mem_map_cmp(const void *a, const void *b) -{ - const struct user_mem_map *umm_a = a; - const struct user_mem_map *umm_b = b; - - /* move null entries to end */ - if (is_null_map(umm_a)) - return 1; - if (is_null_map(umm_b)) - return -1; - - /* sort by iova first */ - if (umm_a->iova < umm_b->iova) - return -1; - if (umm_a->iova > umm_b->iova) - return 1; - - if (umm_a->addr < umm_b->addr) - return -1; - if (umm_a->addr > umm_b->addr) - return 1; - - if (umm_a->len < umm_b->len) - return -1; - if (umm_a->len > umm_b->len) - return 1; - - return 0; -} - -/* adjust user map entry. this may result in shortening of existing map, or in - * splitting existing map in two pieces. - */ -static void -adjust_map(struct user_mem_map *src, struct user_mem_map *end, - uint64_t remove_va_start, uint64_t remove_len) -{ - /* if va start is same as start address, we're simply moving start */ - if (remove_va_start == src->addr) { - src->addr += remove_len; - src->iova += remove_len; - src->len -= remove_len; - } else if (remove_va_start + remove_len == src->addr + src->len) { - /* we're shrinking mapping from the end */ - src->len -= remove_len; - } else { - /* we're blowing a hole in the middle */ - struct user_mem_map tmp; - uint64_t total_len = src->len; - - /* adjust source segment length */ - src->len = remove_va_start - src->addr; - - /* create temporary segment in the middle */ - tmp.addr = src->addr + src->len; - tmp.iova = src->iova + src->len; - tmp.len = remove_len; - - /* populate end segment - this one we will be keeping */ - end->addr = tmp.addr + tmp.len; - end->iova = tmp.iova + tmp.len; - end->len = total_len - src->len - tmp.len; - } -} - -/* try merging two maps into one, return 1 if succeeded */ -static int -merge_map(struct user_mem_map *left, struct user_mem_map *right) -{ - if (left->addr + left->len != right->addr) - return 0; - if (left->iova + left->len != right->iova) - return 0; - - left->len += right->len; 
- - memset(right, 0, sizeof(*right)); - - return 1; -} - -static struct user_mem_map * -find_user_mem_map(struct user_mem_maps *user_mem_maps, uint64_t addr, - uint64_t iova, uint64_t len) -{ - uint64_t va_end = addr + len; - uint64_t iova_end = iova + len; - int i; - - for (i = 0; i < user_mem_maps->n_maps; i++) { - struct user_mem_map *map = &user_mem_maps->maps[i]; - uint64_t map_va_end = map->addr + map->len; - uint64_t map_iova_end = map->iova + map->len; - - /* check start VA */ - if (addr < map->addr || addr >= map_va_end) - continue; - /* check if VA end is within boundaries */ - if (va_end <= map->addr || va_end > map_va_end) - continue; - - /* check start IOVA */ - if (iova < map->iova || iova >= map_iova_end) - continue; - /* check if IOVA end is within boundaries */ - if (iova_end <= map->iova || iova_end > map_iova_end) - continue; - - /* we've found our map */ - return map; - } - return NULL; -} - -/* this will sort all user maps, and merge/compact any adjacent maps */ -static void -compact_user_maps(struct user_mem_maps *user_mem_maps) -{ - int i, n_merged, cur_idx; - - qsort(user_mem_maps->maps, user_mem_maps->n_maps, - sizeof(user_mem_maps->maps[0]), user_mem_map_cmp); - - /* we'll go over the list backwards when merging */ - n_merged = 0; - for (i = user_mem_maps->n_maps - 2; i >= 0; i--) { - struct user_mem_map *l, *r; - - l = &user_mem_maps->maps[i]; - r = &user_mem_maps->maps[i + 1]; - - if (is_null_map(l) || is_null_map(r)) - continue; - - if (merge_map(l, r)) - n_merged++; - } - - /* the entries are still sorted, but now they have holes in them, so - * walk through the list and remove the holes - */ - if (n_merged > 0) { - cur_idx = 0; - for (i = 0; i < user_mem_maps->n_maps; i++) { - if (!is_null_map(&user_mem_maps->maps[i])) { - struct user_mem_map *src, *dst; - - src = &user_mem_maps->maps[i]; - dst = &user_mem_maps->maps[cur_idx++]; - - if (src != dst) { - memcpy(dst, src, sizeof(*src)); - memset(src, 0, sizeof(*src)); - } - } - } - 
user_mem_maps->n_maps = cur_idx; - } -} - -static int -vfio_open_group_fd(int iommu_group_num) -{ - int vfio_group_fd; - char filename[PATH_MAX]; - struct rte_mp_msg mp_req, *mp_rep; - struct rte_mp_reply mp_reply; - struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; - struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; - - /* if primary, try to open the group */ - if (internal_config.process_type == RTE_PROC_PRIMARY) { - /* try regular group format */ - snprintf(filename, sizeof(filename), - VFIO_GROUP_FMT, iommu_group_num); - vfio_group_fd = open(filename, O_RDWR); - if (vfio_group_fd < 0) { - /* if file not found, it's not an error */ - if (errno != ENOENT) { - RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename, - strerror(errno)); - return -1; - } - - /* special case: try no-IOMMU path as well */ - snprintf(filename, sizeof(filename), - VFIO_NOIOMMU_GROUP_FMT, - iommu_group_num); - vfio_group_fd = open(filename, O_RDWR); - if (vfio_group_fd < 0) { - if (errno != ENOENT) { - RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename, - strerror(errno)); - return -1; - } - return 0; - } - /* noiommu group found */ - } - - return vfio_group_fd; - } - /* if we're in a secondary process, request group fd from the primary - * process via mp channel. 
- */ - p->req = SOCKET_REQ_GROUP; - p->group_num = iommu_group_num; - strcpy(mp_req.name, EAL_VFIO_MP); - mp_req.len_param = sizeof(*p); - mp_req.num_fds = 0; - - vfio_group_fd = -1; - if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && - mp_reply.nb_received == 1) { - mp_rep = &mp_reply.msgs[0]; - p = (struct vfio_mp_param *)mp_rep->param; - if (p->result == SOCKET_OK && mp_rep->num_fds == 1) { - vfio_group_fd = mp_rep->fds[0]; - } else if (p->result == SOCKET_NO_FD) { - RTE_LOG(ERR, EAL, " bad VFIO group fd\n"); - vfio_group_fd = 0; - } - free(mp_reply.msgs); - } - - if (vfio_group_fd < 0) - RTE_LOG(ERR, EAL, " cannot request group fd\n"); - return vfio_group_fd; -} - -static struct vfio_config * -get_vfio_cfg_by_group_num(int iommu_group_num) -{ - struct vfio_config *vfio_cfg; - int i, j; - - for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { - vfio_cfg = &vfio_cfgs[i]; - for (j = 0; j < VFIO_MAX_GROUPS; j++) { - if (vfio_cfg->vfio_groups[j].group_num == - iommu_group_num) - return vfio_cfg; - } - } - - return NULL; -} - -static int -vfio_get_group_fd(struct vfio_config *vfio_cfg, - int iommu_group_num) -{ - int i; - int vfio_group_fd; - struct vfio_group *cur_grp; - - /* check if we already have the group descriptor open */ - for (i = 0; i < VFIO_MAX_GROUPS; i++) - if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num) - return vfio_cfg->vfio_groups[i].fd; - - /* Lets see first if there is room for a new group */ - if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) { - RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n"); - return -1; - } - - /* Now lets get an index for the new group */ - for (i = 0; i < VFIO_MAX_GROUPS; i++) - if (vfio_cfg->vfio_groups[i].group_num == -1) { - cur_grp = &vfio_cfg->vfio_groups[i]; - break; - } - - /* This should not happen */ - if (i == VFIO_MAX_GROUPS) { - RTE_LOG(ERR, EAL, "No VFIO group free slot found\n"); - return -1; - } - - vfio_group_fd = vfio_open_group_fd(iommu_group_num); - if (vfio_group_fd < 0) { - 
RTE_LOG(ERR, EAL, "Failed to open group %d\n", iommu_group_num); - return -1; - } - - cur_grp->group_num = iommu_group_num; - cur_grp->fd = vfio_group_fd; - vfio_cfg->vfio_active_groups++; - - return vfio_group_fd; -} - -static struct vfio_config * -get_vfio_cfg_by_group_fd(int vfio_group_fd) -{ - struct vfio_config *vfio_cfg; - int i, j; - - for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { - vfio_cfg = &vfio_cfgs[i]; - for (j = 0; j < VFIO_MAX_GROUPS; j++) - if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd) - return vfio_cfg; - } - - return NULL; -} - -static struct vfio_config * -get_vfio_cfg_by_container_fd(int container_fd) -{ - int i; - - for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { - if (vfio_cfgs[i].vfio_container_fd == container_fd) - return &vfio_cfgs[i]; - } - - return NULL; -} - -int -rte_vfio_get_group_fd(int iommu_group_num) -{ - struct vfio_config *vfio_cfg; - - /* get the vfio_config it belongs to */ - vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num); - vfio_cfg = vfio_cfg ? 
vfio_cfg : default_vfio_cfg; - - return vfio_get_group_fd(vfio_cfg, iommu_group_num); -} - -static int -get_vfio_group_idx(int vfio_group_fd) -{ - struct vfio_config *vfio_cfg; - int i, j; - - for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { - vfio_cfg = &vfio_cfgs[i]; - for (j = 0; j < VFIO_MAX_GROUPS; j++) - if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd) - return j; - } - - return -1; -} - -static void -vfio_group_device_get(int vfio_group_fd) -{ - struct vfio_config *vfio_cfg; - int i; - - vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd); - if (vfio_cfg == NULL) { - RTE_LOG(ERR, EAL, " invalid group fd!\n"); - return; - } - - i = get_vfio_group_idx(vfio_group_fd); - if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) - RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i); - else - vfio_cfg->vfio_groups[i].devices++; -} - -static void -vfio_group_device_put(int vfio_group_fd) -{ - struct vfio_config *vfio_cfg; - int i; - - vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd); - if (vfio_cfg == NULL) { - RTE_LOG(ERR, EAL, " invalid group fd!\n"); - return; - } - - i = get_vfio_group_idx(vfio_group_fd); - if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) - RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i); - else - vfio_cfg->vfio_groups[i].devices--; -} - -static int -vfio_group_device_count(int vfio_group_fd) -{ - struct vfio_config *vfio_cfg; - int i; - - vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd); - if (vfio_cfg == NULL) { - RTE_LOG(ERR, EAL, " invalid group fd!\n"); - return -1; - } - - i = get_vfio_group_idx(vfio_group_fd); - if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) { - RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i); - return -1; - } - - return vfio_cfg->vfio_groups[i].devices; -} - -static void -vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len, - void *arg __rte_unused) -{ - struct rte_memseg_list *msl; - struct rte_memseg *ms; - size_t cur_len = 0; - - msl = rte_mem_virt2memseg_list(addr); - - /* for IOVA as VA mode, no need to 
care for IOVA addresses */ - if (rte_eal_iova_mode() == RTE_IOVA_VA && msl->external == 0) { - uint64_t vfio_va = (uint64_t)(uintptr_t)addr; - if (type == RTE_MEM_EVENT_ALLOC) - vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va, - len, 1); - else - vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va, - len, 0); - return; - } - - /* memsegs are contiguous in memory */ - ms = rte_mem_virt2memseg(addr, msl); - while (cur_len < len) { - /* some memory segments may have invalid IOVA */ - if (ms->iova == RTE_BAD_IOVA) { - RTE_LOG(DEBUG, EAL, "Memory segment at %p has bad IOVA, skipping\n", - ms->addr); - goto next; - } - if (type == RTE_MEM_EVENT_ALLOC) - vfio_dma_mem_map(default_vfio_cfg, ms->addr_64, - ms->iova, ms->len, 1); - else - vfio_dma_mem_map(default_vfio_cfg, ms->addr_64, - ms->iova, ms->len, 0); -next: - cur_len += ms->len; - ++ms; - } -} - -static int -vfio_sync_default_container(void) -{ - struct rte_mp_msg mp_req, *mp_rep; - struct rte_mp_reply mp_reply; - struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; - struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; - int iommu_type_id; - unsigned int i; - - /* cannot be called from primary */ - if (rte_eal_process_type() != RTE_PROC_SECONDARY) - return -1; - - /* default container fd should have been opened in rte_vfio_enable() */ - if (!default_vfio_cfg->vfio_enabled || - default_vfio_cfg->vfio_container_fd < 0) { - RTE_LOG(ERR, EAL, "VFIO support is not initialized\n"); - return -1; - } - - /* find default container's IOMMU type */ - p->req = SOCKET_REQ_IOMMU_TYPE; - strcpy(mp_req.name, EAL_VFIO_MP); - mp_req.len_param = sizeof(*p); - mp_req.num_fds = 0; - - iommu_type_id = -1; - if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && - mp_reply.nb_received == 1) { - mp_rep = &mp_reply.msgs[0]; - p = (struct vfio_mp_param *)mp_rep->param; - if (p->result == SOCKET_OK) - iommu_type_id = p->iommu_type_id; - free(mp_reply.msgs); - } - if (iommu_type_id < 0) { - RTE_LOG(ERR, EAL, "Could not get IOMMU 
type for default container\n"); - return -1; - } - - /* we now have an fd for default container, as well as its IOMMU type. - * now, set up default VFIO container config to match. - */ - for (i = 0; i < RTE_DIM(iommu_types); i++) { - const struct vfio_iommu_type *t = &iommu_types[i]; - if (t->type_id != iommu_type_id) - continue; - - /* we found our IOMMU type */ - default_vfio_cfg->vfio_iommu_type = t; - - return 0; - } - RTE_LOG(ERR, EAL, "Could not find IOMMU type id (%i)\n", - iommu_type_id); - return -1; -} - -int -rte_vfio_clear_group(int vfio_group_fd) -{ - int i; - struct vfio_config *vfio_cfg; - - vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd); - if (vfio_cfg == NULL) { - RTE_LOG(ERR, EAL, " invalid group fd!\n"); - return -1; - } - - i = get_vfio_group_idx(vfio_group_fd); - if (i < 0) - return -1; - vfio_cfg->vfio_groups[i].group_num = -1; - vfio_cfg->vfio_groups[i].fd = -1; - vfio_cfg->vfio_groups[i].devices = 0; - vfio_cfg->vfio_active_groups--; - - return 0; -} - -int -rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr, - int *vfio_dev_fd, struct vfio_device_info *device_info) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - rte_rwlock_t *mem_lock = &mcfg->memory_hotplug_lock; - struct vfio_group_status group_status = { - .argsz = sizeof(group_status) - }; - struct vfio_config *vfio_cfg; - struct user_mem_maps *user_mem_maps; - int vfio_container_fd; - int vfio_group_fd; - int iommu_group_num; - int i, ret; - - /* get group number */ - ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num); - if (ret == 0) { - RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n", - dev_addr); - return 1; - } - - /* if negative, something failed */ - if (ret < 0) - return -1; - - /* get the actual group fd */ - vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num); - if (vfio_group_fd < 0) - return -1; - - /* if group_fd == 0, that means the device isn't managed by VFIO */ - if (vfio_group_fd 
== 0) { - RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n", - dev_addr); - return 1; - } - - /* - * at this point, we know that this group is viable (meaning, all devices - * are either bound to VFIO or not bound to anything) - */ - - /* check if the group is viable */ - ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status); - if (ret) { - RTE_LOG(ERR, EAL, " %s cannot get group status, " - "error %i (%s)\n", dev_addr, errno, strerror(errno)); - close(vfio_group_fd); - rte_vfio_clear_group(vfio_group_fd); - return -1; - } else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) { - RTE_LOG(ERR, EAL, " %s VFIO group is not viable!\n", dev_addr); - close(vfio_group_fd); - rte_vfio_clear_group(vfio_group_fd); - return -1; - } - - /* get the vfio_config it belongs to */ - vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num); - vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg; - vfio_container_fd = vfio_cfg->vfio_container_fd; - user_mem_maps = &vfio_cfg->mem_maps; - - /* check if group does not have a container yet */ - if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) { - - /* add group to a container */ - ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER, - &vfio_container_fd); - if (ret) { - RTE_LOG(ERR, EAL, " %s cannot add VFIO group to container, " - "error %i (%s)\n", dev_addr, errno, strerror(errno)); - close(vfio_group_fd); - rte_vfio_clear_group(vfio_group_fd); - return -1; - } - - /* - * pick an IOMMU type and set up DMA mappings for container - * - * needs to be done only once, only when first group is - * assigned to a container and only in primary process. - * Note this can happen several times with the hotplug - * functionality. 
- */ - if (internal_config.process_type == RTE_PROC_PRIMARY && - vfio_cfg->vfio_active_groups == 1 && - vfio_group_device_count(vfio_group_fd) == 0) { - const struct vfio_iommu_type *t; - - /* select an IOMMU type which we will be using */ - t = vfio_set_iommu_type(vfio_container_fd); - if (!t) { - RTE_LOG(ERR, EAL, - " %s failed to select IOMMU type\n", - dev_addr); - close(vfio_group_fd); - rte_vfio_clear_group(vfio_group_fd); - return -1; - } - /* lock memory hotplug before mapping and release it - * after registering callback, to prevent races - */ - rte_rwlock_read_lock(mem_lock); - if (vfio_cfg == default_vfio_cfg) - ret = t->dma_map_func(vfio_container_fd); - else - ret = 0; - if (ret) { - RTE_LOG(ERR, EAL, - " %s DMA remapping failed, error %i (%s)\n", - dev_addr, errno, strerror(errno)); - close(vfio_group_fd); - rte_vfio_clear_group(vfio_group_fd); - rte_rwlock_read_unlock(mem_lock); - return -1; - } - - vfio_cfg->vfio_iommu_type = t; - - /* re-map all user-mapped segments */ - rte_spinlock_recursive_lock(&user_mem_maps->lock); - - /* this IOMMU type may not support DMA mapping, but - * if we have mappings in the list - that means we have - * previously mapped something successfully, so we can - * be sure that DMA mapping is supported. 
- */ - for (i = 0; i < user_mem_maps->n_maps; i++) { - struct user_mem_map *map; - map = &user_mem_maps->maps[i]; - - ret = t->dma_user_map_func( - vfio_container_fd, - map->addr, map->iova, map->len, - 1); - if (ret) { - RTE_LOG(ERR, EAL, "Couldn't map user memory for DMA: " - "va: 0x%" PRIx64 " " - "iova: 0x%" PRIx64 " " - "len: 0x%" PRIu64 "\n", - map->addr, map->iova, - map->len); - rte_spinlock_recursive_unlock( - &user_mem_maps->lock); - rte_rwlock_read_unlock(mem_lock); - return -1; - } - } - rte_spinlock_recursive_unlock(&user_mem_maps->lock); - - /* register callback for mem events */ - if (vfio_cfg == default_vfio_cfg) - ret = rte_mem_event_callback_register( - VFIO_MEM_EVENT_CLB_NAME, - vfio_mem_event_callback, NULL); - else - ret = 0; - /* unlock memory hotplug */ - rte_rwlock_read_unlock(mem_lock); - - if (ret && rte_errno != ENOTSUP) { - RTE_LOG(ERR, EAL, "Could not install memory event callback for VFIO\n"); - return -1; - } - if (ret) - RTE_LOG(DEBUG, EAL, "Memory event callbacks not supported\n"); - else - RTE_LOG(DEBUG, EAL, "Installed memory event callback for VFIO\n"); - } - } else if (rte_eal_process_type() != RTE_PROC_PRIMARY && - vfio_cfg == default_vfio_cfg && - vfio_cfg->vfio_iommu_type == NULL) { - /* if we're not a primary process, we do not set up the VFIO - * container because it's already been set up by the primary - * process. instead, we simply ask the primary about VFIO type - * we are using, and set the VFIO config up appropriately. 
- */ - ret = vfio_sync_default_container(); - if (ret < 0) { - RTE_LOG(ERR, EAL, "Could not sync default VFIO container\n"); - close(vfio_group_fd); - rte_vfio_clear_group(vfio_group_fd); - return -1; - } - /* we have successfully initialized VFIO, notify user */ - const struct vfio_iommu_type *t = - default_vfio_cfg->vfio_iommu_type; - RTE_LOG(NOTICE, EAL, " using IOMMU type %d (%s)\n", - t->type_id, t->name); - } - - /* get a file descriptor for the device */ - *vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr); - if (*vfio_dev_fd < 0) { - /* if we cannot get a device fd, this implies a problem with - * the VFIO group or the container not having IOMMU configured. - */ - - RTE_LOG(WARNING, EAL, "Getting a vfio_dev_fd for %s failed\n", - dev_addr); - close(vfio_group_fd); - rte_vfio_clear_group(vfio_group_fd); - return -1; - } - - /* test and setup the device */ - ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info); - if (ret) { - RTE_LOG(ERR, EAL, " %s cannot get device info, " - "error %i (%s)\n", dev_addr, errno, - strerror(errno)); - close(*vfio_dev_fd); - close(vfio_group_fd); - rte_vfio_clear_group(vfio_group_fd); - return -1; - } - vfio_group_device_get(vfio_group_fd); - - return 0; -} - -int -rte_vfio_release_device(const char *sysfs_base, const char *dev_addr, - int vfio_dev_fd) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - rte_rwlock_t *mem_lock = &mcfg->memory_hotplug_lock; - struct vfio_group_status group_status = { - .argsz = sizeof(group_status) - }; - struct vfio_config *vfio_cfg; - int vfio_group_fd; - int iommu_group_num; - int ret; - - /* we don't want any DMA mapping messages to come while we're detaching - * VFIO device, because this might be the last device and we might need - * to unregister the callback. 
- */ - rte_rwlock_read_lock(mem_lock); - - /* get group number */ - ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num); - if (ret <= 0) { - RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver\n", - dev_addr); - /* This is an error at this point. */ - ret = -1; - goto out; - } - - /* get the actual group fd */ - vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num); - if (vfio_group_fd <= 0) { - RTE_LOG(INFO, EAL, "rte_vfio_get_group_fd failed for %s\n", - dev_addr); - ret = -1; - goto out; - } - - /* get the vfio_config it belongs to */ - vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num); - vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg; - - /* At this point we got an active group. Closing it will make the - * container detachment. If this is the last active group, VFIO kernel - * code will unset the container and the IOMMU mappings. - */ - - /* Closing a device */ - if (close(vfio_dev_fd) < 0) { - RTE_LOG(INFO, EAL, "Error when closing vfio_dev_fd for %s\n", - dev_addr); - ret = -1; - goto out; - } - - /* An VFIO group can have several devices attached. Just when there is - * no devices remaining should the group be closed. - */ - vfio_group_device_put(vfio_group_fd); - if (!vfio_group_device_count(vfio_group_fd)) { - - if (close(vfio_group_fd) < 0) { - RTE_LOG(INFO, EAL, "Error when closing vfio_group_fd for %s\n", - dev_addr); - ret = -1; - goto out; - } - - if (rte_vfio_clear_group(vfio_group_fd) < 0) { - RTE_LOG(INFO, EAL, "Error when clearing group for %s\n", - dev_addr); - ret = -1; - goto out; - } - } - - /* if there are no active device groups, unregister the callback to - * avoid spurious attempts to map/unmap memory from VFIO. 
- */ - if (vfio_cfg == default_vfio_cfg && vfio_cfg->vfio_active_groups == 0 && - rte_eal_process_type() != RTE_PROC_SECONDARY) - rte_mem_event_callback_unregister(VFIO_MEM_EVENT_CLB_NAME, - NULL); - - /* success */ - ret = 0; - -out: - rte_rwlock_read_unlock(mem_lock); - return ret; -} - -int -rte_vfio_enable(const char *modname) -{ - /* initialize group list */ - int i, j; - int vfio_available; - - rte_spinlock_recursive_t lock = RTE_SPINLOCK_RECURSIVE_INITIALIZER; - - for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { - vfio_cfgs[i].vfio_container_fd = -1; - vfio_cfgs[i].vfio_active_groups = 0; - vfio_cfgs[i].vfio_iommu_type = NULL; - vfio_cfgs[i].mem_maps.lock = lock; - - for (j = 0; j < VFIO_MAX_GROUPS; j++) { - vfio_cfgs[i].vfio_groups[j].fd = -1; - vfio_cfgs[i].vfio_groups[j].group_num = -1; - vfio_cfgs[i].vfio_groups[j].devices = 0; - } - } - - /* inform the user that we are probing for VFIO */ - RTE_LOG(INFO, EAL, "Probing VFIO support...\n"); - - /* check if vfio module is loaded */ - vfio_available = rte_eal_check_module(modname); - - /* return error directly */ - if (vfio_available == -1) { - RTE_LOG(INFO, EAL, "Could not get loaded module details!\n"); - return -1; - } - - /* return 0 if VFIO modules not loaded */ - if (vfio_available == 0) { - RTE_LOG(DEBUG, EAL, "VFIO modules not loaded, " - "skipping VFIO support...\n"); - return 0; - } - - if (internal_config.process_type == RTE_PROC_PRIMARY) { - /* open a new container */ - default_vfio_cfg->vfio_container_fd = - rte_vfio_get_container_fd(); - } else { - /* get the default container from the primary process */ - default_vfio_cfg->vfio_container_fd = - vfio_get_default_container_fd(); - } - - /* check if we have VFIO driver enabled */ - if (default_vfio_cfg->vfio_container_fd != -1) { - RTE_LOG(NOTICE, EAL, "VFIO support initialized\n"); - default_vfio_cfg->vfio_enabled = 1; - } else { - RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n"); - } - - return 0; -} - -int -rte_vfio_is_enabled(const 
char *modname) -{ - const int mod_available = rte_eal_check_module(modname) > 0; - return default_vfio_cfg->vfio_enabled && mod_available; -} - -int -vfio_get_default_container_fd(void) -{ - struct rte_mp_msg mp_req, *mp_rep; - struct rte_mp_reply mp_reply; - struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; - struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; - - if (default_vfio_cfg->vfio_enabled) - return default_vfio_cfg->vfio_container_fd; - - if (internal_config.process_type == RTE_PROC_PRIMARY) { - /* if we were secondary process we would try requesting - * container fd from the primary, but we're the primary - * process so just exit here - */ - return -1; - } - - p->req = SOCKET_REQ_DEFAULT_CONTAINER; - strcpy(mp_req.name, EAL_VFIO_MP); - mp_req.len_param = sizeof(*p); - mp_req.num_fds = 0; - - if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && - mp_reply.nb_received == 1) { - mp_rep = &mp_reply.msgs[0]; - p = (struct vfio_mp_param *)mp_rep->param; - if (p->result == SOCKET_OK && mp_rep->num_fds == 1) { - free(mp_reply.msgs); - return mp_rep->fds[0]; - } - free(mp_reply.msgs); - } - - RTE_LOG(ERR, EAL, " cannot request default container fd\n"); - return -1; -} - -int -vfio_get_iommu_type(void) -{ - if (default_vfio_cfg->vfio_iommu_type == NULL) - return -1; - - return default_vfio_cfg->vfio_iommu_type->type_id; -} - -const struct vfio_iommu_type * -vfio_set_iommu_type(int vfio_container_fd) -{ - unsigned idx; - for (idx = 0; idx < RTE_DIM(iommu_types); idx++) { - const struct vfio_iommu_type *t = &iommu_types[idx]; - - int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU, - t->type_id); - if (!ret) { - RTE_LOG(NOTICE, EAL, " using IOMMU type %d (%s)\n", - t->type_id, t->name); - return t; - } - /* not an error, there may be more supported IOMMU types */ - RTE_LOG(DEBUG, EAL, " set IOMMU type %d (%s) failed, " - "error %i (%s)\n", t->type_id, t->name, errno, - strerror(errno)); - } - /* if we didn't find a suitable IOMMU type, fail */ - 
return NULL; -} - -int -vfio_has_supported_extensions(int vfio_container_fd) -{ - int ret; - unsigned idx, n_extensions = 0; - for (idx = 0; idx < RTE_DIM(iommu_types); idx++) { - const struct vfio_iommu_type *t = &iommu_types[idx]; - - ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION, - t->type_id); - if (ret < 0) { - RTE_LOG(ERR, EAL, " could not get IOMMU type, " - "error %i (%s)\n", errno, - strerror(errno)); - close(vfio_container_fd); - return -1; - } else if (ret == 1) { - /* we found a supported extension */ - n_extensions++; - } - RTE_LOG(DEBUG, EAL, " IOMMU type %d (%s) is %s\n", - t->type_id, t->name, - ret ? "supported" : "not supported"); - } - - /* if we didn't find any supported IOMMU types, fail */ - if (!n_extensions) { - close(vfio_container_fd); - return -1; - } - - return 0; -} - -int -rte_vfio_get_container_fd(void) -{ - int ret, vfio_container_fd; - struct rte_mp_msg mp_req, *mp_rep; - struct rte_mp_reply mp_reply; - struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; - struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; - - - /* if we're in a primary process, try to open the container */ - if (internal_config.process_type == RTE_PROC_PRIMARY) { - vfio_container_fd = open(VFIO_CONTAINER_PATH, O_RDWR); - if (vfio_container_fd < 0) { - RTE_LOG(ERR, EAL, " cannot open VFIO container, " - "error %i (%s)\n", errno, strerror(errno)); - return -1; - } - - /* check VFIO API version */ - ret = ioctl(vfio_container_fd, VFIO_GET_API_VERSION); - if (ret != VFIO_API_VERSION) { - if (ret < 0) - RTE_LOG(ERR, EAL, " could not get VFIO API version, " - "error %i (%s)\n", errno, strerror(errno)); - else - RTE_LOG(ERR, EAL, " unsupported VFIO API version!\n"); - close(vfio_container_fd); - return -1; - } - - ret = vfio_has_supported_extensions(vfio_container_fd); - if (ret) { - RTE_LOG(ERR, EAL, " no supported IOMMU " - "extensions found!\n"); - return -1; - } - - return vfio_container_fd; - } - /* - * if we're in a secondary process, request 
container fd from the - * primary process via mp channel - */ - p->req = SOCKET_REQ_CONTAINER; - strcpy(mp_req.name, EAL_VFIO_MP); - mp_req.len_param = sizeof(*p); - mp_req.num_fds = 0; - - vfio_container_fd = -1; - if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && - mp_reply.nb_received == 1) { - mp_rep = &mp_reply.msgs[0]; - p = (struct vfio_mp_param *)mp_rep->param; - if (p->result == SOCKET_OK && mp_rep->num_fds == 1) { - vfio_container_fd = mp_rep->fds[0]; - free(mp_reply.msgs); - return vfio_container_fd; - } - free(mp_reply.msgs); - } - - RTE_LOG(ERR, EAL, " cannot request container fd\n"); - return -1; -} - -int -rte_vfio_get_group_num(const char *sysfs_base, - const char *dev_addr, int *iommu_group_num) -{ - char linkname[PATH_MAX]; - char filename[PATH_MAX]; - char *tok[16], *group_tok, *end; - int ret; - - memset(linkname, 0, sizeof(linkname)); - memset(filename, 0, sizeof(filename)); - - /* try to find out IOMMU group for this device */ - snprintf(linkname, sizeof(linkname), - "%s/%s/iommu_group", sysfs_base, dev_addr); - - ret = readlink(linkname, filename, sizeof(filename)); - - /* if the link doesn't exist, no VFIO for us */ - if (ret < 0) - return 0; - - ret = rte_strsplit(filename, sizeof(filename), - tok, RTE_DIM(tok), '/'); - - if (ret <= 0) { - RTE_LOG(ERR, EAL, " %s cannot get IOMMU group\n", dev_addr); - return -1; - } - - /* IOMMU group is always the last token */ - errno = 0; - group_tok = tok[ret - 1]; - end = group_tok; - *iommu_group_num = strtol(group_tok, &end, 10); - if ((end != group_tok && *end != '\0') || errno != 0) { - RTE_LOG(ERR, EAL, " %s error parsing IOMMU number!\n", dev_addr); - return -1; - } - - return 1; -} - -static int -type1_map(const struct rte_memseg_list *msl, const struct rte_memseg *ms, - void *arg) -{ - int *vfio_container_fd = arg; - - if (msl->external) - return 0; - - return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova, - ms->len, 1); -} - -static int -vfio_type1_dma_mem_map(int 
vfio_container_fd, uint64_t vaddr, uint64_t iova, - uint64_t len, int do_map) -{ - struct vfio_iommu_type1_dma_map dma_map; - struct vfio_iommu_type1_dma_unmap dma_unmap; - int ret; - - if (do_map != 0) { - memset(&dma_map, 0, sizeof(dma_map)); - dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); - dma_map.vaddr = vaddr; - dma_map.size = len; - dma_map.iova = iova; - dma_map.flags = VFIO_DMA_MAP_FLAG_READ | - VFIO_DMA_MAP_FLAG_WRITE; - - ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); - if (ret) { - RTE_LOG(ERR, EAL, " cannot set up DMA remapping, error %i (%s)\n", - errno, strerror(errno)); - return -1; - } - } else { - memset(&dma_unmap, 0, sizeof(dma_unmap)); - dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap); - dma_unmap.size = len; - dma_unmap.iova = iova; - - ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA, - &dma_unmap); - if (ret) { - RTE_LOG(ERR, EAL, " cannot clear DMA remapping, error %i (%s)\n", - errno, strerror(errno)); - return -1; - } - } - - return 0; -} - -static int -vfio_type1_dma_map(int vfio_container_fd) -{ - return rte_memseg_walk(type1_map, &vfio_container_fd); -} - -static int -vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, - uint64_t len, int do_map) -{ - struct vfio_iommu_type1_dma_map dma_map; - struct vfio_iommu_type1_dma_unmap dma_unmap; - int ret; - struct vfio_iommu_spapr_register_memory reg = { - .argsz = sizeof(reg), - .flags = 0 - }; - reg.vaddr = (uintptr_t) vaddr; - reg.size = len; - - if (do_map != 0) { - ret = ioctl(vfio_container_fd, - VFIO_IOMMU_SPAPR_REGISTER_MEMORY, ®); - if (ret) { - RTE_LOG(ERR, EAL, " cannot register vaddr for IOMMU, " - "error %i (%s)\n", errno, strerror(errno)); - return -1; - } - - memset(&dma_map, 0, sizeof(dma_map)); - dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); - dma_map.vaddr = vaddr; - dma_map.size = len; - dma_map.iova = iova; - dma_map.flags = VFIO_DMA_MAP_FLAG_READ | - VFIO_DMA_MAP_FLAG_WRITE; - - ret = 
ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); - if (ret) { - RTE_LOG(ERR, EAL, " cannot set up DMA remapping, error %i (%s)\n", - errno, strerror(errno)); - return -1; - } - - } else { - ret = ioctl(vfio_container_fd, - VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, ®); - if (ret) { - RTE_LOG(ERR, EAL, " cannot unregister vaddr for IOMMU, error %i (%s)\n", - errno, strerror(errno)); - return -1; - } - - memset(&dma_unmap, 0, sizeof(dma_unmap)); - dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap); - dma_unmap.size = len; - dma_unmap.iova = iova; - - ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA, - &dma_unmap); - if (ret) { - RTE_LOG(ERR, EAL, " cannot clear DMA remapping, error %i (%s)\n", - errno, strerror(errno)); - return -1; - } - } - - return 0; -} - -static int -vfio_spapr_map_walk(const struct rte_memseg_list *msl, - const struct rte_memseg *ms, void *arg) -{ - int *vfio_container_fd = arg; - - if (msl->external) - return 0; - - return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova, - ms->len, 1); -} - -struct spapr_walk_param { - uint64_t window_size; - uint64_t hugepage_sz; -}; -static int -vfio_spapr_window_size_walk(const struct rte_memseg_list *msl, - const struct rte_memseg *ms, void *arg) -{ - struct spapr_walk_param *param = arg; - uint64_t max = ms->iova + ms->len; - - if (msl->external) - return 0; - - if (max > param->window_size) { - param->hugepage_sz = ms->hugepage_sz; - param->window_size = max; - } - - return 0; -} - -static int -vfio_spapr_create_new_dma_window(int vfio_container_fd, - struct vfio_iommu_spapr_tce_create *create) { - struct vfio_iommu_spapr_tce_remove remove = { - .argsz = sizeof(remove), - }; - struct vfio_iommu_spapr_tce_info info = { - .argsz = sizeof(info), - }; - int ret; - - /* query spapr iommu info */ - ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info); - if (ret) { - RTE_LOG(ERR, EAL, " cannot get iommu info, " - "error %i (%s)\n", errno, strerror(errno)); - return 
-1; - } - - /* remove default DMA of 32 bit window */ - remove.start_addr = info.dma32_window_start; - ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); - if (ret) { - RTE_LOG(ERR, EAL, " cannot remove default DMA window, " - "error %i (%s)\n", errno, strerror(errno)); - return -1; - } - - /* create new DMA window */ - ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, create); - if (ret) { - RTE_LOG(ERR, EAL, " cannot create new DMA window, " - "error %i (%s)\n", errno, strerror(errno)); - return -1; - } - - if (create->start_addr != 0) { - RTE_LOG(ERR, EAL, " DMA window start address != 0\n"); - return -1; - } - - return 0; -} - -static int -vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, - uint64_t len, int do_map) -{ - struct spapr_walk_param param; - struct vfio_iommu_spapr_tce_create create = { - .argsz = sizeof(create), - }; - struct vfio_config *vfio_cfg; - struct user_mem_maps *user_mem_maps; - int i, ret = 0; - - vfio_cfg = get_vfio_cfg_by_container_fd(vfio_container_fd); - if (vfio_cfg == NULL) { - RTE_LOG(ERR, EAL, " invalid container fd!\n"); - return -1; - } - - user_mem_maps = &vfio_cfg->mem_maps; - rte_spinlock_recursive_lock(&user_mem_maps->lock); - - /* check if window size needs to be adjusted */ - memset(¶m, 0, sizeof(param)); - - /* we're inside a callback so use thread-unsafe version */ - if (rte_memseg_walk_thread_unsafe(vfio_spapr_window_size_walk, - ¶m) < 0) { - RTE_LOG(ERR, EAL, "Could not get window size\n"); - ret = -1; - goto out; - } - - /* also check user maps */ - for (i = 0; i < user_mem_maps->n_maps; i++) { - uint64_t max = user_mem_maps->maps[i].iova + - user_mem_maps->maps[i].len; - create.window_size = RTE_MAX(create.window_size, max); - } - - /* sPAPR requires window size to be a power of 2 */ - create.window_size = rte_align64pow2(param.window_size); - create.page_shift = __builtin_ctzll(param.hugepage_sz); - create.levels = 1; - - if (do_map) { - void *addr; - /* 
re-create window and remap the entire memory */ - if (iova > create.window_size) { - if (vfio_spapr_create_new_dma_window(vfio_container_fd, - &create) < 0) { - RTE_LOG(ERR, EAL, "Could not create new DMA window\n"); - ret = -1; - goto out; - } - /* we're inside a callback, so use thread-unsafe version - */ - if (rte_memseg_walk_thread_unsafe(vfio_spapr_map_walk, - &vfio_container_fd) < 0) { - RTE_LOG(ERR, EAL, "Could not recreate DMA maps\n"); - ret = -1; - goto out; - } - /* remap all user maps */ - for (i = 0; i < user_mem_maps->n_maps; i++) { - struct user_mem_map *map = - &user_mem_maps->maps[i]; - if (vfio_spapr_dma_do_map(vfio_container_fd, - map->addr, map->iova, map->len, - 1)) { - RTE_LOG(ERR, EAL, "Could not recreate user DMA maps\n"); - ret = -1; - goto out; - } - } - } - - /* now that we've remapped all of the memory that was present - * before, map the segment that we were requested to map. - * - * however, if we were called by the callback, the memory we - * were called with was already in the memseg list, so previous - * mapping should've mapped that segment already. - * - * virt2memseg_list is a relatively cheap check, so use that. if - * memory is within any memseg list, it's a memseg, so it's - * already mapped. 
- */ - addr = (void *)(uintptr_t)vaddr; - if (rte_mem_virt2memseg_list(addr) == NULL && - vfio_spapr_dma_do_map(vfio_container_fd, - vaddr, iova, len, 1) < 0) { - RTE_LOG(ERR, EAL, "Could not map segment\n"); - ret = -1; - goto out; - } - } else { - /* for unmap, check if iova within DMA window */ - if (iova > create.window_size) { - RTE_LOG(ERR, EAL, "iova beyond DMA window for unmap"); - ret = -1; - goto out; - } - - vfio_spapr_dma_do_map(vfio_container_fd, vaddr, iova, len, 0); - } -out: - rte_spinlock_recursive_unlock(&user_mem_maps->lock); - return ret; -} - -static int -vfio_spapr_dma_map(int vfio_container_fd) -{ - struct vfio_iommu_spapr_tce_create create = { - .argsz = sizeof(create), - }; - struct spapr_walk_param param; - - memset(¶m, 0, sizeof(param)); - - /* create DMA window from 0 to max(phys_addr + len) */ - rte_memseg_walk(vfio_spapr_window_size_walk, ¶m); - - /* sPAPR requires window size to be a power of 2 */ - create.window_size = rte_align64pow2(param.window_size); - create.page_shift = __builtin_ctzll(param.hugepage_sz); - create.levels = 1; - - if (vfio_spapr_create_new_dma_window(vfio_container_fd, &create) < 0) { - RTE_LOG(ERR, EAL, "Could not create new DMA window\n"); - return -1; - } - - /* map all DPDK segments for DMA. 
use 1:1 PA to IOVA mapping */ - if (rte_memseg_walk(vfio_spapr_map_walk, &vfio_container_fd) < 0) - return -1; - - return 0; -} - -static int -vfio_noiommu_dma_map(int __rte_unused vfio_container_fd) -{ - /* No-IOMMU mode does not need DMA mapping */ - return 0; -} - -static int -vfio_noiommu_dma_mem_map(int __rte_unused vfio_container_fd, - uint64_t __rte_unused vaddr, - uint64_t __rte_unused iova, uint64_t __rte_unused len, - int __rte_unused do_map) -{ - /* No-IOMMU mode does not need DMA mapping */ - return 0; -} - -static int -vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova, - uint64_t len, int do_map) -{ - const struct vfio_iommu_type *t = vfio_cfg->vfio_iommu_type; - - if (!t) { - RTE_LOG(ERR, EAL, " VFIO support not initialized\n"); - rte_errno = ENODEV; - return -1; - } - - if (!t->dma_user_map_func) { - RTE_LOG(ERR, EAL, - " VFIO custom DMA region maping not supported by IOMMU %s\n", - t->name); - rte_errno = ENOTSUP; - return -1; - } - - return t->dma_user_map_func(vfio_cfg->vfio_container_fd, vaddr, iova, - len, do_map); -} - -static int -container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova, - uint64_t len) -{ - struct user_mem_map *new_map; - struct user_mem_maps *user_mem_maps; - int ret = 0; - - user_mem_maps = &vfio_cfg->mem_maps; - rte_spinlock_recursive_lock(&user_mem_maps->lock); - if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) { - RTE_LOG(ERR, EAL, "No more space for user mem maps\n"); - rte_errno = ENOMEM; - ret = -1; - goto out; - } - /* map the entry */ - if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 1)) { - /* technically, this will fail if there are currently no devices - * plugged in, even if a device were added later, this mapping - * might have succeeded. 
however, since we cannot verify if this - * is a valid mapping without having a device attached, consider - * this to be unsupported, because we can't just store any old - * mapping and pollute list of active mappings willy-nilly. - */ - RTE_LOG(ERR, EAL, "Couldn't map new region for DMA\n"); - ret = -1; - goto out; - } - /* create new user mem map entry */ - new_map = &user_mem_maps->maps[user_mem_maps->n_maps++]; - new_map->addr = vaddr; - new_map->iova = iova; - new_map->len = len; - - compact_user_maps(user_mem_maps); -out: - rte_spinlock_recursive_unlock(&user_mem_maps->lock); - return ret; -} - -static int -container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova, - uint64_t len) -{ - struct user_mem_map *map, *new_map = NULL; - struct user_mem_maps *user_mem_maps; - int ret = 0; - - user_mem_maps = &vfio_cfg->mem_maps; - rte_spinlock_recursive_lock(&user_mem_maps->lock); - - /* find our mapping */ - map = find_user_mem_map(user_mem_maps, vaddr, iova, len); - if (!map) { - RTE_LOG(ERR, EAL, "Couldn't find previously mapped region\n"); - rte_errno = EINVAL; - ret = -1; - goto out; - } - if (map->addr != vaddr || map->iova != iova || map->len != len) { - /* we're partially unmapping a previously mapped region, so we - * need to split entry into two. - */ - if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) { - RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n"); - rte_errno = ENOMEM; - ret = -1; - goto out; - } - new_map = &user_mem_maps->maps[user_mem_maps->n_maps++]; - } - - /* unmap the entry */ - if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 0)) { - /* there may not be any devices plugged in, so unmapping will - * fail with ENODEV/ENOTSUP rte_errno values, but that doesn't - * stop us from removing the mapping, as the assumption is we - * won't be needing this memory any more and thus will want to - * prevent it from being remapped again on hotplug. so, only - * fail if we indeed failed to unmap (e.g. 
if the mapping was - * within our mapped range but had invalid alignment). - */ - if (rte_errno != ENODEV && rte_errno != ENOTSUP) { - RTE_LOG(ERR, EAL, "Couldn't unmap region for DMA\n"); - ret = -1; - goto out; - } else { - RTE_LOG(DEBUG, EAL, "DMA unmapping failed, but removing mappings anyway\n"); - } - } - /* remove map from the list of active mappings */ - if (new_map != NULL) { - adjust_map(map, new_map, vaddr, len); - - /* if we've created a new map by splitting, sort everything */ - if (!is_null_map(new_map)) { - compact_user_maps(user_mem_maps); - } else { - /* we've created a new mapping, but it was unused */ - user_mem_maps->n_maps--; - } - } else { - memset(map, 0, sizeof(*map)); - compact_user_maps(user_mem_maps); - user_mem_maps->n_maps--; - } - -out: - rte_spinlock_recursive_unlock(&user_mem_maps->lock); - return ret; -} - -int -rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len) -{ - if (len == 0) { - rte_errno = EINVAL; - return -1; - } - - return container_dma_map(default_vfio_cfg, vaddr, iova, len); -} - -int -rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len) -{ - if (len == 0) { - rte_errno = EINVAL; - return -1; - } - - return container_dma_unmap(default_vfio_cfg, vaddr, iova, len); -} - -int -rte_vfio_noiommu_is_enabled(void) -{ - int fd; - ssize_t cnt; - char c; - - fd = open(VFIO_NOIOMMU_MODE, O_RDONLY); - if (fd < 0) { - if (errno != ENOENT) { - RTE_LOG(ERR, EAL, " cannot open vfio noiommu file %i (%s)\n", - errno, strerror(errno)); - return -1; - } - /* - * else the file does not exists - * i.e. 
noiommu is not enabled - */ - return 0; - } - - cnt = read(fd, &c, 1); - close(fd); - if (cnt != 1) { - RTE_LOG(ERR, EAL, " unable to read from vfio noiommu " - "file %i (%s)\n", errno, strerror(errno)); - return -1; - } - - return c == 'Y'; -} - -int -rte_vfio_container_create(void) -{ - int i; - - /* Find an empty slot to store new vfio config */ - for (i = 1; i < VFIO_MAX_CONTAINERS; i++) { - if (vfio_cfgs[i].vfio_container_fd == -1) - break; - } - - if (i == VFIO_MAX_CONTAINERS) { - RTE_LOG(ERR, EAL, "exceed max vfio container limit\n"); - return -1; - } - - vfio_cfgs[i].vfio_container_fd = rte_vfio_get_container_fd(); - if (vfio_cfgs[i].vfio_container_fd < 0) { - RTE_LOG(NOTICE, EAL, "fail to create a new container\n"); - return -1; - } - - return vfio_cfgs[i].vfio_container_fd; -} - -int __rte_experimental -rte_vfio_container_destroy(int container_fd) -{ - struct vfio_config *vfio_cfg; - int i; - - vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); - if (vfio_cfg == NULL) { - RTE_LOG(ERR, EAL, "Invalid container fd\n"); - return -1; - } - - for (i = 0; i < VFIO_MAX_GROUPS; i++) - if (vfio_cfg->vfio_groups[i].group_num != -1) - rte_vfio_container_group_unbind(container_fd, - vfio_cfg->vfio_groups[i].group_num); - - close(container_fd); - vfio_cfg->vfio_container_fd = -1; - vfio_cfg->vfio_active_groups = 0; - vfio_cfg->vfio_iommu_type = NULL; - - return 0; -} - -int -rte_vfio_container_group_bind(int container_fd, int iommu_group_num) -{ - struct vfio_config *vfio_cfg; - - vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); - if (vfio_cfg == NULL) { - RTE_LOG(ERR, EAL, "Invalid container fd\n"); - return -1; - } - - return vfio_get_group_fd(vfio_cfg, iommu_group_num); -} - -int -rte_vfio_container_group_unbind(int container_fd, int iommu_group_num) -{ - struct vfio_config *vfio_cfg; - struct vfio_group *cur_grp = NULL; - int i; - - vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); - if (vfio_cfg == NULL) { - RTE_LOG(ERR, EAL, "Invalid container 
fd\n"); - return -1; - } - - for (i = 0; i < VFIO_MAX_GROUPS; i++) { - if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num) { - cur_grp = &vfio_cfg->vfio_groups[i]; - break; - } - } - - /* This should not happen */ - if (i == VFIO_MAX_GROUPS || cur_grp == NULL) { - RTE_LOG(ERR, EAL, "Specified group number not found\n"); - return -1; - } - - if (cur_grp->fd >= 0 && close(cur_grp->fd) < 0) { - RTE_LOG(ERR, EAL, "Error when closing vfio_group_fd for" - " iommu_group_num %d\n", iommu_group_num); - return -1; - } - cur_grp->group_num = -1; - cur_grp->fd = -1; - cur_grp->devices = 0; - vfio_cfg->vfio_active_groups--; - - return 0; -} - -int -rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova, - uint64_t len) -{ - struct vfio_config *vfio_cfg; - - if (len == 0) { - rte_errno = EINVAL; - return -1; - } - - vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); - if (vfio_cfg == NULL) { - RTE_LOG(ERR, EAL, "Invalid container fd\n"); - return -1; - } - - return container_dma_map(vfio_cfg, vaddr, iova, len); -} - -int -rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova, - uint64_t len) -{ - struct vfio_config *vfio_cfg; - - if (len == 0) { - rte_errno = EINVAL; - return -1; - } - - vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); - if (vfio_cfg == NULL) { - RTE_LOG(ERR, EAL, "Invalid container fd\n"); - return -1; - } - - return container_dma_unmap(vfio_cfg, vaddr, iova, len); -} - -#else - -int -rte_vfio_dma_map(uint64_t __rte_unused vaddr, __rte_unused uint64_t iova, - __rte_unused uint64_t len) -{ - return -1; -} - -int -rte_vfio_dma_unmap(uint64_t __rte_unused vaddr, uint64_t __rte_unused iova, - __rte_unused uint64_t len) -{ - return -1; -} - -int -rte_vfio_setup_device(__rte_unused const char *sysfs_base, - __rte_unused const char *dev_addr, - __rte_unused int *vfio_dev_fd, - __rte_unused struct vfio_device_info *device_info) -{ - return -1; -} - -int -rte_vfio_release_device(__rte_unused const char 
*sysfs_base, - __rte_unused const char *dev_addr, __rte_unused int fd) -{ - return -1; -} - -int -rte_vfio_enable(__rte_unused const char *modname) -{ - return -1; -} - -int -rte_vfio_is_enabled(__rte_unused const char *modname) -{ - return -1; -} - -int -rte_vfio_noiommu_is_enabled(void) -{ - return -1; -} - -int -rte_vfio_clear_group(__rte_unused int vfio_group_fd) -{ - return -1; -} - -int -rte_vfio_get_group_num(__rte_unused const char *sysfs_base, - __rte_unused const char *dev_addr, - __rte_unused int *iommu_group_num) -{ - return -1; -} - -int -rte_vfio_get_container_fd(void) -{ - return -1; -} - -int -rte_vfio_get_group_fd(__rte_unused int iommu_group_num) -{ - return -1; -} - -int -rte_vfio_container_create(void) -{ - return -1; -} - -int -rte_vfio_container_destroy(__rte_unused int container_fd) -{ - return -1; -} - -int -rte_vfio_container_group_bind(__rte_unused int container_fd, - __rte_unused int iommu_group_num) -{ - return -1; -} - -int -rte_vfio_container_group_unbind(__rte_unused int container_fd, - __rte_unused int iommu_group_num) -{ - return -1; -} - -int -rte_vfio_container_dma_map(__rte_unused int container_fd, - __rte_unused uint64_t vaddr, - __rte_unused uint64_t iova, - __rte_unused uint64_t len) -{ - return -1; -} - -int -rte_vfio_container_dma_unmap(__rte_unused int container_fd, - __rte_unused uint64_t vaddr, - __rte_unused uint64_t iova, - __rte_unused uint64_t len) -{ - return -1; -} - -#endif /* VFIO_PRESENT */ diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h deleted file mode 100644 index cb2d35fb12..0000000000 --- a/lib/librte_eal/linuxapp/eal/eal_vfio.h +++ /dev/null @@ -1,158 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2014 Intel Corporation - */ - -#ifndef EAL_VFIO_H_ -#define EAL_VFIO_H_ - -#include - -/* - * determine if VFIO is present on the system - */ -#if !defined(VFIO_PRESENT) && defined(RTE_EAL_VFIO) -#include -#if LINUX_VERSION_CODE >= 
KERNEL_VERSION(3, 6, 0) -#define VFIO_PRESENT -#else -#pragma message("VFIO configured but not supported by this kernel, disabling.") -#endif /* kernel version >= 3.6.0 */ -#endif /* RTE_EAL_VFIO */ - -#ifdef VFIO_PRESENT - -#include -#include - -#define RTE_VFIO_TYPE1 VFIO_TYPE1_IOMMU - -#ifndef VFIO_SPAPR_TCE_v2_IOMMU -#define RTE_VFIO_SPAPR 7 -#define VFIO_IOMMU_SPAPR_REGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 17) -#define VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 18) -#define VFIO_IOMMU_SPAPR_TCE_CREATE _IO(VFIO_TYPE, VFIO_BASE + 19) -#define VFIO_IOMMU_SPAPR_TCE_REMOVE _IO(VFIO_TYPE, VFIO_BASE + 20) - -struct vfio_iommu_spapr_register_memory { - uint32_t argsz; - uint32_t flags; - uint64_t vaddr; - uint64_t size; -}; - -struct vfio_iommu_spapr_tce_create { - uint32_t argsz; - uint32_t flags; - /* in */ - uint32_t page_shift; - uint32_t __resv1; - uint64_t window_size; - uint32_t levels; - uint32_t __resv2; - /* out */ - uint64_t start_addr; -}; - -struct vfio_iommu_spapr_tce_remove { - uint32_t argsz; - uint32_t flags; - /* in */ - uint64_t start_addr; -}; - -struct vfio_iommu_spapr_tce_ddw_info { - uint64_t pgsizes; - uint32_t max_dynamic_windows_supported; - uint32_t levels; -}; - -/* SPAPR_v2 is not present, but SPAPR might be */ -#ifndef VFIO_SPAPR_TCE_IOMMU -#define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) - -struct vfio_iommu_spapr_tce_info { - uint32_t argsz; - uint32_t flags; - uint32_t dma32_window_start; - uint32_t dma32_window_size; - struct vfio_iommu_spapr_tce_ddw_info ddw; -}; -#endif /* VFIO_SPAPR_TCE_IOMMU */ - -#else /* VFIO_SPAPR_TCE_v2_IOMMU */ -#define RTE_VFIO_SPAPR VFIO_SPAPR_TCE_v2_IOMMU -#endif - -#define VFIO_MAX_GROUPS RTE_MAX_VFIO_GROUPS -#define VFIO_MAX_CONTAINERS RTE_MAX_VFIO_CONTAINERS - -/* - * we don't need to store device fd's anywhere since they can be obtained from - * the group fd via an ioctl() call. 
- */ -struct vfio_group { - int group_num; - int fd; - int devices; -}; - -/* DMA mapping function prototype. - * Takes VFIO container fd as a parameter. - * Returns 0 on success, -1 on error. - * */ -typedef int (*vfio_dma_func_t)(int); - -/* Custom memory region DMA mapping function prototype. - * Takes VFIO container fd, virtual address, phisical address, length and - * operation type (0 to unmap 1 for map) as a parameters. - * Returns 0 on success, -1 on error. - **/ -typedef int (*vfio_dma_user_func_t)(int fd, uint64_t vaddr, uint64_t iova, - uint64_t len, int do_map); - -struct vfio_iommu_type { - int type_id; - const char *name; - vfio_dma_user_func_t dma_user_map_func; - vfio_dma_func_t dma_map_func; -}; - -/* get the vfio container that devices are bound to by default */ -int vfio_get_default_container_fd(void); - -/* pick IOMMU type. returns a pointer to vfio_iommu_type or NULL for error */ -const struct vfio_iommu_type * -vfio_set_iommu_type(int vfio_container_fd); - -int -vfio_get_iommu_type(void); - -/* check if we have any supported extensions */ -int -vfio_has_supported_extensions(int vfio_container_fd); - -int vfio_mp_sync_setup(void); - -#define EAL_VFIO_MP "eal_vfio_mp_sync" - -#define SOCKET_REQ_CONTAINER 0x100 -#define SOCKET_REQ_GROUP 0x200 -#define SOCKET_REQ_DEFAULT_CONTAINER 0x400 -#define SOCKET_REQ_IOMMU_TYPE 0x800 -#define SOCKET_OK 0x0 -#define SOCKET_NO_FD 0x1 -#define SOCKET_ERR 0xFF - -struct vfio_mp_param { - int req; - int result; - RTE_STD_C11 - union { - int group_num; - int iommu_type_id; - }; -}; - -#endif /* VFIO_PRESENT */ - -#endif /* EAL_VFIO_H_ */ diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c deleted file mode 100644 index 2a47f29d5a..0000000000 --- a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c +++ /dev/null @@ -1,119 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2018 Intel Corporation - */ - -#include -#include - -#include -#include 
-#include -#include - -#include "eal_vfio.h" - -/** - * @file - * VFIO socket for communication between primary and secondary processes. - * - * This file is only compiled if CONFIG_RTE_EAL_VFIO is set to "y". - */ - -#ifdef VFIO_PRESENT - -static int -vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer) -{ - int fd = -1; - int ret; - struct rte_mp_msg reply; - struct vfio_mp_param *r = (struct vfio_mp_param *)reply.param; - const struct vfio_mp_param *m = - (const struct vfio_mp_param *)msg->param; - - if (msg->len_param != sizeof(*m)) { - RTE_LOG(ERR, EAL, "vfio received invalid message!\n"); - return -1; - } - - memset(&reply, 0, sizeof(reply)); - - switch (m->req) { - case SOCKET_REQ_GROUP: - r->req = SOCKET_REQ_GROUP; - r->group_num = m->group_num; - fd = rte_vfio_get_group_fd(m->group_num); - if (fd < 0) - r->result = SOCKET_ERR; - else if (fd == 0) - /* if VFIO group exists but isn't bound to VFIO driver */ - r->result = SOCKET_NO_FD; - else { - /* if group exists and is bound to VFIO driver */ - r->result = SOCKET_OK; - reply.num_fds = 1; - reply.fds[0] = fd; - } - break; - case SOCKET_REQ_CONTAINER: - r->req = SOCKET_REQ_CONTAINER; - fd = rte_vfio_get_container_fd(); - if (fd < 0) - r->result = SOCKET_ERR; - else { - r->result = SOCKET_OK; - reply.num_fds = 1; - reply.fds[0] = fd; - } - break; - case SOCKET_REQ_DEFAULT_CONTAINER: - r->req = SOCKET_REQ_DEFAULT_CONTAINER; - fd = vfio_get_default_container_fd(); - if (fd < 0) - r->result = SOCKET_ERR; - else { - r->result = SOCKET_OK; - reply.num_fds = 1; - reply.fds[0] = fd; - } - break; - case SOCKET_REQ_IOMMU_TYPE: - { - int iommu_type_id; - - r->req = SOCKET_REQ_IOMMU_TYPE; - - iommu_type_id = vfio_get_iommu_type(); - - if (iommu_type_id < 0) - r->result = SOCKET_ERR; - else { - r->iommu_type_id = iommu_type_id; - r->result = SOCKET_OK; - } - break; - } - default: - RTE_LOG(ERR, EAL, "vfio received invalid message!\n"); - return -1; - } - - strcpy(reply.name, EAL_VFIO_MP); - reply.len_param = 
sizeof(*r); - - ret = rte_mp_reply(&reply, peer); - if (m->req == SOCKET_REQ_CONTAINER && fd >= 0) - close(fd); - return ret; -} - -int -vfio_mp_sync_setup(void) -{ - if (rte_eal_process_type() == RTE_PROC_PRIMARY) - return rte_mp_action_register(EAL_VFIO_MP, vfio_mp_primary); - - return 0; -} - -#endif diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h deleted file mode 100644 index 5afa087131..0000000000 --- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h +++ /dev/null @@ -1,139 +0,0 @@ -/* SPDX-License-Identifier: (BSD-3-Clause OR LGPL-2.1) */ -/* - * Copyright(c) 2007-2014 Intel Corporation. - */ - -#ifndef _RTE_KNI_COMMON_H_ -#define _RTE_KNI_COMMON_H_ - -#ifdef __KERNEL__ -#include -#include -#define RTE_STD_C11 -#else -#include -#include -#endif - -/** - * KNI name is part of memzone name. - */ -#define RTE_KNI_NAMESIZE 32 - -#define RTE_CACHE_LINE_MIN_SIZE 64 - -/* - * Request id. - */ -enum rte_kni_req_id { - RTE_KNI_REQ_UNKNOWN = 0, - RTE_KNI_REQ_CHANGE_MTU, - RTE_KNI_REQ_CFG_NETWORK_IF, - RTE_KNI_REQ_CHANGE_MAC_ADDR, - RTE_KNI_REQ_CHANGE_PROMISC, - RTE_KNI_REQ_MAX, -}; - -/* - * Structure for KNI request. - */ -struct rte_kni_request { - uint32_t req_id; /**< Request id */ - RTE_STD_C11 - union { - uint32_t new_mtu; /**< New MTU */ - uint8_t if_up; /**< 1: interface up, 0: interface down */ - uint8_t mac_addr[6]; /**< MAC address for interface */ - uint8_t promiscusity;/**< 1: promisc mode enable, 0: disable */ - }; - int32_t result; /**< Result for processing request */ -} __attribute__((__packed__)); - -/* - * Fifo struct mapped in a shared memory. It describes a circular buffer FIFO - * Write and read should wrap around. 
Fifo is empty when write == read - * Writing should never overwrite the read position - */ -struct rte_kni_fifo { -#ifdef RTE_USE_C11_MEM_MODEL - unsigned write; /**< Next position to be written*/ - unsigned read; /**< Next position to be read */ -#else - volatile unsigned write; /**< Next position to be written*/ - volatile unsigned read; /**< Next position to be read */ -#endif - unsigned len; /**< Circular buffer length */ - unsigned elem_size; /**< Pointer size - for 32/64 bit OS */ - void *volatile buffer[]; /**< The buffer contains mbuf pointers */ -}; - -/* - * The kernel image of the rte_mbuf struct, with only the relevant fields. - * Padding is necessary to assure the offsets of these fields - */ -struct rte_kni_mbuf { - void *buf_addr __attribute__((__aligned__(RTE_CACHE_LINE_SIZE))); - uint64_t buf_physaddr; - uint16_t data_off; /**< Start address of data in segment buffer. */ - char pad1[2]; - uint16_t nb_segs; /**< Number of segments. */ - char pad4[2]; - uint64_t ol_flags; /**< Offload features. */ - char pad2[4]; - uint32_t pkt_len; /**< Total pkt len: sum of all segment data_len. */ - uint16_t data_len; /**< Amount of data in segment buffer. */ - - /* fields on second cache line */ - char pad3[8] __attribute__((__aligned__(RTE_CACHE_LINE_MIN_SIZE))); - void *pool; - void *next; -}; - -/* - * Struct used to create a KNI device. Passed to the kernel in IOCTL call - */ - -struct rte_kni_device_info { - char name[RTE_KNI_NAMESIZE]; /**< Network device name for KNI */ - - phys_addr_t tx_phys; - phys_addr_t rx_phys; - phys_addr_t alloc_phys; - phys_addr_t free_phys; - - /* Used by Ethtool */ - phys_addr_t req_phys; - phys_addr_t resp_phys; - phys_addr_t sync_phys; - void * sync_va; - - /* mbuf mempool */ - void * mbuf_va; - phys_addr_t mbuf_phys; - - /* PCI info */ - uint16_t vendor_id; /**< Vendor ID or PCI_ANY_ID. */ - uint16_t device_id; /**< Device ID or PCI_ANY_ID. 
*/ - uint8_t bus; /**< Device bus */ - uint8_t devid; /**< Device ID */ - uint8_t function; /**< Device function. */ - - uint16_t group_id; /**< Group ID */ - uint32_t core_id; /**< core ID to bind for kernel thread */ - - __extension__ - uint8_t force_bind : 1; /**< Flag for kernel thread binding */ - - /* mbuf size */ - unsigned mbuf_size; - unsigned int mtu; - char mac_addr[6]; -}; - -#define KNI_DEVICE "kni" - -#define RTE_KNI_IOCTL_TEST _IOWR(0, 1, int) -#define RTE_KNI_IOCTL_CREATE _IOWR(0, 2, struct rte_kni_device_info) -#define RTE_KNI_IOCTL_RELEASE _IOWR(0, 3, struct rte_kni_device_info) - -#endif /* _RTE_KNI_COMMON_H_ */ diff --git a/lib/librte_eal/linuxapp/eal/meson.build b/lib/librte_eal/linuxapp/eal/meson.build deleted file mode 100644 index 7e68b2c0dd..0000000000 --- a/lib/librte_eal/linuxapp/eal/meson.build +++ /dev/null @@ -1,29 +0,0 @@ -# SPDX-License-Identifier: BSD-3-Clause -# Copyright(c) 2017 Intel Corporation - -eal_inc += include_directories('include') -install_subdir('include/exec-env', install_dir: get_option('includedir')) - -env_objs = [] -env_headers = [] -env_sources = files('eal_alarm.c', - 'eal_cpuflags.c', - 'eal_debug.c', - 'eal_hugepage_info.c', - 'eal_interrupts.c', - 'eal_memalloc.c', - 'eal_lcore.c', - 'eal_log.c', - 'eal_thread.c', - 'eal_timer.c', - 'eal_vfio.c', - 'eal_vfio_mp_sync.c', - 'eal.c', - 'eal_memory.c', - 'eal_dev.c', -) - -deps += ['kvargs'] -if has_libnuma == 1 - dpdk_conf.set10('RTE_EAL_NUMA_AWARE_HUGEPAGES', true) -endif diff --git a/lib/librte_eal/meson.build b/lib/librte_eal/meson.build index 4ae0efccbc..cb8d1094fc 100644 --- a/lib/librte_eal/meson.build +++ b/lib/librte_eal/meson.build @@ -11,7 +11,7 @@ subdir('common') # defines common_sources, common_objs, etc. # The /eal/meson.build file should define env_sources, etc. 
if host_machine.system() == 'linux' dpdk_conf.set('RTE_EXEC_ENV_LINUXAPP', 1) - subdir('linuxapp/eal') + subdir('linux/eal') elif host_machine.system() == 'freebsd' dpdk_conf.set('RTE_EXEC_ENV_BSDAPP', 1)