From: Thomas Monjalon Date: Fri, 27 Mar 2020 01:15:39 +0000 (+0100) Subject: eal: move OS-specific sub-directories X-Git-Url: http://git.droids-corp.org/?a=commitdiff_plain;h=a083f8cc77460c15ac99a427ab6833dc8c8ae5bc;p=dpdk.git eal: move OS-specific sub-directories Since the kernel modules are moved to kernel/ directory, there is no need anymore for the sub-directory eal/ in linux/, freebsd/ and windows/. Signed-off-by: Thomas Monjalon Acked-by: David Marchand --- diff --git a/MAINTAINERS b/MAINTAINERS index 8ce8d02a4c..4800f6884a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -203,8 +203,8 @@ F: lib/librte_eal/common/*malloc* F: lib/librte_eal/common/eal_common_fbarray.c F: lib/librte_eal/common/eal_common_mem* F: lib/librte_eal/common/eal_hugepages.h -F: lib/librte_eal/linux/eal/eal_mem* -F: lib/librte_eal/freebsd/eal/eal_mem* +F: lib/librte_eal/linux/eal_mem* +F: lib/librte_eal/freebsd/eal_mem* F: doc/guides/prog_guide/env_abstraction_layer.rst F: app/test/test_external_mem.c F: app/test/test_func_reentrancy.c @@ -289,8 +289,7 @@ M: Konstantin Ananyev F: lib/librte_eal/x86/ Linux EAL (with overlaps) -F: lib/librte_eal/linux/Makefile -F: lib/librte_eal/linux/eal/ +F: lib/librte_eal/linux/ F: doc/guides/linux_gsg/ Linux UIO @@ -300,13 +299,12 @@ F: drivers/bus/pci/linux/*uio* Linux VFIO M: Anatoly Burakov -F: lib/librte_eal/linux/eal/*vfio* +F: lib/librte_eal/linux/*vfio* F: drivers/bus/pci/linux/*vfio* FreeBSD EAL (with overlaps) M: Bruce Richardson -F: lib/librte_eal/freebsd/Makefile -F: lib/librte_eal/freebsd/eal/ +F: lib/librte_eal/freebsd/ F: doc/guides/freebsd_gsg/ FreeBSD contigmem diff --git a/kernel/linux/kni/meson.build b/kernel/linux/kni/meson.build index 706bea5b7f..d696347f22 100644 --- a/kernel/linux/kni/meson.build +++ b/kernel/linux/kni/meson.build @@ -18,7 +18,7 @@ custom_target('rte_kni', 'src=' + meson.current_source_dir(), 'MODULE_CFLAGS=-include ' + meson.source_root() + '/config/rte_config.h' + ' -I' + meson.source_root() + '/lib/librte_eal/include' + - ' -I' + meson.source_root() + '/lib/librte_eal/linux/eal/include' + + ' -I' + meson.source_root() + '/lib/librte_eal/linux/include' + ' -I' + meson.build_root() + ' -I' + meson.current_source_dir(), 'modules'], diff --git a/lib/librte_eal/Makefile b/lib/librte_eal/Makefile index ff74935932..2fda40d230 100644 --- a/lib/librte_eal/Makefile +++ b/lib/librte_eal/Makefile @@ -4,9 +4,9 @@ include $(RTE_SDK)/mk/rte.vars.mk DIRS-y += include -DIRS-$(CONFIG_RTE_EXEC_ENV_LINUX) += linux/eal +DIRS-$(CONFIG_RTE_EXEC_ENV_LINUX) += linux DEPDIRS-linux := include -DIRS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += freebsd/eal +DIRS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += freebsd DEPDIRS-freebsd := include include $(RTE_SDK)/mk/rte.subdir.mk diff --git a/lib/librte_eal/freebsd/Makefile b/lib/librte_eal/freebsd/Makefile new file mode 100644 index 0000000000..952f4f1c81 --- /dev/null +++ b/lib/librte_eal/freebsd/Makefile @@ -0,0 +1,93 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2019 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +LIB = librte_eal.a + +ARCH_DIR ?= $(RTE_ARCH) +VPATH += $(RTE_SDK)/lib/librte_eal/common +VPATH += $(RTE_SDK)/lib/librte_eal/$(ARCH_DIR) + +CFLAGS += -DALLOW_EXPERIMENTAL_API +CFLAGS += -I$(SRCDIR)/include +CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common +CFLAGS += -I$(RTE_SDK)/lib/librte_eal/include +CFLAGS += $(WERROR_FLAGS) -O3 + +LDLIBS += -lexecinfo +LDLIBS += -lpthread +LDLIBS += -lgcc_s +LDLIBS += -lrte_kvargs + +EXPORT_MAP := ../rte_eal_version.map + +# specific to freebsd exec-env 
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) := eal.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_cpuflags.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_memory.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_hugepage_info.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_thread.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_debug.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_memalloc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_lcore.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_timer.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_interrupts.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_alarm.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_dev.c + +# from common dir +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_lcore.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_timer.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_memzone.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_log.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_launch.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_mcfg.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_memalloc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_memory.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_tailqs.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_errno.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_cpuflags.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_hypervisor.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_string_fns.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_hexdump.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_devargs.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_class.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_bus.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_dev.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_options.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_thread.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_proc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_fbarray.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_uuid.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_malloc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += hotplug_mp.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += malloc_elem.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += malloc_heap.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += malloc_mp.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_keepalive.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_option.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_service.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_random.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_reciprocal.c + +# from arch dir +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_cpuflags.c +SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_hypervisor.c +SRCS-$(CONFIG_RTE_ARCH_X86) += rte_spinlock.c +SRCS-y += rte_cycles.c + +CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST) + +# workaround for a gcc bug with noreturn attribute +# http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603 +ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y) +CFLAGS_eal_thread.o += -Wno-return-type +CFLAGS_eal_hpet.o += -Wno-return-type +endif + +INC := rte_os.h + +SYMLINK-$(CONFIG_RTE_EXEC_ENV_FREEBSD)-include := $(addprefix include/,$(INC)) + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/lib/librte_eal/freebsd/eal.c b/lib/librte_eal/freebsd/eal.c new file mode 100644 index 0000000000..6ae37e7e69 --- /dev/null +++ b/lib/librte_eal/freebsd/eal.c @@ -0,0 +1,1092 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation. + * Copyright(c) 2014 6WIND S.A. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_thread.h" +#include "eal_internal_cfg.h" +#include "eal_filesystem.h" +#include "eal_hugepages.h" +#include "eal_options.h" +#include "eal_memcfg.h" + +#define MEMSIZE_IF_NO_HUGE_PAGE (64ULL * 1024ULL * 1024ULL) + +/* Allow the application to print its usage message too if set */ +static rte_usage_hook_t rte_application_usage_hook = NULL; +/* early configuration structure, when memory config is not mmapped */ +static struct rte_mem_config early_mem_config; + +/* define fd variable here, because file needs to be kept open for the + * duration of the program, as we hold a write lock on it in the primary proc */ +static int mem_cfg_fd = -1; + +static struct flock wr_lock = { + .l_type = F_WRLCK, + .l_whence = SEEK_SET, + .l_start = offsetof(struct rte_mem_config, memsegs), + .l_len = sizeof(early_mem_config.memsegs), +}; + +/* Address of global and public configuration */ +static struct rte_config rte_config = { + .mem_config = &early_mem_config, +}; + +/* internal configuration (per-core) */ +struct lcore_config lcore_config[RTE_MAX_LCORE]; + +/* internal configuration */ +struct internal_config internal_config; + +/* used by rte_rdtsc() */ +int rte_cycles_vmware_tsc_map; + +/* platform-specific runtime dir */ +static char runtime_dir[PATH_MAX]; + +static const char *default_runtime_dir = "/var/run"; + +int +eal_create_runtime_dir(void) +{ + const char *directory = default_runtime_dir; + const char *xdg_runtime_dir = getenv("XDG_RUNTIME_DIR"); + const char *fallback = "/tmp"; + char tmp[PATH_MAX]; + int ret; + + if (getuid() != 0) { + /* try XDG path first, fall back to /tmp */ + if (xdg_runtime_dir != NULL) + directory = xdg_runtime_dir; + else + directory = fallback; + } + /* create DPDK subdirectory under runtime dir */ + ret = snprintf(tmp, sizeof(tmp), "%s/dpdk", directory); + if (ret < 0 || ret == sizeof(tmp)) { + RTE_LOG(ERR, EAL, "Error creating DPDK runtime path name\n"); + return -1; + } + + /* create prefix-specific subdirectory under DPDK runtime dir */ + ret = snprintf(runtime_dir, sizeof(runtime_dir), "%s/%s", + tmp, eal_get_hugefile_prefix()); + if (ret < 0 || ret == sizeof(runtime_dir)) { + RTE_LOG(ERR, EAL, "Error creating prefix-specific runtime path name\n"); + return -1; + } + + /* create the path if it doesn't exist. no "mkdir -p" here, so do it + * step by step. + */ + ret = mkdir(tmp, 0700); + if (ret < 0 && errno != EEXIST) { + RTE_LOG(ERR, EAL, "Error creating '%s': %s\n", + tmp, strerror(errno)); + return -1; + } + + ret = mkdir(runtime_dir, 0700); + if (ret < 0 && errno != EEXIST) { + RTE_LOG(ERR, EAL, "Error creating '%s': %s\n", + runtime_dir, strerror(errno)); + return -1; + } + + return 0; +} + +int +eal_clean_runtime_dir(void) +{ + /* FreeBSD doesn't need this implemented for now, because, unlike Linux, + * FreeBSD doesn't create per-process files, so no need to clean up. 
+ */ + return 0; +} + + +const char * +rte_eal_get_runtime_dir(void) +{ + return runtime_dir; +} + +/* Return user provided mbuf pool ops name */ +const char * +rte_eal_mbuf_user_pool_ops(void) +{ + return internal_config.user_mbuf_pool_ops_name; +} + +/* Return a pointer to the configuration structure */ +struct rte_config * +rte_eal_get_configuration(void) +{ + return &rte_config; +} + +enum rte_iova_mode +rte_eal_iova_mode(void) +{ + return rte_eal_get_configuration()->iova_mode; +} + +/* parse a sysfs (or other) file containing one integer value */ +int +eal_parse_sysfs_value(const char *filename, unsigned long *val) +{ + FILE *f; + char buf[BUFSIZ]; + char *end = NULL; + + if ((f = fopen(filename, "r")) == NULL) { + RTE_LOG(ERR, EAL, "%s(): cannot open sysfs value %s\n", + __func__, filename); + return -1; + } + + if (fgets(buf, sizeof(buf), f) == NULL) { + RTE_LOG(ERR, EAL, "%s(): cannot read sysfs value %s\n", + __func__, filename); + fclose(f); + return -1; + } + *val = strtoul(buf, &end, 0); + if ((buf[0] == '\0') || (end == NULL) || (*end != '\n')) { + RTE_LOG(ERR, EAL, "%s(): cannot parse sysfs value %s\n", + __func__, filename); + fclose(f); + return -1; + } + fclose(f); + return 0; +} + + +/* create memory configuration in shared/mmap memory. Take out + * a write lock on the memsegs, so we can auto-detect primary/secondary. + * This means we never close the file while running (auto-close on exit). + * We also don't lock the whole file, so that in future we can use read-locks + * on other parts, e.g. memzones, to detect if there are running secondary + * processes. */ +static int +rte_eal_config_create(void) +{ + size_t page_sz = sysconf(_SC_PAGE_SIZE); + size_t cfg_len = sizeof(*rte_config.mem_config); + size_t cfg_len_aligned = RTE_ALIGN(cfg_len, page_sz); + void *rte_mem_cfg_addr, *mapped_mem_cfg_addr; + int retval; + + const char *pathname = eal_runtime_config_path(); + + if (internal_config.no_shconf) + return 0; + + /* map the config before base address so that we don't waste a page */ + if (internal_config.base_virtaddr != 0) + rte_mem_cfg_addr = (void *) + RTE_ALIGN_FLOOR(internal_config.base_virtaddr - + sizeof(struct rte_mem_config), page_sz); + else + rte_mem_cfg_addr = NULL; + + if (mem_cfg_fd < 0){ + mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0600); + if (mem_cfg_fd < 0) { + RTE_LOG(ERR, EAL, "Cannot open '%s' for rte_mem_config\n", + pathname); + return -1; + } + } + + retval = ftruncate(mem_cfg_fd, cfg_len); + if (retval < 0){ + close(mem_cfg_fd); + mem_cfg_fd = -1; + RTE_LOG(ERR, EAL, "Cannot resize '%s' for rte_mem_config\n", + pathname); + return -1; + } + + retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock); + if (retval < 0){ + close(mem_cfg_fd); + mem_cfg_fd = -1; + RTE_LOG(ERR, EAL, "Cannot create lock on '%s'. 
Is another primary " + "process running?\n", pathname); + return -1; + } + + /* reserve space for config */ + rte_mem_cfg_addr = eal_get_virtual_area(rte_mem_cfg_addr, + &cfg_len_aligned, page_sz, 0, 0); + if (rte_mem_cfg_addr == NULL) { + RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config\n"); + close(mem_cfg_fd); + mem_cfg_fd = -1; + return -1; + } + + /* remap the actual file into the space we've just reserved */ + mapped_mem_cfg_addr = mmap(rte_mem_cfg_addr, + cfg_len_aligned, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED, mem_cfg_fd, 0); + if (mapped_mem_cfg_addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "Cannot remap memory for rte_config\n"); + munmap(rte_mem_cfg_addr, cfg_len); + close(mem_cfg_fd); + mem_cfg_fd = -1; + return -1; + } + + memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config)); + rte_config.mem_config = rte_mem_cfg_addr; + + /* store address of the config in the config itself so that secondary + * processes could later map the config into this exact location + */ + rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr; + + return 0; +} + +/* attach to an existing shared memory config */ +static int +rte_eal_config_attach(void) +{ + void *rte_mem_cfg_addr; + const char *pathname = eal_runtime_config_path(); + + if (internal_config.no_shconf) + return 0; + + if (mem_cfg_fd < 0){ + mem_cfg_fd = open(pathname, O_RDWR); + if (mem_cfg_fd < 0) { + RTE_LOG(ERR, EAL, "Cannot open '%s' for rte_mem_config\n", + pathname); + return -1; + } + } + + rte_mem_cfg_addr = mmap(NULL, sizeof(*rte_config.mem_config), + PROT_READ, MAP_SHARED, mem_cfg_fd, 0); + /* don't close the fd here, it will be closed on reattach */ + if (rte_mem_cfg_addr == MAP_FAILED) { + close(mem_cfg_fd); + mem_cfg_fd = -1; + RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config! error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + + rte_config.mem_config = rte_mem_cfg_addr; + + return 0; +} + +/* reattach the shared config at exact memory location primary process has it */ +static int +rte_eal_config_reattach(void) +{ + struct rte_mem_config *mem_config; + void *rte_mem_cfg_addr; + + if (internal_config.no_shconf) + return 0; + + /* save the address primary process has mapped shared config to */ + rte_mem_cfg_addr = + (void *)(uintptr_t)rte_config.mem_config->mem_cfg_addr; + + /* unmap original config */ + munmap(rte_config.mem_config, sizeof(struct rte_mem_config)); + + /* remap the config at proper address */ + mem_config = (struct rte_mem_config *) mmap(rte_mem_cfg_addr, + sizeof(*mem_config), PROT_READ | PROT_WRITE, MAP_SHARED, + mem_cfg_fd, 0); + close(mem_cfg_fd); + mem_cfg_fd = -1; + + if (mem_config == MAP_FAILED || mem_config != rte_mem_cfg_addr) { + if (mem_config != MAP_FAILED) { + /* errno is stale, don't use */ + RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config at [%p], got [%p]" + " - please use '--" OPT_BASE_VIRTADDR + "' option\n", + rte_mem_cfg_addr, mem_config); + munmap(mem_config, sizeof(struct rte_mem_config)); + return -1; + } + RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config! 
error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + + rte_config.mem_config = mem_config; + + return 0; +} + +/* Detect if we are a primary or a secondary process */ +enum rte_proc_type_t +eal_proc_type_detect(void) +{ + enum rte_proc_type_t ptype = RTE_PROC_PRIMARY; + const char *pathname = eal_runtime_config_path(); + + /* if there no shared config, there can be no secondary processes */ + if (!internal_config.no_shconf) { + /* if we can open the file but not get a write-lock we are a + * secondary process. NOTE: if we get a file handle back, we + * keep that open and don't close it to prevent a race condition + * between multiple opens. + */ + if (((mem_cfg_fd = open(pathname, O_RDWR)) >= 0) && + (fcntl(mem_cfg_fd, F_SETLK, &wr_lock) < 0)) + ptype = RTE_PROC_SECONDARY; + } + + RTE_LOG(INFO, EAL, "Auto-detected process type: %s\n", + ptype == RTE_PROC_PRIMARY ? "PRIMARY" : "SECONDARY"); + + return ptype; +} + +/* Sets up rte_config structure with the pointer to shared memory config.*/ +static int +rte_config_init(void) +{ + rte_config.process_type = internal_config.process_type; + + switch (rte_config.process_type){ + case RTE_PROC_PRIMARY: + if (rte_eal_config_create() < 0) + return -1; + eal_mcfg_update_from_internal(); + break; + case RTE_PROC_SECONDARY: + if (rte_eal_config_attach() < 0) + return -1; + eal_mcfg_wait_complete(); + if (eal_mcfg_check_version() < 0) { + RTE_LOG(ERR, EAL, "Primary and secondary process DPDK version mismatch\n"); + return -1; + } + if (rte_eal_config_reattach() < 0) + return -1; + eal_mcfg_update_internal(); + break; + case RTE_PROC_AUTO: + case RTE_PROC_INVALID: + RTE_LOG(ERR, EAL, "Invalid process type %d\n", + rte_config.process_type); + return -1; + } + + return 0; +} + +/* display usage */ +static void +eal_usage(const char *prgname) +{ + printf("\nUsage: %s ", prgname); + eal_common_usage(); + /* Allow the application to print its usage message too if hook is set */ + if ( rte_application_usage_hook ) { + printf("===== Application Usage =====\n\n"); + rte_application_usage_hook(prgname); + } +} + +/* Set a per-application usage message */ +rte_usage_hook_t +rte_set_application_usage_hook( rte_usage_hook_t usage_func ) +{ + rte_usage_hook_t old_func; + + /* Will be NULL on the first call to denote the last usage routine. */ + old_func = rte_application_usage_hook; + rte_application_usage_hook = usage_func; + + return old_func; +} + +static inline size_t +eal_get_hugepage_mem_size(void) +{ + uint64_t size = 0; + unsigned i, j; + + for (i = 0; i < internal_config.num_hugepage_sizes; i++) { + struct hugepage_info *hpi = &internal_config.hugepage_info[i]; + if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0) { + for (j = 0; j < RTE_MAX_NUMA_NODES; j++) { + size += hpi->hugepage_sz * hpi->num_pages[j]; + } + } + } + + return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX; +} + +/* Parse the arguments for --log-level only */ +static void +eal_log_level_parse(int argc, char **argv) +{ + int opt; + char **argvopt; + int option_index; + const int old_optind = optind; + const int old_optopt = optopt; + const int old_optreset = optreset; + char * const old_optarg = optarg; + + argvopt = argv; + optind = 1; + optreset = 1; + + while ((opt = getopt_long(argc, argvopt, eal_short_options, + eal_long_options, &option_index)) != EOF) { + + int ret; + + /* getopt is not happy, stop right now */ + if (opt == '?') + break; + + ret = (opt == OPT_LOG_LEVEL_NUM) ? 
+ eal_parse_common_option(opt, optarg, &internal_config) : 0; + + /* common parser is not happy */ + if (ret < 0) + break; + } + + /* restore getopt lib */ + optind = old_optind; + optopt = old_optopt; + optreset = old_optreset; + optarg = old_optarg; +} + +/* Parse the argument given in the command line of the application */ +static int +eal_parse_args(int argc, char **argv) +{ + int opt, ret; + char **argvopt; + int option_index; + char *prgname = argv[0]; + const int old_optind = optind; + const int old_optopt = optopt; + const int old_optreset = optreset; + char * const old_optarg = optarg; + + argvopt = argv; + optind = 1; + optreset = 1; + opterr = 0; + + while ((opt = getopt_long(argc, argvopt, eal_short_options, + eal_long_options, &option_index)) != EOF) { + + /* + * getopt didn't recognise the option, lets parse the + * registered options to see if the flag is valid + */ + if (opt == '?') { + ret = rte_option_parse(argv[optind-1]); + if (ret == 0) + continue; + + eal_usage(prgname); + ret = -1; + goto out; + } + + ret = eal_parse_common_option(opt, optarg, &internal_config); + /* common parser is not happy */ + if (ret < 0) { + eal_usage(prgname); + ret = -1; + goto out; + } + /* common parser handled this option */ + if (ret == 0) + continue; + + switch (opt) { + case OPT_MBUF_POOL_OPS_NAME_NUM: + { + char *ops_name = strdup(optarg); + if (ops_name == NULL) + RTE_LOG(ERR, EAL, "Could not store mbuf pool ops name\n"); + else { + /* free old ops name */ + if (internal_config.user_mbuf_pool_ops_name != + NULL) + free(internal_config.user_mbuf_pool_ops_name); + + internal_config.user_mbuf_pool_ops_name = + ops_name; + } + break; + } + case 'h': + eal_usage(prgname); + exit(EXIT_SUCCESS); + default: + if (opt < OPT_LONG_MIN_NUM && isprint(opt)) { + RTE_LOG(ERR, EAL, "Option %c is not supported " + "on FreeBSD\n", opt); + } else if (opt >= OPT_LONG_MIN_NUM && + opt < OPT_LONG_MAX_NUM) { + RTE_LOG(ERR, EAL, "Option %s is not supported " + "on FreeBSD\n", + eal_long_options[option_index].name); + } else { + RTE_LOG(ERR, EAL, "Option %d is not supported " + "on FreeBSD\n", opt); + } + eal_usage(prgname); + ret = -1; + goto out; + } + } + + /* create runtime data directory */ + if (internal_config.no_shconf == 0 && + eal_create_runtime_dir() < 0) { + RTE_LOG(ERR, EAL, "Cannot create runtime directory\n"); + ret = -1; + goto out; + } + + if (eal_adjust_config(&internal_config) != 0) { + ret = -1; + goto out; + } + + /* sanity checks */ + if (eal_check_common_options(&internal_config) != 0) { + eal_usage(prgname); + ret = -1; + goto out; + } + + if (optind >= 0) + argv[optind-1] = prgname; + ret = optind-1; + +out: + /* restore getopt lib */ + optind = old_optind; + optopt = old_optopt; + optreset = old_optreset; + optarg = old_optarg; + + return ret; +} + +static int +check_socket(const struct rte_memseg_list *msl, void *arg) +{ + int *socket_id = arg; + + if (msl->external) + return 0; + + if (msl->socket_id == *socket_id && msl->memseg_arr.count != 0) + return 1; + + return 0; +} + +static void +eal_check_mem_on_local_socket(void) +{ + int socket_id; + + socket_id = rte_lcore_to_socket_id(rte_config.master_lcore); + + if (rte_memseg_list_walk(check_socket, &socket_id) == 0) + RTE_LOG(WARNING, EAL, "WARNING: Master core has no memory on local socket!\n"); +} + + +static int +sync_func(__attribute__((unused)) void *arg) +{ + return 0; +} + +/* return non-zero if hugepages are enabled. 
*/ +int rte_eal_has_hugepages(void) +{ + return !internal_config.no_hugetlbfs; +} + +/* Abstraction for port I/0 privilege */ +int +rte_eal_iopl_init(void) +{ + static int fd = -1; + + if (fd < 0) + fd = open("/dev/io", O_RDWR); + + if (fd < 0) + return -1; + /* keep fd open for iopl */ + return 0; +} + +static void rte_eal_init_alert(const char *msg) +{ + fprintf(stderr, "EAL: FATAL: %s\n", msg); + RTE_LOG(ERR, EAL, "%s\n", msg); +} + +/* Launch threads, called at application init(). */ +int +rte_eal_init(int argc, char **argv) +{ + int i, fctret, ret; + pthread_t thread_id; + static rte_atomic32_t run_once = RTE_ATOMIC32_INIT(0); + char cpuset[RTE_CPU_AFFINITY_STR_LEN]; + char thread_name[RTE_MAX_THREAD_NAME_LEN]; + + /* checks if the machine is adequate */ + if (!rte_cpu_is_supported()) { + rte_eal_init_alert("unsupported cpu type."); + rte_errno = ENOTSUP; + return -1; + } + + if (!rte_atomic32_test_and_set(&run_once)) { + rte_eal_init_alert("already called initialization."); + rte_errno = EALREADY; + return -1; + } + + thread_id = pthread_self(); + + eal_reset_internal_config(&internal_config); + + /* set log level as early as possible */ + eal_log_level_parse(argc, argv); + + if (rte_eal_cpu_init() < 0) { + rte_eal_init_alert("Cannot detect lcores."); + rte_errno = ENOTSUP; + return -1; + } + + fctret = eal_parse_args(argc, argv); + if (fctret < 0) { + rte_eal_init_alert("Invalid 'command line' arguments."); + rte_errno = EINVAL; + rte_atomic32_clear(&run_once); + return -1; + } + + /* FreeBSD always uses legacy memory model */ + internal_config.legacy_mem = true; + + if (eal_plugins_init() < 0) { + rte_eal_init_alert("Cannot init plugins"); + rte_errno = EINVAL; + rte_atomic32_clear(&run_once); + return -1; + } + + if (eal_option_device_parse()) { + rte_errno = ENODEV; + rte_atomic32_clear(&run_once); + return -1; + } + + if (rte_config_init() < 0) { + rte_eal_init_alert("Cannot init config"); + return -1; + } + + if (rte_eal_intr_init() < 0) { + rte_eal_init_alert("Cannot init interrupt-handling thread"); + return -1; + } + + if (rte_eal_alarm_init() < 0) { + rte_eal_init_alert("Cannot init alarm"); + /* rte_eal_alarm_init sets rte_errno on failure. */ + return -1; + } + + /* Put mp channel init before bus scan so that we can init the vdev + * bus through mp channel in the secondary process before the bus scan. + */ + if (rte_mp_channel_init() < 0 && rte_errno != ENOTSUP) { + rte_eal_init_alert("failed to init mp channel"); + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + rte_errno = EFAULT; + return -1; + } + } + + if (rte_bus_scan()) { + rte_eal_init_alert("Cannot scan the buses for devices"); + rte_errno = ENODEV; + rte_atomic32_clear(&run_once); + return -1; + } + + /* if no EAL option "--iova-mode=", use bus IOVA scheme */ + if (internal_config.iova_mode == RTE_IOVA_DC) { + /* autodetect the IOVA mapping mode (default is RTE_IOVA_PA) */ + enum rte_iova_mode iova_mode = rte_bus_get_iommu_class(); + + if (iova_mode == RTE_IOVA_DC) + iova_mode = RTE_IOVA_PA; + rte_eal_get_configuration()->iova_mode = iova_mode; + } else { + rte_eal_get_configuration()->iova_mode = + internal_config.iova_mode; + } + + RTE_LOG(INFO, EAL, "Selected IOVA mode '%s'\n", + rte_eal_iova_mode() == RTE_IOVA_PA ? "PA" : "VA"); + + if (internal_config.no_hugetlbfs == 0) { + /* rte_config isn't initialized yet */ + ret = internal_config.process_type == RTE_PROC_PRIMARY ? 
+ eal_hugepage_info_init() : + eal_hugepage_info_read(); + if (ret < 0) { + rte_eal_init_alert("Cannot get hugepage information."); + rte_errno = EACCES; + rte_atomic32_clear(&run_once); + return -1; + } + } + + if (internal_config.memory == 0 && internal_config.force_sockets == 0) { + if (internal_config.no_hugetlbfs) + internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE; + else + internal_config.memory = eal_get_hugepage_mem_size(); + } + + if (internal_config.vmware_tsc_map == 1) { +#ifdef RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT + rte_cycles_vmware_tsc_map = 1; + RTE_LOG (DEBUG, EAL, "Using VMWARE TSC MAP, " + "you must have monitor_control.pseudo_perfctr = TRUE\n"); +#else + RTE_LOG (WARNING, EAL, "Ignoring --vmware-tsc-map because " + "RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT is not set\n"); +#endif + } + + /* in secondary processes, memory init may allocate additional fbarrays + * not present in primary processes, so to avoid any potential issues, + * initialize memzones first. + */ + if (rte_eal_memzone_init() < 0) { + rte_eal_init_alert("Cannot init memzone"); + rte_errno = ENODEV; + return -1; + } + + if (rte_eal_memory_init() < 0) { + rte_eal_init_alert("Cannot init memory"); + rte_errno = ENOMEM; + return -1; + } + + if (rte_eal_malloc_heap_init() < 0) { + rte_eal_init_alert("Cannot init malloc heap"); + rte_errno = ENODEV; + return -1; + } + + if (rte_eal_tailqs_init() < 0) { + rte_eal_init_alert("Cannot init tail queues for objects"); + rte_errno = EFAULT; + return -1; + } + + if (rte_eal_timer_init() < 0) { + rte_eal_init_alert("Cannot init HPET or TSC timers"); + rte_errno = ENOTSUP; + return -1; + } + + eal_check_mem_on_local_socket(); + + eal_thread_init_master(rte_config.master_lcore); + + ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset)); + + RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%p;cpuset=[%s%s])\n", + rte_config.master_lcore, thread_id, cpuset, + ret == 0 ? "" : "..."); + + RTE_LCORE_FOREACH_SLAVE(i) { + + /* + * create communication pipes between master thread + * and children + */ + if (pipe(lcore_config[i].pipe_master2slave) < 0) + rte_panic("Cannot create pipe\n"); + if (pipe(lcore_config[i].pipe_slave2master) < 0) + rte_panic("Cannot create pipe\n"); + + lcore_config[i].state = WAIT; + + /* create a thread for each lcore */ + ret = pthread_create(&lcore_config[i].thread_id, NULL, + eal_thread_loop, NULL); + if (ret != 0) + rte_panic("Cannot create thread\n"); + + /* Set thread_name for aid in debugging. */ + snprintf(thread_name, sizeof(thread_name), + "lcore-slave-%d", i); + rte_thread_setname(lcore_config[i].thread_id, thread_name); + } + + /* + * Launch a dummy function on all slave lcores, so that master lcore + * knows they are all ready when this function returns. + */ + rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER); + rte_eal_mp_wait_lcore(); + + /* initialize services so vdevs register service during bus_probe. */ + ret = rte_service_init(); + if (ret) { + rte_eal_init_alert("rte_service_init() failed"); + rte_errno = ENOEXEC; + return -1; + } + + /* Probe all the buses and devices/drivers on them */ + if (rte_bus_probe()) { + rte_eal_init_alert("Cannot probe devices"); + rte_errno = ENOTSUP; + return -1; + } + + /* initialize default service/lcore mappings and start running. Ignore + * -ENOTSUP, as it indicates no service coremask passed to EAL. + */ + ret = rte_service_start_with_defaults(); + if (ret < 0 && ret != -ENOTSUP) { + rte_errno = ENOEXEC; + return -1; + } + + /* + * Clean up unused files in runtime directory. 
We do this at the end of + * init and not at the beginning because we want to clean stuff up + * whether we are primary or secondary process, but we cannot remove + * primary process' files because secondary should be able to run even + * if primary process is dead. + * + * In no_shconf mode, no runtime directory is created in the first + * place, so no cleanup needed. + */ + if (!internal_config.no_shconf && eal_clean_runtime_dir() < 0) { + rte_eal_init_alert("Cannot clear runtime directory\n"); + return -1; + } + + eal_mcfg_complete(); + + /* Call each registered callback, if enabled */ + rte_option_init(); + + return fctret; +} + +int +rte_eal_cleanup(void) +{ + rte_service_finalize(); + rte_mp_channel_cleanup(); + eal_cleanup_config(&internal_config); + return 0; +} + +enum rte_proc_type_t +rte_eal_process_type(void) +{ + return rte_config.process_type; +} + +int rte_eal_has_pci(void) +{ + return !internal_config.no_pci; +} + +int rte_eal_create_uio_dev(void) +{ + return internal_config.create_uio_dev; +} + +enum rte_intr_mode +rte_eal_vfio_intr_mode(void) +{ + return RTE_INTR_MODE_NONE; +} + +int rte_vfio_setup_device(__rte_unused const char *sysfs_base, + __rte_unused const char *dev_addr, + __rte_unused int *vfio_dev_fd, + __rte_unused struct vfio_device_info *device_info) +{ + return -1; +} + +int rte_vfio_release_device(__rte_unused const char *sysfs_base, + __rte_unused const char *dev_addr, + __rte_unused int fd) +{ + return -1; +} + +int rte_vfio_enable(__rte_unused const char *modname) +{ + return -1; +} + +int rte_vfio_is_enabled(__rte_unused const char *modname) +{ + return 0; +} + +int rte_vfio_noiommu_is_enabled(void) +{ + return 0; +} + +int rte_vfio_clear_group(__rte_unused int vfio_group_fd) +{ + return 0; +} + +int +rte_vfio_get_group_num(__rte_unused const char *sysfs_base, + __rte_unused const char *dev_addr, + __rte_unused int *iommu_group_num) +{ + return -1; +} + +int +rte_vfio_get_container_fd(void) +{ + return -1; +} + +int +rte_vfio_get_group_fd(__rte_unused int iommu_group_num) +{ + return -1; +} + +int +rte_vfio_container_create(void) +{ + return -1; +} + +int +rte_vfio_container_destroy(__rte_unused int container_fd) +{ + return -1; +} + +int +rte_vfio_container_group_bind(__rte_unused int container_fd, + __rte_unused int iommu_group_num) +{ + return -1; +} + +int +rte_vfio_container_group_unbind(__rte_unused int container_fd, + __rte_unused int iommu_group_num) +{ + return -1; +} + +int +rte_vfio_container_dma_map(__rte_unused int container_fd, + __rte_unused uint64_t vaddr, + __rte_unused uint64_t iova, + __rte_unused uint64_t len) +{ + return -1; +} + +int +rte_vfio_container_dma_unmap(__rte_unused int container_fd, + __rte_unused uint64_t vaddr, + __rte_unused uint64_t iova, + __rte_unused uint64_t len) +{ + return -1; +} diff --git a/lib/librte_eal/freebsd/eal/Makefile b/lib/librte_eal/freebsd/eal/Makefile deleted file mode 100644 index 0c809d9872..0000000000 --- a/lib/librte_eal/freebsd/eal/Makefile +++ /dev/null @@ -1,93 +0,0 @@ -# SPDX-License-Identifier: BSD-3-Clause -# Copyright(c) 2010-2019 Intel Corporation - -include $(RTE_SDK)/mk/rte.vars.mk - -LIB = librte_eal.a - -ARCH_DIR ?= $(RTE_ARCH) -VPATH += $(RTE_SDK)/lib/librte_eal/common -VPATH += $(RTE_SDK)/lib/librte_eal/$(ARCH_DIR) - -CFLAGS += -DALLOW_EXPERIMENTAL_API -CFLAGS += -I$(SRCDIR)/include -CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common -CFLAGS += -I$(RTE_SDK)/lib/librte_eal/include -CFLAGS += $(WERROR_FLAGS) -O3 - -LDLIBS += -lexecinfo -LDLIBS += -lpthread -LDLIBS += -lgcc_s -LDLIBS += 
-lrte_kvargs - -EXPORT_MAP := ../../rte_eal_version.map - -# specific to freebsd exec-env -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) := eal.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_cpuflags.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_memory.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_hugepage_info.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_thread.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_debug.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_memalloc.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_lcore.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_timer.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_interrupts.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_alarm.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_dev.c - -# from common dir -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_lcore.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_timer.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_memzone.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_log.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_launch.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_mcfg.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_memalloc.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_memory.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_tailqs.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_errno.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_cpuflags.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_hypervisor.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_string_fns.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_hexdump.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_devargs.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_class.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_bus.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_dev.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_options.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_thread.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_proc.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_fbarray.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_uuid.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_malloc.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += hotplug_mp.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += malloc_elem.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += malloc_heap.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += malloc_mp.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_keepalive.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_option.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_service.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_random.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_reciprocal.c - -# from arch dir -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_cpuflags.c -SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_hypervisor.c -SRCS-$(CONFIG_RTE_ARCH_X86) += rte_spinlock.c -SRCS-y += rte_cycles.c - -CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST) - -# workaround for a gcc bug with noreturn attribute -# http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603 -ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y) -CFLAGS_eal_thread.o += -Wno-return-type -CFLAGS_eal_hpet.o += -Wno-return-type -endif - -INC := rte_os.h - -SYMLINK-$(CONFIG_RTE_EXEC_ENV_FREEBSD)-include := $(addprefix include/,$(INC)) - -include $(RTE_SDK)/mk/rte.lib.mk diff --git a/lib/librte_eal/freebsd/eal/eal.c b/lib/librte_eal/freebsd/eal/eal.c deleted file mode 100644 index 6ae37e7e69..0000000000 --- a/lib/librte_eal/freebsd/eal/eal.c +++ /dev/null @@ -1,1092 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 
2010-2018 Intel Corporation. - * Copyright(c) 2014 6WIND S.A. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "eal_private.h" -#include "eal_thread.h" -#include "eal_internal_cfg.h" -#include "eal_filesystem.h" -#include "eal_hugepages.h" -#include "eal_options.h" -#include "eal_memcfg.h" - -#define MEMSIZE_IF_NO_HUGE_PAGE (64ULL * 1024ULL * 1024ULL) - -/* Allow the application to print its usage message too if set */ -static rte_usage_hook_t rte_application_usage_hook = NULL; -/* early configuration structure, when memory config is not mmapped */ -static struct rte_mem_config early_mem_config; - -/* define fd variable here, because file needs to be kept open for the - * duration of the program, as we hold a write lock on it in the primary proc */ -static int mem_cfg_fd = -1; - -static struct flock wr_lock = { - .l_type = F_WRLCK, - .l_whence = SEEK_SET, - .l_start = offsetof(struct rte_mem_config, memsegs), - .l_len = sizeof(early_mem_config.memsegs), -}; - -/* Address of global and public configuration */ -static struct rte_config rte_config = { - .mem_config = &early_mem_config, -}; - -/* internal configuration (per-core) */ -struct lcore_config lcore_config[RTE_MAX_LCORE]; - -/* internal configuration */ -struct internal_config internal_config; - -/* used by rte_rdtsc() */ -int rte_cycles_vmware_tsc_map; - -/* platform-specific runtime dir */ -static char runtime_dir[PATH_MAX]; - -static const char *default_runtime_dir = "/var/run"; - -int -eal_create_runtime_dir(void) -{ - const char *directory = default_runtime_dir; - const char *xdg_runtime_dir = getenv("XDG_RUNTIME_DIR"); - const char *fallback = "/tmp"; - char tmp[PATH_MAX]; - int ret; - - if (getuid() != 0) { - /* try XDG path first, fall back to /tmp */ - if (xdg_runtime_dir != NULL) - directory = xdg_runtime_dir; - else - directory = fallback; - } - /* create DPDK subdirectory under runtime dir */ - ret = snprintf(tmp, sizeof(tmp), "%s/dpdk", directory); - if (ret < 0 || ret == sizeof(tmp)) { - RTE_LOG(ERR, EAL, "Error creating DPDK runtime path name\n"); - return -1; - } - - /* create prefix-specific subdirectory under DPDK runtime dir */ - ret = snprintf(runtime_dir, sizeof(runtime_dir), "%s/%s", - tmp, eal_get_hugefile_prefix()); - if (ret < 0 || ret == sizeof(runtime_dir)) { - RTE_LOG(ERR, EAL, "Error creating prefix-specific runtime path name\n"); - return -1; - } - - /* create the path if it doesn't exist. no "mkdir -p" here, so do it - * step by step. - */ - ret = mkdir(tmp, 0700); - if (ret < 0 && errno != EEXIST) { - RTE_LOG(ERR, EAL, "Error creating '%s': %s\n", - tmp, strerror(errno)); - return -1; - } - - ret = mkdir(runtime_dir, 0700); - if (ret < 0 && errno != EEXIST) { - RTE_LOG(ERR, EAL, "Error creating '%s': %s\n", - runtime_dir, strerror(errno)); - return -1; - } - - return 0; -} - -int -eal_clean_runtime_dir(void) -{ - /* FreeBSD doesn't need this implemented for now, because, unlike Linux, - * FreeBSD doesn't create per-process files, so no need to clean up. 
- */ - return 0; -} - - -const char * -rte_eal_get_runtime_dir(void) -{ - return runtime_dir; -} - -/* Return user provided mbuf pool ops name */ -const char * -rte_eal_mbuf_user_pool_ops(void) -{ - return internal_config.user_mbuf_pool_ops_name; -} - -/* Return a pointer to the configuration structure */ -struct rte_config * -rte_eal_get_configuration(void) -{ - return &rte_config; -} - -enum rte_iova_mode -rte_eal_iova_mode(void) -{ - return rte_eal_get_configuration()->iova_mode; -} - -/* parse a sysfs (or other) file containing one integer value */ -int -eal_parse_sysfs_value(const char *filename, unsigned long *val) -{ - FILE *f; - char buf[BUFSIZ]; - char *end = NULL; - - if ((f = fopen(filename, "r")) == NULL) { - RTE_LOG(ERR, EAL, "%s(): cannot open sysfs value %s\n", - __func__, filename); - return -1; - } - - if (fgets(buf, sizeof(buf), f) == NULL) { - RTE_LOG(ERR, EAL, "%s(): cannot read sysfs value %s\n", - __func__, filename); - fclose(f); - return -1; - } - *val = strtoul(buf, &end, 0); - if ((buf[0] == '\0') || (end == NULL) || (*end != '\n')) { - RTE_LOG(ERR, EAL, "%s(): cannot parse sysfs value %s\n", - __func__, filename); - fclose(f); - return -1; - } - fclose(f); - return 0; -} - - -/* create memory configuration in shared/mmap memory. Take out - * a write lock on the memsegs, so we can auto-detect primary/secondary. - * This means we never close the file while running (auto-close on exit). - * We also don't lock the whole file, so that in future we can use read-locks - * on other parts, e.g. memzones, to detect if there are running secondary - * processes. */ -static int -rte_eal_config_create(void) -{ - size_t page_sz = sysconf(_SC_PAGE_SIZE); - size_t cfg_len = sizeof(*rte_config.mem_config); - size_t cfg_len_aligned = RTE_ALIGN(cfg_len, page_sz); - void *rte_mem_cfg_addr, *mapped_mem_cfg_addr; - int retval; - - const char *pathname = eal_runtime_config_path(); - - if (internal_config.no_shconf) - return 0; - - /* map the config before base address so that we don't waste a page */ - if (internal_config.base_virtaddr != 0) - rte_mem_cfg_addr = (void *) - RTE_ALIGN_FLOOR(internal_config.base_virtaddr - - sizeof(struct rte_mem_config), page_sz); - else - rte_mem_cfg_addr = NULL; - - if (mem_cfg_fd < 0){ - mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0600); - if (mem_cfg_fd < 0) { - RTE_LOG(ERR, EAL, "Cannot open '%s' for rte_mem_config\n", - pathname); - return -1; - } - } - - retval = ftruncate(mem_cfg_fd, cfg_len); - if (retval < 0){ - close(mem_cfg_fd); - mem_cfg_fd = -1; - RTE_LOG(ERR, EAL, "Cannot resize '%s' for rte_mem_config\n", - pathname); - return -1; - } - - retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock); - if (retval < 0){ - close(mem_cfg_fd); - mem_cfg_fd = -1; - RTE_LOG(ERR, EAL, "Cannot create lock on '%s'. 
Is another primary " - "process running?\n", pathname); - return -1; - } - - /* reserve space for config */ - rte_mem_cfg_addr = eal_get_virtual_area(rte_mem_cfg_addr, - &cfg_len_aligned, page_sz, 0, 0); - if (rte_mem_cfg_addr == NULL) { - RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config\n"); - close(mem_cfg_fd); - mem_cfg_fd = -1; - return -1; - } - - /* remap the actual file into the space we've just reserved */ - mapped_mem_cfg_addr = mmap(rte_mem_cfg_addr, - cfg_len_aligned, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, mem_cfg_fd, 0); - if (mapped_mem_cfg_addr == MAP_FAILED) { - RTE_LOG(ERR, EAL, "Cannot remap memory for rte_config\n"); - munmap(rte_mem_cfg_addr, cfg_len); - close(mem_cfg_fd); - mem_cfg_fd = -1; - return -1; - } - - memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config)); - rte_config.mem_config = rte_mem_cfg_addr; - - /* store address of the config in the config itself so that secondary - * processes could later map the config into this exact location - */ - rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr; - - return 0; -} - -/* attach to an existing shared memory config */ -static int -rte_eal_config_attach(void) -{ - void *rte_mem_cfg_addr; - const char *pathname = eal_runtime_config_path(); - - if (internal_config.no_shconf) - return 0; - - if (mem_cfg_fd < 0){ - mem_cfg_fd = open(pathname, O_RDWR); - if (mem_cfg_fd < 0) { - RTE_LOG(ERR, EAL, "Cannot open '%s' for rte_mem_config\n", - pathname); - return -1; - } - } - - rte_mem_cfg_addr = mmap(NULL, sizeof(*rte_config.mem_config), - PROT_READ, MAP_SHARED, mem_cfg_fd, 0); - /* don't close the fd here, it will be closed on reattach */ - if (rte_mem_cfg_addr == MAP_FAILED) { - close(mem_cfg_fd); - mem_cfg_fd = -1; - RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config! error %i (%s)\n", - errno, strerror(errno)); - return -1; - } - - rte_config.mem_config = rte_mem_cfg_addr; - - return 0; -} - -/* reattach the shared config at exact memory location primary process has it */ -static int -rte_eal_config_reattach(void) -{ - struct rte_mem_config *mem_config; - void *rte_mem_cfg_addr; - - if (internal_config.no_shconf) - return 0; - - /* save the address primary process has mapped shared config to */ - rte_mem_cfg_addr = - (void *)(uintptr_t)rte_config.mem_config->mem_cfg_addr; - - /* unmap original config */ - munmap(rte_config.mem_config, sizeof(struct rte_mem_config)); - - /* remap the config at proper address */ - mem_config = (struct rte_mem_config *) mmap(rte_mem_cfg_addr, - sizeof(*mem_config), PROT_READ | PROT_WRITE, MAP_SHARED, - mem_cfg_fd, 0); - close(mem_cfg_fd); - mem_cfg_fd = -1; - - if (mem_config == MAP_FAILED || mem_config != rte_mem_cfg_addr) { - if (mem_config != MAP_FAILED) { - /* errno is stale, don't use */ - RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config at [%p], got [%p]" - " - please use '--" OPT_BASE_VIRTADDR - "' option\n", - rte_mem_cfg_addr, mem_config); - munmap(mem_config, sizeof(struct rte_mem_config)); - return -1; - } - RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config! 
error %i (%s)\n", - errno, strerror(errno)); - return -1; - } - - rte_config.mem_config = mem_config; - - return 0; -} - -/* Detect if we are a primary or a secondary process */ -enum rte_proc_type_t -eal_proc_type_detect(void) -{ - enum rte_proc_type_t ptype = RTE_PROC_PRIMARY; - const char *pathname = eal_runtime_config_path(); - - /* if there no shared config, there can be no secondary processes */ - if (!internal_config.no_shconf) { - /* if we can open the file but not get a write-lock we are a - * secondary process. NOTE: if we get a file handle back, we - * keep that open and don't close it to prevent a race condition - * between multiple opens. - */ - if (((mem_cfg_fd = open(pathname, O_RDWR)) >= 0) && - (fcntl(mem_cfg_fd, F_SETLK, &wr_lock) < 0)) - ptype = RTE_PROC_SECONDARY; - } - - RTE_LOG(INFO, EAL, "Auto-detected process type: %s\n", - ptype == RTE_PROC_PRIMARY ? "PRIMARY" : "SECONDARY"); - - return ptype; -} - -/* Sets up rte_config structure with the pointer to shared memory config.*/ -static int -rte_config_init(void) -{ - rte_config.process_type = internal_config.process_type; - - switch (rte_config.process_type){ - case RTE_PROC_PRIMARY: - if (rte_eal_config_create() < 0) - return -1; - eal_mcfg_update_from_internal(); - break; - case RTE_PROC_SECONDARY: - if (rte_eal_config_attach() < 0) - return -1; - eal_mcfg_wait_complete(); - if (eal_mcfg_check_version() < 0) { - RTE_LOG(ERR, EAL, "Primary and secondary process DPDK version mismatch\n"); - return -1; - } - if (rte_eal_config_reattach() < 0) - return -1; - eal_mcfg_update_internal(); - break; - case RTE_PROC_AUTO: - case RTE_PROC_INVALID: - RTE_LOG(ERR, EAL, "Invalid process type %d\n", - rte_config.process_type); - return -1; - } - - return 0; -} - -/* display usage */ -static void -eal_usage(const char *prgname) -{ - printf("\nUsage: %s ", prgname); - eal_common_usage(); - /* Allow the application to print its usage message too if hook is set */ - if ( rte_application_usage_hook ) { - printf("===== Application Usage =====\n\n"); - rte_application_usage_hook(prgname); - } -} - -/* Set a per-application usage message */ -rte_usage_hook_t -rte_set_application_usage_hook( rte_usage_hook_t usage_func ) -{ - rte_usage_hook_t old_func; - - /* Will be NULL on the first call to denote the last usage routine. */ - old_func = rte_application_usage_hook; - rte_application_usage_hook = usage_func; - - return old_func; -} - -static inline size_t -eal_get_hugepage_mem_size(void) -{ - uint64_t size = 0; - unsigned i, j; - - for (i = 0; i < internal_config.num_hugepage_sizes; i++) { - struct hugepage_info *hpi = &internal_config.hugepage_info[i]; - if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0) { - for (j = 0; j < RTE_MAX_NUMA_NODES; j++) { - size += hpi->hugepage_sz * hpi->num_pages[j]; - } - } - } - - return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX; -} - -/* Parse the arguments for --log-level only */ -static void -eal_log_level_parse(int argc, char **argv) -{ - int opt; - char **argvopt; - int option_index; - const int old_optind = optind; - const int old_optopt = optopt; - const int old_optreset = optreset; - char * const old_optarg = optarg; - - argvopt = argv; - optind = 1; - optreset = 1; - - while ((opt = getopt_long(argc, argvopt, eal_short_options, - eal_long_options, &option_index)) != EOF) { - - int ret; - - /* getopt is not happy, stop right now */ - if (opt == '?') - break; - - ret = (opt == OPT_LOG_LEVEL_NUM) ? 
- eal_parse_common_option(opt, optarg, &internal_config) : 0; - - /* common parser is not happy */ - if (ret < 0) - break; - } - - /* restore getopt lib */ - optind = old_optind; - optopt = old_optopt; - optreset = old_optreset; - optarg = old_optarg; -} - -/* Parse the argument given in the command line of the application */ -static int -eal_parse_args(int argc, char **argv) -{ - int opt, ret; - char **argvopt; - int option_index; - char *prgname = argv[0]; - const int old_optind = optind; - const int old_optopt = optopt; - const int old_optreset = optreset; - char * const old_optarg = optarg; - - argvopt = argv; - optind = 1; - optreset = 1; - opterr = 0; - - while ((opt = getopt_long(argc, argvopt, eal_short_options, - eal_long_options, &option_index)) != EOF) { - - /* - * getopt didn't recognise the option, lets parse the - * registered options to see if the flag is valid - */ - if (opt == '?') { - ret = rte_option_parse(argv[optind-1]); - if (ret == 0) - continue; - - eal_usage(prgname); - ret = -1; - goto out; - } - - ret = eal_parse_common_option(opt, optarg, &internal_config); - /* common parser is not happy */ - if (ret < 0) { - eal_usage(prgname); - ret = -1; - goto out; - } - /* common parser handled this option */ - if (ret == 0) - continue; - - switch (opt) { - case OPT_MBUF_POOL_OPS_NAME_NUM: - { - char *ops_name = strdup(optarg); - if (ops_name == NULL) - RTE_LOG(ERR, EAL, "Could not store mbuf pool ops name\n"); - else { - /* free old ops name */ - if (internal_config.user_mbuf_pool_ops_name != - NULL) - free(internal_config.user_mbuf_pool_ops_name); - - internal_config.user_mbuf_pool_ops_name = - ops_name; - } - break; - } - case 'h': - eal_usage(prgname); - exit(EXIT_SUCCESS); - default: - if (opt < OPT_LONG_MIN_NUM && isprint(opt)) { - RTE_LOG(ERR, EAL, "Option %c is not supported " - "on FreeBSD\n", opt); - } else if (opt >= OPT_LONG_MIN_NUM && - opt < OPT_LONG_MAX_NUM) { - RTE_LOG(ERR, EAL, "Option %s is not supported " - "on FreeBSD\n", - eal_long_options[option_index].name); - } else { - RTE_LOG(ERR, EAL, "Option %d is not supported " - "on FreeBSD\n", opt); - } - eal_usage(prgname); - ret = -1; - goto out; - } - } - - /* create runtime data directory */ - if (internal_config.no_shconf == 0 && - eal_create_runtime_dir() < 0) { - RTE_LOG(ERR, EAL, "Cannot create runtime directory\n"); - ret = -1; - goto out; - } - - if (eal_adjust_config(&internal_config) != 0) { - ret = -1; - goto out; - } - - /* sanity checks */ - if (eal_check_common_options(&internal_config) != 0) { - eal_usage(prgname); - ret = -1; - goto out; - } - - if (optind >= 0) - argv[optind-1] = prgname; - ret = optind-1; - -out: - /* restore getopt lib */ - optind = old_optind; - optopt = old_optopt; - optreset = old_optreset; - optarg = old_optarg; - - return ret; -} - -static int -check_socket(const struct rte_memseg_list *msl, void *arg) -{ - int *socket_id = arg; - - if (msl->external) - return 0; - - if (msl->socket_id == *socket_id && msl->memseg_arr.count != 0) - return 1; - - return 0; -} - -static void -eal_check_mem_on_local_socket(void) -{ - int socket_id; - - socket_id = rte_lcore_to_socket_id(rte_config.master_lcore); - - if (rte_memseg_list_walk(check_socket, &socket_id) == 0) - RTE_LOG(WARNING, EAL, "WARNING: Master core has no memory on local socket!\n"); -} - - -static int -sync_func(__attribute__((unused)) void *arg) -{ - return 0; -} - -/* return non-zero if hugepages are enabled. 
*/ -int rte_eal_has_hugepages(void) -{ - return !internal_config.no_hugetlbfs; -} - -/* Abstraction for port I/0 privilege */ -int -rte_eal_iopl_init(void) -{ - static int fd = -1; - - if (fd < 0) - fd = open("/dev/io", O_RDWR); - - if (fd < 0) - return -1; - /* keep fd open for iopl */ - return 0; -} - -static void rte_eal_init_alert(const char *msg) -{ - fprintf(stderr, "EAL: FATAL: %s\n", msg); - RTE_LOG(ERR, EAL, "%s\n", msg); -} - -/* Launch threads, called at application init(). */ -int -rte_eal_init(int argc, char **argv) -{ - int i, fctret, ret; - pthread_t thread_id; - static rte_atomic32_t run_once = RTE_ATOMIC32_INIT(0); - char cpuset[RTE_CPU_AFFINITY_STR_LEN]; - char thread_name[RTE_MAX_THREAD_NAME_LEN]; - - /* checks if the machine is adequate */ - if (!rte_cpu_is_supported()) { - rte_eal_init_alert("unsupported cpu type."); - rte_errno = ENOTSUP; - return -1; - } - - if (!rte_atomic32_test_and_set(&run_once)) { - rte_eal_init_alert("already called initialization."); - rte_errno = EALREADY; - return -1; - } - - thread_id = pthread_self(); - - eal_reset_internal_config(&internal_config); - - /* set log level as early as possible */ - eal_log_level_parse(argc, argv); - - if (rte_eal_cpu_init() < 0) { - rte_eal_init_alert("Cannot detect lcores."); - rte_errno = ENOTSUP; - return -1; - } - - fctret = eal_parse_args(argc, argv); - if (fctret < 0) { - rte_eal_init_alert("Invalid 'command line' arguments."); - rte_errno = EINVAL; - rte_atomic32_clear(&run_once); - return -1; - } - - /* FreeBSD always uses legacy memory model */ - internal_config.legacy_mem = true; - - if (eal_plugins_init() < 0) { - rte_eal_init_alert("Cannot init plugins"); - rte_errno = EINVAL; - rte_atomic32_clear(&run_once); - return -1; - } - - if (eal_option_device_parse()) { - rte_errno = ENODEV; - rte_atomic32_clear(&run_once); - return -1; - } - - if (rte_config_init() < 0) { - rte_eal_init_alert("Cannot init config"); - return -1; - } - - if (rte_eal_intr_init() < 0) { - rte_eal_init_alert("Cannot init interrupt-handling thread"); - return -1; - } - - if (rte_eal_alarm_init() < 0) { - rte_eal_init_alert("Cannot init alarm"); - /* rte_eal_alarm_init sets rte_errno on failure. */ - return -1; - } - - /* Put mp channel init before bus scan so that we can init the vdev - * bus through mp channel in the secondary process before the bus scan. - */ - if (rte_mp_channel_init() < 0 && rte_errno != ENOTSUP) { - rte_eal_init_alert("failed to init mp channel"); - if (rte_eal_process_type() == RTE_PROC_PRIMARY) { - rte_errno = EFAULT; - return -1; - } - } - - if (rte_bus_scan()) { - rte_eal_init_alert("Cannot scan the buses for devices"); - rte_errno = ENODEV; - rte_atomic32_clear(&run_once); - return -1; - } - - /* if no EAL option "--iova-mode=", use bus IOVA scheme */ - if (internal_config.iova_mode == RTE_IOVA_DC) { - /* autodetect the IOVA mapping mode (default is RTE_IOVA_PA) */ - enum rte_iova_mode iova_mode = rte_bus_get_iommu_class(); - - if (iova_mode == RTE_IOVA_DC) - iova_mode = RTE_IOVA_PA; - rte_eal_get_configuration()->iova_mode = iova_mode; - } else { - rte_eal_get_configuration()->iova_mode = - internal_config.iova_mode; - } - - RTE_LOG(INFO, EAL, "Selected IOVA mode '%s'\n", - rte_eal_iova_mode() == RTE_IOVA_PA ? "PA" : "VA"); - - if (internal_config.no_hugetlbfs == 0) { - /* rte_config isn't initialized yet */ - ret = internal_config.process_type == RTE_PROC_PRIMARY ? 
- eal_hugepage_info_init() : - eal_hugepage_info_read(); - if (ret < 0) { - rte_eal_init_alert("Cannot get hugepage information."); - rte_errno = EACCES; - rte_atomic32_clear(&run_once); - return -1; - } - } - - if (internal_config.memory == 0 && internal_config.force_sockets == 0) { - if (internal_config.no_hugetlbfs) - internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE; - else - internal_config.memory = eal_get_hugepage_mem_size(); - } - - if (internal_config.vmware_tsc_map == 1) { -#ifdef RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT - rte_cycles_vmware_tsc_map = 1; - RTE_LOG (DEBUG, EAL, "Using VMWARE TSC MAP, " - "you must have monitor_control.pseudo_perfctr = TRUE\n"); -#else - RTE_LOG (WARNING, EAL, "Ignoring --vmware-tsc-map because " - "RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT is not set\n"); -#endif - } - - /* in secondary processes, memory init may allocate additional fbarrays - * not present in primary processes, so to avoid any potential issues, - * initialize memzones first. - */ - if (rte_eal_memzone_init() < 0) { - rte_eal_init_alert("Cannot init memzone"); - rte_errno = ENODEV; - return -1; - } - - if (rte_eal_memory_init() < 0) { - rte_eal_init_alert("Cannot init memory"); - rte_errno = ENOMEM; - return -1; - } - - if (rte_eal_malloc_heap_init() < 0) { - rte_eal_init_alert("Cannot init malloc heap"); - rte_errno = ENODEV; - return -1; - } - - if (rte_eal_tailqs_init() < 0) { - rte_eal_init_alert("Cannot init tail queues for objects"); - rte_errno = EFAULT; - return -1; - } - - if (rte_eal_timer_init() < 0) { - rte_eal_init_alert("Cannot init HPET or TSC timers"); - rte_errno = ENOTSUP; - return -1; - } - - eal_check_mem_on_local_socket(); - - eal_thread_init_master(rte_config.master_lcore); - - ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset)); - - RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%p;cpuset=[%s%s])\n", - rte_config.master_lcore, thread_id, cpuset, - ret == 0 ? "" : "..."); - - RTE_LCORE_FOREACH_SLAVE(i) { - - /* - * create communication pipes between master thread - * and children - */ - if (pipe(lcore_config[i].pipe_master2slave) < 0) - rte_panic("Cannot create pipe\n"); - if (pipe(lcore_config[i].pipe_slave2master) < 0) - rte_panic("Cannot create pipe\n"); - - lcore_config[i].state = WAIT; - - /* create a thread for each lcore */ - ret = pthread_create(&lcore_config[i].thread_id, NULL, - eal_thread_loop, NULL); - if (ret != 0) - rte_panic("Cannot create thread\n"); - - /* Set thread_name for aid in debugging. */ - snprintf(thread_name, sizeof(thread_name), - "lcore-slave-%d", i); - rte_thread_setname(lcore_config[i].thread_id, thread_name); - } - - /* - * Launch a dummy function on all slave lcores, so that master lcore - * knows they are all ready when this function returns. - */ - rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER); - rte_eal_mp_wait_lcore(); - - /* initialize services so vdevs register service during bus_probe. */ - ret = rte_service_init(); - if (ret) { - rte_eal_init_alert("rte_service_init() failed"); - rte_errno = ENOEXEC; - return -1; - } - - /* Probe all the buses and devices/drivers on them */ - if (rte_bus_probe()) { - rte_eal_init_alert("Cannot probe devices"); - rte_errno = ENOTSUP; - return -1; - } - - /* initialize default service/lcore mappings and start running. Ignore - * -ENOTSUP, as it indicates no service coremask passed to EAL. - */ - ret = rte_service_start_with_defaults(); - if (ret < 0 && ret != -ENOTSUP) { - rte_errno = ENOEXEC; - return -1; - } - - /* - * Clean up unused files in runtime directory. 
We do this at the end of - * init and not at the beginning because we want to clean stuff up - * whether we are primary or secondary process, but we cannot remove - * primary process' files because secondary should be able to run even - * if primary process is dead. - * - * In no_shconf mode, no runtime directory is created in the first - * place, so no cleanup needed. - */ - if (!internal_config.no_shconf && eal_clean_runtime_dir() < 0) { - rte_eal_init_alert("Cannot clear runtime directory\n"); - return -1; - } - - eal_mcfg_complete(); - - /* Call each registered callback, if enabled */ - rte_option_init(); - - return fctret; -} - -int -rte_eal_cleanup(void) -{ - rte_service_finalize(); - rte_mp_channel_cleanup(); - eal_cleanup_config(&internal_config); - return 0; -} - -enum rte_proc_type_t -rte_eal_process_type(void) -{ - return rte_config.process_type; -} - -int rte_eal_has_pci(void) -{ - return !internal_config.no_pci; -} - -int rte_eal_create_uio_dev(void) -{ - return internal_config.create_uio_dev; -} - -enum rte_intr_mode -rte_eal_vfio_intr_mode(void) -{ - return RTE_INTR_MODE_NONE; -} - -int rte_vfio_setup_device(__rte_unused const char *sysfs_base, - __rte_unused const char *dev_addr, - __rte_unused int *vfio_dev_fd, - __rte_unused struct vfio_device_info *device_info) -{ - return -1; -} - -int rte_vfio_release_device(__rte_unused const char *sysfs_base, - __rte_unused const char *dev_addr, - __rte_unused int fd) -{ - return -1; -} - -int rte_vfio_enable(__rte_unused const char *modname) -{ - return -1; -} - -int rte_vfio_is_enabled(__rte_unused const char *modname) -{ - return 0; -} - -int rte_vfio_noiommu_is_enabled(void) -{ - return 0; -} - -int rte_vfio_clear_group(__rte_unused int vfio_group_fd) -{ - return 0; -} - -int -rte_vfio_get_group_num(__rte_unused const char *sysfs_base, - __rte_unused const char *dev_addr, - __rte_unused int *iommu_group_num) -{ - return -1; -} - -int -rte_vfio_get_container_fd(void) -{ - return -1; -} - -int -rte_vfio_get_group_fd(__rte_unused int iommu_group_num) -{ - return -1; -} - -int -rte_vfio_container_create(void) -{ - return -1; -} - -int -rte_vfio_container_destroy(__rte_unused int container_fd) -{ - return -1; -} - -int -rte_vfio_container_group_bind(__rte_unused int container_fd, - __rte_unused int iommu_group_num) -{ - return -1; -} - -int -rte_vfio_container_group_unbind(__rte_unused int container_fd, - __rte_unused int iommu_group_num) -{ - return -1; -} - -int -rte_vfio_container_dma_map(__rte_unused int container_fd, - __rte_unused uint64_t vaddr, - __rte_unused uint64_t iova, - __rte_unused uint64_t len) -{ - return -1; -} - -int -rte_vfio_container_dma_unmap(__rte_unused int container_fd, - __rte_unused uint64_t vaddr, - __rte_unused uint64_t iova, - __rte_unused uint64_t len) -{ - return -1; -} diff --git a/lib/librte_eal/freebsd/eal/eal_alarm.c b/lib/librte_eal/freebsd/eal/eal_alarm.c deleted file mode 100644 index 51ea4b8c08..0000000000 --- a/lib/librte_eal/freebsd/eal/eal_alarm.c +++ /dev/null @@ -1,314 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2018 Intel Corporation - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "eal_private.h" -#include "eal_alarm_private.h" - -#define NS_PER_US 1000 - -#ifdef CLOCK_MONOTONIC_RAW /* Defined in glibc bits/time.h */ -#define CLOCK_TYPE_ID CLOCK_MONOTONIC_RAW -#else -#define CLOCK_TYPE_ID CLOCK_MONOTONIC -#endif - -struct alarm_entry { - 
LIST_ENTRY(alarm_entry) next; - struct rte_intr_handle handle; - struct timespec time; - rte_eal_alarm_callback cb_fn; - void *cb_arg; - volatile uint8_t executing; - volatile pthread_t executing_id; -}; - -static LIST_HEAD(alarm_list, alarm_entry) alarm_list = LIST_HEAD_INITIALIZER(); -static rte_spinlock_t alarm_list_lk = RTE_SPINLOCK_INITIALIZER; - -static struct rte_intr_handle intr_handle = {.fd = -1 }; -static void eal_alarm_callback(void *arg); - -int -rte_eal_alarm_init(void) -{ - intr_handle.type = RTE_INTR_HANDLE_ALARM; - - /* on FreeBSD, timers don't use fd's, and their identifiers are stored - * in separate namespace from fd's, so using any value is OK. however, - * EAL interrupts handler expects fd's to be unique, so use an actual fd - * to guarantee unique timer identifier. - */ - intr_handle.fd = open("/dev/zero", O_RDONLY); - - return 0; -} - -static inline int -timespec_cmp(const struct timespec *now, const struct timespec *at) -{ - if (now->tv_sec < at->tv_sec) - return -1; - if (now->tv_sec > at->tv_sec) - return 1; - if (now->tv_nsec < at->tv_nsec) - return -1; - if (now->tv_nsec > at->tv_nsec) - return 1; - return 0; -} - -static inline uint64_t -diff_ns(struct timespec *now, struct timespec *at) -{ - uint64_t now_ns, at_ns; - - if (timespec_cmp(now, at) >= 0) - return 0; - - now_ns = now->tv_sec * NS_PER_S + now->tv_nsec; - at_ns = at->tv_sec * NS_PER_S + at->tv_nsec; - - return at_ns - now_ns; -} - -int -eal_alarm_get_timeout_ns(uint64_t *val) -{ - struct alarm_entry *ap; - struct timespec now; - - if (clock_gettime(CLOCK_TYPE_ID, &now) < 0) - return -1; - - if (LIST_EMPTY(&alarm_list)) - return -1; - - ap = LIST_FIRST(&alarm_list); - - *val = diff_ns(&now, &ap->time); - - return 0; -} - -static int -unregister_current_callback(void) -{ - struct alarm_entry *ap; - int ret = 0; - - if (!LIST_EMPTY(&alarm_list)) { - ap = LIST_FIRST(&alarm_list); - - do { - ret = rte_intr_callback_unregister(&intr_handle, - eal_alarm_callback, &ap->time); - } while (ret == -EAGAIN); - } - - return ret; -} - -static int -register_first_callback(void) -{ - struct alarm_entry *ap; - int ret = 0; - - if (!LIST_EMPTY(&alarm_list)) { - ap = LIST_FIRST(&alarm_list); - - /* register a new callback */ - ret = rte_intr_callback_register(&intr_handle, - eal_alarm_callback, &ap->time); - } - return ret; -} - -static void -eal_alarm_callback(void *arg __rte_unused) -{ - struct timespec now; - struct alarm_entry *ap; - - rte_spinlock_lock(&alarm_list_lk); - ap = LIST_FIRST(&alarm_list); - - if (clock_gettime(CLOCK_TYPE_ID, &now) < 0) - return; - - while (ap != NULL && timespec_cmp(&now, &ap->time) >= 0) { - ap->executing = 1; - ap->executing_id = pthread_self(); - rte_spinlock_unlock(&alarm_list_lk); - - ap->cb_fn(ap->cb_arg); - - rte_spinlock_lock(&alarm_list_lk); - - LIST_REMOVE(ap, next); - free(ap); - - ap = LIST_FIRST(&alarm_list); - } - - /* timer has been deleted from the kqueue, so recreate it if needed */ - register_first_callback(); - - rte_spinlock_unlock(&alarm_list_lk); -} - - -int -rte_eal_alarm_set(uint64_t us, rte_eal_alarm_callback cb_fn, void *cb_arg) -{ - struct alarm_entry *ap, *new_alarm; - struct timespec now; - uint64_t ns; - int ret = 0; - - /* check parameters, also ensure us won't cause a uint64_t overflow */ - if (us < 1 || us > (UINT64_MAX - US_PER_S) || cb_fn == NULL) - return -EINVAL; - - new_alarm = calloc(1, sizeof(*new_alarm)); - if (new_alarm == NULL) - return -ENOMEM; - - /* use current time to calculate absolute time of alarm */ - clock_gettime(CLOCK_TYPE_ID, 
&now); - - ns = us * NS_PER_US; - - new_alarm->cb_fn = cb_fn; - new_alarm->cb_arg = cb_arg; - new_alarm->time.tv_nsec = (now.tv_nsec + ns) % NS_PER_S; - new_alarm->time.tv_sec = now.tv_sec + ((now.tv_nsec + ns) / NS_PER_S); - - rte_spinlock_lock(&alarm_list_lk); - - if (LIST_EMPTY(&alarm_list)) - LIST_INSERT_HEAD(&alarm_list, new_alarm, next); - else { - LIST_FOREACH(ap, &alarm_list, next) { - if (timespec_cmp(&new_alarm->time, &ap->time) < 0) { - LIST_INSERT_BEFORE(ap, new_alarm, next); - break; - } - if (LIST_NEXT(ap, next) == NULL) { - LIST_INSERT_AFTER(ap, new_alarm, next); - break; - } - } - } - - /* re-register first callback just in case */ - register_first_callback(); - - rte_spinlock_unlock(&alarm_list_lk); - - return ret; -} - -int -rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn, void *cb_arg) -{ - struct alarm_entry *ap, *ap_prev; - int count = 0; - int err = 0; - int executing; - - if (!cb_fn) { - rte_errno = EINVAL; - return -1; - } - - do { - executing = 0; - rte_spinlock_lock(&alarm_list_lk); - /* remove any matches at the start of the list */ - while (1) { - ap = LIST_FIRST(&alarm_list); - if (ap == NULL) - break; - if (cb_fn != ap->cb_fn) - break; - if (cb_arg != ap->cb_arg && cb_arg != (void *) -1) - break; - if (ap->executing == 0) { - LIST_REMOVE(ap, next); - free(ap); - count++; - } else { - /* If calling from other context, mark that - * alarm is executing so loop can spin till it - * finish. Otherwise we are trying to cancel - * ourselves - mark it by EINPROGRESS. - */ - if (pthread_equal(ap->executing_id, - pthread_self()) == 0) - executing++; - else - err = EINPROGRESS; - - break; - } - } - ap_prev = ap; - - /* now go through list, removing entries not at start */ - LIST_FOREACH(ap, &alarm_list, next) { - /* this won't be true first time through */ - if (cb_fn == ap->cb_fn && - (cb_arg == (void *)-1 || - cb_arg == ap->cb_arg)) { - if (ap->executing == 0) { - LIST_REMOVE(ap, next); - free(ap); - count++; - ap = ap_prev; - } else if (pthread_equal(ap->executing_id, - pthread_self()) == 0) { - executing++; - } else { - err = EINPROGRESS; - } - } - ap_prev = ap; - } - rte_spinlock_unlock(&alarm_list_lk); - } while (executing != 0); - - if (count == 0 && err == 0) - rte_errno = ENOENT; - else if (err) - rte_errno = err; - - rte_spinlock_lock(&alarm_list_lk); - - /* unregister if no alarms left, otherwise re-register first */ - if (LIST_EMPTY(&alarm_list)) - unregister_current_callback(); - else - register_first_callback(); - - rte_spinlock_unlock(&alarm_list_lk); - - return count; -} diff --git a/lib/librte_eal/freebsd/eal/eal_alarm_private.h b/lib/librte_eal/freebsd/eal/eal_alarm_private.h deleted file mode 100644 index 65c7115184..0000000000 --- a/lib/librte_eal/freebsd/eal/eal_alarm_private.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2018 Intel Corporation - */ - -#ifndef EAL_ALARM_PRIVATE_H -#define EAL_ALARM_PRIVATE_H - -#include - -/* - * FreeBSD needs a back-channel communication mechanism between interrupt and - * alarm thread, because on FreeBSD, timer period is set up inside the interrupt - * API and not inside alarm API like on Linux. 
- */ - -int -eal_alarm_get_timeout_ns(uint64_t *val); - -#endif // EAL_ALARM_PRIVATE_H diff --git a/lib/librte_eal/freebsd/eal/eal_cpuflags.c b/lib/librte_eal/freebsd/eal/eal_cpuflags.c deleted file mode 100644 index 69b161ea65..0000000000 --- a/lib/librte_eal/freebsd/eal/eal_cpuflags.c +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright 2018 Mellanox Technologies, Ltd - */ - -#include -#include - -unsigned long -rte_cpu_getauxval(unsigned long type __rte_unused) -{ - /* not implemented */ - return 0; -} - -int -rte_cpu_strcmp_auxval(unsigned long type __rte_unused, - const char *str __rte_unused) -{ - /* not implemented */ - return -1; -} diff --git a/lib/librte_eal/freebsd/eal/eal_debug.c b/lib/librte_eal/freebsd/eal/eal_debug.c deleted file mode 100644 index 5d92500bf5..0000000000 --- a/lib/librte_eal/freebsd/eal/eal_debug.c +++ /dev/null @@ -1,92 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2014 Intel Corporation - */ - -#ifdef RTE_BACKTRACE -#include -#endif -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#define BACKTRACE_SIZE 256 - -/* dump the stack of the calling core */ -void rte_dump_stack(void) -{ -#ifdef RTE_BACKTRACE - void *func[BACKTRACE_SIZE]; - char **symb = NULL; - int size; - - size = backtrace(func, BACKTRACE_SIZE); - symb = backtrace_symbols(func, size); - - if (symb == NULL) - return; - - while (size > 0) { - rte_log(RTE_LOG_ERR, RTE_LOGTYPE_EAL, - "%d: [%s]\n", size, symb[size - 1]); - size --; - } - - free(symb); -#endif /* RTE_BACKTRACE */ -} - -/* not implemented in this environment */ -void rte_dump_registers(void) -{ - return; -} - -/* call abort(), it will generate a coredump if enabled */ -void __rte_panic(const char *funcname, const char *format, ...) -{ - va_list ap; - - rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, "PANIC in %s():\n", funcname); - va_start(ap, format); - rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap); - va_end(ap); - rte_dump_stack(); - rte_dump_registers(); - abort(); -} - -/* - * Like rte_panic this terminates the application. However, no traceback is - * provided and no core-dump is generated. - */ -void -rte_exit(int exit_code, const char *format, ...) 
-{ - va_list ap; - - if (exit_code != 0) - RTE_LOG(CRIT, EAL, "Error - exiting with code: %d\n" - " Cause: ", exit_code); - - va_start(ap, format); - rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap); - va_end(ap); - -#ifndef RTE_EAL_ALWAYS_PANIC_ON_ERROR - if (rte_eal_cleanup() != 0) - RTE_LOG(CRIT, EAL, - "EAL could not release all resources\n"); - exit(exit_code); -#else - rte_dump_stack(); - rte_dump_registers(); - abort(); -#endif -} diff --git a/lib/librte_eal/freebsd/eal/eal_dev.c b/lib/librte_eal/freebsd/eal/eal_dev.c deleted file mode 100644 index 8e06e70890..0000000000 --- a/lib/librte_eal/freebsd/eal/eal_dev.c +++ /dev/null @@ -1,35 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2018 Intel Corporation - */ - -#include -#include -#include - -int -rte_dev_event_monitor_start(void) -{ - RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n"); - return -1; -} - -int -rte_dev_event_monitor_stop(void) -{ - RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n"); - return -1; -} - -int -rte_dev_hotplug_handle_enable(void) -{ - RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n"); - return -1; -} - -int -rte_dev_hotplug_handle_disable(void) -{ - RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n"); - return -1; -} diff --git a/lib/librte_eal/freebsd/eal/eal_hugepage_info.c b/lib/librte_eal/freebsd/eal/eal_hugepage_info.c deleted file mode 100644 index 32012e1427..0000000000 --- a/lib/librte_eal/freebsd/eal/eal_hugepage_info.c +++ /dev/null @@ -1,156 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2014 Intel Corporation - */ -#include -#include -#include -#include - -#include -#include -#include "eal_hugepages.h" -#include "eal_internal_cfg.h" -#include "eal_filesystem.h" - -#define CONTIGMEM_DEV "/dev/contigmem" - -/* - * Uses mmap to create a shared memory area for storage of data - * Used in this file to store the hugepage file map on disk - */ -static void * -map_shared_memory(const char *filename, const size_t mem_size, int flags) -{ - void *retval; - int fd = open(filename, flags, 0600); - if (fd < 0) - return NULL; - if (ftruncate(fd, mem_size) < 0) { - close(fd); - return NULL; - } - retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - close(fd); - return retval; -} - -static void * -open_shared_memory(const char *filename, const size_t mem_size) -{ - return map_shared_memory(filename, mem_size, O_RDWR); -} - -static void * -create_shared_memory(const char *filename, const size_t mem_size) -{ - return map_shared_memory(filename, mem_size, O_RDWR | O_CREAT); -} - -/* - * No hugepage support on freebsd, but we dummy it, using contigmem driver - */ -int -eal_hugepage_info_init(void) -{ - size_t sysctl_size; - int num_buffers, fd, error; - int64_t buffer_size; - /* re-use the linux "internal config" structure for our memory data */ - struct hugepage_info *hpi = &internal_config.hugepage_info[0]; - struct hugepage_info *tmp_hpi; - unsigned int i; - - internal_config.num_hugepage_sizes = 1; - - sysctl_size = sizeof(num_buffers); - error = sysctlbyname("hw.contigmem.num_buffers", &num_buffers, - &sysctl_size, NULL, 0); - - if (error != 0) { - RTE_LOG(ERR, EAL, "could not read sysctl hw.contigmem.num_buffers\n"); - return -1; - } - - sysctl_size = sizeof(buffer_size); - error = sysctlbyname("hw.contigmem.buffer_size", &buffer_size, - &sysctl_size, NULL, 0); - - if (error != 0) { - RTE_LOG(ERR, EAL, "could not read sysctl hw.contigmem.buffer_size\n"); - return -1; - } - - fd = 
open(CONTIGMEM_DEV, O_RDWR); - if (fd < 0) { - RTE_LOG(ERR, EAL, "could not open "CONTIGMEM_DEV"\n"); - return -1; - } - - if (buffer_size >= 1<<30) - RTE_LOG(INFO, EAL, "Contigmem driver has %d buffers, each of size %dGB\n", - num_buffers, (int)(buffer_size>>30)); - else if (buffer_size >= 1<<20) - RTE_LOG(INFO, EAL, "Contigmem driver has %d buffers, each of size %dMB\n", - num_buffers, (int)(buffer_size>>20)); - else - RTE_LOG(INFO, EAL, "Contigmem driver has %d buffers, each of size %dKB\n", - num_buffers, (int)(buffer_size>>10)); - - strlcpy(hpi->hugedir, CONTIGMEM_DEV, sizeof(hpi->hugedir)); - hpi->hugepage_sz = buffer_size; - hpi->num_pages[0] = num_buffers; - hpi->lock_descriptor = fd; - - /* for no shared files mode, do not create shared memory config */ - if (internal_config.no_shconf) - return 0; - - tmp_hpi = create_shared_memory(eal_hugepage_info_path(), - sizeof(internal_config.hugepage_info)); - if (tmp_hpi == NULL ) { - RTE_LOG(ERR, EAL, "Failed to create shared memory!\n"); - return -1; - } - - memcpy(tmp_hpi, hpi, sizeof(internal_config.hugepage_info)); - - /* we've copied file descriptors along with everything else, but they - * will be invalid in secondary process, so overwrite them - */ - for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) { - struct hugepage_info *tmp = &tmp_hpi[i]; - tmp->lock_descriptor = -1; - } - - if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) { - RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n"); - return -1; - } - - return 0; -} - -/* copy stuff from shared info into internal config */ -int -eal_hugepage_info_read(void) -{ - struct hugepage_info *hpi = &internal_config.hugepage_info[0]; - struct hugepage_info *tmp_hpi; - - internal_config.num_hugepage_sizes = 1; - - tmp_hpi = open_shared_memory(eal_hugepage_info_path(), - sizeof(internal_config.hugepage_info)); - if (tmp_hpi == NULL) { - RTE_LOG(ERR, EAL, "Failed to open shared memory!\n"); - return -1; - } - - memcpy(hpi, tmp_hpi, sizeof(internal_config.hugepage_info)); - - if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) { - RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n"); - return -1; - } - return 0; -} diff --git a/lib/librte_eal/freebsd/eal/eal_interrupts.c b/lib/librte_eal/freebsd/eal/eal_interrupts.c deleted file mode 100644 index 00991f26a9..0000000000 --- a/lib/librte_eal/freebsd/eal/eal_interrupts.c +++ /dev/null @@ -1,685 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2018 Intel Corporation - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "eal_private.h" -#include "eal_alarm_private.h" - -#define MAX_INTR_EVENTS 16 - -/** - * union buffer for reading on different devices - */ -union rte_intr_read_buffer { - char charbuf[16]; /* for others */ -}; - -TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback); -TAILQ_HEAD(rte_intr_source_list, rte_intr_source); - -struct rte_intr_callback { - TAILQ_ENTRY(rte_intr_callback) next; - rte_intr_callback_fn cb_fn; /**< callback address */ - void *cb_arg; /**< parameter for callback */ - uint8_t pending_delete; /**< delete after callback is called */ - rte_intr_unregister_callback_fn ucb_fn; /**< fn to call before cb is deleted */ -}; - -struct rte_intr_source { - TAILQ_ENTRY(rte_intr_source) next; - struct rte_intr_handle intr_handle; /**< interrupt handle */ - struct rte_intr_cb_list callbacks; /**< user callbacks */ - uint32_t active; -}; - -/* global spinlock for interrupt data operation */ -static 
rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER; - -/* interrupt sources list */ -static struct rte_intr_source_list intr_sources; - -/* interrupt handling thread */ -static pthread_t intr_thread; - -static volatile int kq = -1; - -static int -intr_source_to_kevent(const struct rte_intr_handle *ih, struct kevent *ke) -{ - /* alarm callbacks are special case */ - if (ih->type == RTE_INTR_HANDLE_ALARM) { - uint64_t timeout_ns; - - /* get soonest alarm timeout */ - if (eal_alarm_get_timeout_ns(&timeout_ns) < 0) - return -1; - - ke->filter = EVFILT_TIMER; - /* timers are one shot */ - ke->flags |= EV_ONESHOT; - ke->fflags = NOTE_NSECONDS; - ke->data = timeout_ns; - } else { - ke->filter = EVFILT_READ; - } - ke->ident = ih->fd; - - return 0; -} - -int -rte_intr_callback_register(const struct rte_intr_handle *intr_handle, - rte_intr_callback_fn cb, void *cb_arg) -{ - struct rte_intr_callback *callback; - struct rte_intr_source *src; - int ret, add_event = 0; - - /* first do parameter checking */ - if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) { - RTE_LOG(ERR, EAL, - "Registering with invalid input parameter\n"); - return -EINVAL; - } - if (kq < 0) { - RTE_LOG(ERR, EAL, "Kqueue is not active: %d\n", kq); - return -ENODEV; - } - - rte_spinlock_lock(&intr_lock); - - /* find the source for this intr_handle */ - TAILQ_FOREACH(src, &intr_sources, next) { - if (src->intr_handle.fd == intr_handle->fd) - break; - } - - /* if this is an alarm interrupt and it already has a callback, - * then we don't want to create a new callback because the only - * thing on the list should be eal_alarm_callback() and we may - * be called just to reset the timer. - */ - if (src != NULL && src->intr_handle.type == RTE_INTR_HANDLE_ALARM && - !TAILQ_EMPTY(&src->callbacks)) { - callback = NULL; - } else { - /* allocate a new interrupt callback entity */ - callback = calloc(1, sizeof(*callback)); - if (callback == NULL) { - RTE_LOG(ERR, EAL, "Can not allocate memory\n"); - ret = -ENOMEM; - goto fail; - } - callback->cb_fn = cb; - callback->cb_arg = cb_arg; - callback->pending_delete = 0; - callback->ucb_fn = NULL; - - if (src == NULL) { - src = calloc(1, sizeof(*src)); - if (src == NULL) { - RTE_LOG(ERR, EAL, "Can not allocate memory\n"); - ret = -ENOMEM; - goto fail; - } else { - src->intr_handle = *intr_handle; - TAILQ_INIT(&src->callbacks); - TAILQ_INSERT_TAIL(&intr_sources, src, next); - } - } - - /* we had no interrupts for this */ - if (TAILQ_EMPTY(&src->callbacks)) - add_event = 1; - - TAILQ_INSERT_TAIL(&(src->callbacks), callback, next); - } - - /* add events to the queue. timer events are special as we need to - * re-set the timer. - */ - if (add_event || src->intr_handle.type == RTE_INTR_HANDLE_ALARM) { - struct kevent ke; - - memset(&ke, 0, sizeof(ke)); - ke.flags = EV_ADD; /* mark for addition to the queue */ - - if (intr_source_to_kevent(intr_handle, &ke) < 0) { - RTE_LOG(ERR, EAL, "Cannot convert interrupt handle to kevent\n"); - ret = -ENODEV; - goto fail; - } - - /** - * add the intr file descriptor into wait list. - */ - if (kevent(kq, &ke, 1, NULL, 0, NULL) < 0) { - /* currently, nic_uio does not support interrupts, so - * this error will always be triggered and output to the - * user. so, don't output it unless debug log level set. 
- */ - if (errno == ENODEV) - RTE_LOG(DEBUG, EAL, "Interrupt handle %d not supported\n", - src->intr_handle.fd); - else - RTE_LOG(ERR, EAL, "Error adding fd %d " - "kevent, %s\n", - src->intr_handle.fd, - strerror(errno)); - ret = -errno; - goto fail; - } - } - rte_spinlock_unlock(&intr_lock); - - return 0; -fail: - /* clean up */ - if (src != NULL) { - if (callback != NULL) - TAILQ_REMOVE(&(src->callbacks), callback, next); - if (TAILQ_EMPTY(&(src->callbacks))) { - TAILQ_REMOVE(&intr_sources, src, next); - free(src); - } - } - free(callback); - rte_spinlock_unlock(&intr_lock); - return ret; -} - -int -rte_intr_callback_unregister_pending(const struct rte_intr_handle *intr_handle, - rte_intr_callback_fn cb_fn, void *cb_arg, - rte_intr_unregister_callback_fn ucb_fn) -{ - int ret; - struct rte_intr_source *src; - struct rte_intr_callback *cb, *next; - - /* do parameter checking first */ - if (intr_handle == NULL || intr_handle->fd < 0) { - RTE_LOG(ERR, EAL, - "Unregistering with invalid input parameter\n"); - return -EINVAL; - } - - if (kq < 0) { - RTE_LOG(ERR, EAL, "Kqueue is not active\n"); - return -ENODEV; - } - - rte_spinlock_lock(&intr_lock); - - /* check if the insterrupt source for the fd is existent */ - TAILQ_FOREACH(src, &intr_sources, next) - if (src->intr_handle.fd == intr_handle->fd) - break; - - /* No interrupt source registered for the fd */ - if (src == NULL) { - ret = -ENOENT; - - /* only usable if the source is active */ - } else if (src->active == 0) { - ret = -EAGAIN; - - } else { - ret = 0; - - /* walk through the callbacks and mark all that match. */ - for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) { - next = TAILQ_NEXT(cb, next); - if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 || - cb->cb_arg == cb_arg)) { - cb->pending_delete = 1; - cb->ucb_fn = ucb_fn; - ret++; - } - } - } - - rte_spinlock_unlock(&intr_lock); - - return ret; -} - -int -rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle, - rte_intr_callback_fn cb_fn, void *cb_arg) -{ - int ret; - struct rte_intr_source *src; - struct rte_intr_callback *cb, *next; - - /* do parameter checking first */ - if (intr_handle == NULL || intr_handle->fd < 0) { - RTE_LOG(ERR, EAL, - "Unregistering with invalid input parameter\n"); - return -EINVAL; - } - if (kq < 0) { - RTE_LOG(ERR, EAL, "Kqueue is not active\n"); - return -ENODEV; - } - - rte_spinlock_lock(&intr_lock); - - /* check if the insterrupt source for the fd is existent */ - TAILQ_FOREACH(src, &intr_sources, next) - if (src->intr_handle.fd == intr_handle->fd) - break; - - /* No interrupt source registered for the fd */ - if (src == NULL) { - ret = -ENOENT; - - /* interrupt source has some active callbacks right now. */ - } else if (src->active != 0) { - ret = -EAGAIN; - - /* ok to remove. */ - } else { - struct kevent ke; - - ret = 0; - - /* remove it from the kqueue */ - memset(&ke, 0, sizeof(ke)); - ke.flags = EV_DELETE; /* mark for deletion from the queue */ - - if (intr_source_to_kevent(intr_handle, &ke) < 0) { - RTE_LOG(ERR, EAL, "Cannot convert to kevent\n"); - ret = -ENODEV; - goto out; - } - - /** - * remove intr file descriptor from wait list. - */ - if (kevent(kq, &ke, 1, NULL, 0, NULL) < 0) { - RTE_LOG(ERR, EAL, "Error removing fd %d kevent, %s\n", - src->intr_handle.fd, strerror(errno)); - /* removing non-existent even is an expected condition - * in some circumstances (e.g. oneshot events). - */ - } - - /*walk through the callbacks and remove all that match. 
*/ - for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) { - next = TAILQ_NEXT(cb, next); - if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 || - cb->cb_arg == cb_arg)) { - TAILQ_REMOVE(&src->callbacks, cb, next); - free(cb); - ret++; - } - } - - /* all callbacks for that source are removed. */ - if (TAILQ_EMPTY(&src->callbacks)) { - TAILQ_REMOVE(&intr_sources, src, next); - free(src); - } - } -out: - rte_spinlock_unlock(&intr_lock); - - return ret; -} - -int -rte_intr_enable(const struct rte_intr_handle *intr_handle) -{ - if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV) - return 0; - - if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) - return -1; - - switch (intr_handle->type) { - /* not used at this moment */ - case RTE_INTR_HANDLE_ALARM: - return -1; - /* not used at this moment */ - case RTE_INTR_HANDLE_DEV_EVENT: - return -1; - /* unknown handle type */ - default: - RTE_LOG(ERR, EAL, - "Unknown handle type of fd %d\n", - intr_handle->fd); - return -1; - } - - return 0; -} - -int -rte_intr_disable(const struct rte_intr_handle *intr_handle) -{ - if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV) - return 0; - - if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) - return -1; - - switch (intr_handle->type) { - /* not used at this moment */ - case RTE_INTR_HANDLE_ALARM: - return -1; - /* not used at this moment */ - case RTE_INTR_HANDLE_DEV_EVENT: - return -1; - /* unknown handle type */ - default: - RTE_LOG(ERR, EAL, - "Unknown handle type of fd %d\n", - intr_handle->fd); - return -1; - } - - return 0; -} - -int -rte_intr_ack(const struct rte_intr_handle *intr_handle) -{ - if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV) - return 0; - - return -1; -} - -static void -eal_intr_process_interrupts(struct kevent *events, int nfds) -{ - struct rte_intr_callback active_cb; - union rte_intr_read_buffer buf; - struct rte_intr_callback *cb, *next; - struct rte_intr_source *src; - bool call = false; - int n, bytes_read; - struct kevent ke; - - for (n = 0; n < nfds; n++) { - int event_fd = events[n].ident; - - rte_spinlock_lock(&intr_lock); - TAILQ_FOREACH(src, &intr_sources, next) - if (src->intr_handle.fd == event_fd) - break; - if (src == NULL) { - rte_spinlock_unlock(&intr_lock); - continue; - } - - /* mark this interrupt source as active and release the lock. */ - src->active = 1; - rte_spinlock_unlock(&intr_lock); - - /* set the length to be read dor different handle type */ - switch (src->intr_handle.type) { - case RTE_INTR_HANDLE_ALARM: - bytes_read = 0; - call = true; - break; - case RTE_INTR_HANDLE_VDEV: - case RTE_INTR_HANDLE_EXT: - bytes_read = 0; - call = true; - break; - case RTE_INTR_HANDLE_DEV_EVENT: - bytes_read = 0; - call = true; - break; - default: - bytes_read = 1; - break; - } - - if (bytes_read > 0) { - /** - * read out to clear the ready-to-be-read flag - * for epoll_wait. - */ - bytes_read = read(event_fd, &buf, bytes_read); - if (bytes_read < 0) { - if (errno == EINTR || errno == EWOULDBLOCK) - continue; - - RTE_LOG(ERR, EAL, "Error reading from file " - "descriptor %d: %s\n", - event_fd, - strerror(errno)); - } else if (bytes_read == 0) - RTE_LOG(ERR, EAL, "Read nothing from file " - "descriptor %d\n", event_fd); - else - call = true; - } - - /* grab a lock, again to call callbacks and update status. */ - rte_spinlock_lock(&intr_lock); - - if (call) { - /* Finally, call all callbacks. */ - TAILQ_FOREACH(cb, &src->callbacks, next) { - - /* make a copy and unlock. 
*/ - active_cb = *cb; - rte_spinlock_unlock(&intr_lock); - - /* call the actual callback */ - active_cb.cb_fn(active_cb.cb_arg); - - /*get the lock back. */ - rte_spinlock_lock(&intr_lock); - } - } - - /* we done with that interrupt source, release it. */ - src->active = 0; - - /* check if any callback are supposed to be removed */ - for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) { - next = TAILQ_NEXT(cb, next); - if (cb->pending_delete) { - /* remove it from the kqueue */ - memset(&ke, 0, sizeof(ke)); - /* mark for deletion from the queue */ - ke.flags = EV_DELETE; - - if (intr_source_to_kevent(&src->intr_handle, &ke) < 0) { - RTE_LOG(ERR, EAL, "Cannot convert to kevent\n"); - rte_spinlock_unlock(&intr_lock); - return; - } - - /** - * remove intr file descriptor from wait list. - */ - if (kevent(kq, &ke, 1, NULL, 0, NULL) < 0) { - RTE_LOG(ERR, EAL, "Error removing fd %d kevent, " - "%s\n", src->intr_handle.fd, - strerror(errno)); - /* removing non-existent even is an expected - * condition in some circumstances - * (e.g. oneshot events). - */ - } - - TAILQ_REMOVE(&src->callbacks, cb, next); - if (cb->ucb_fn) - cb->ucb_fn(&src->intr_handle, cb->cb_arg); - free(cb); - } - } - - /* all callbacks for that source are removed. */ - if (TAILQ_EMPTY(&src->callbacks)) { - TAILQ_REMOVE(&intr_sources, src, next); - free(src); - } - - rte_spinlock_unlock(&intr_lock); - } -} - -static void * -eal_intr_thread_main(void *arg __rte_unused) -{ - struct kevent events[MAX_INTR_EVENTS]; - int nfds; - - /* host thread, never break out */ - for (;;) { - /* do not change anything, just wait */ - nfds = kevent(kq, NULL, 0, events, MAX_INTR_EVENTS, NULL); - - /* kevent fail */ - if (nfds < 0) { - if (errno == EINTR) - continue; - RTE_LOG(ERR, EAL, - "kevent returns with fail\n"); - break; - } - /* kevent timeout, will never happen here */ - else if (nfds == 0) - continue; - - /* kevent has at least one fd ready to read */ - eal_intr_process_interrupts(events, nfds); - } - close(kq); - kq = -1; - return NULL; -} - -int -rte_eal_intr_init(void) -{ - int ret = 0; - - /* init the global interrupt source head */ - TAILQ_INIT(&intr_sources); - - kq = kqueue(); - if (kq < 0) { - RTE_LOG(ERR, EAL, "Cannot create kqueue instance\n"); - return -1; - } - - /* create the host thread to wait/handle the interrupt */ - ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL, - eal_intr_thread_main, NULL); - if (ret != 0) { - rte_errno = -ret; - RTE_LOG(ERR, EAL, - "Failed to create thread for interrupt handling\n"); - } - - return ret; -} - -int -rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, - int epfd, int op, unsigned int vec, void *data) -{ - RTE_SET_USED(intr_handle); - RTE_SET_USED(epfd); - RTE_SET_USED(op); - RTE_SET_USED(vec); - RTE_SET_USED(data); - - return -ENOTSUP; -} - -int -rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd) -{ - RTE_SET_USED(intr_handle); - RTE_SET_USED(nb_efd); - - return 0; -} - -void -rte_intr_efd_disable(struct rte_intr_handle *intr_handle) -{ - RTE_SET_USED(intr_handle); -} - -int -rte_intr_dp_is_en(struct rte_intr_handle *intr_handle) -{ - RTE_SET_USED(intr_handle); - return 0; -} - -int -rte_intr_allow_others(struct rte_intr_handle *intr_handle) -{ - RTE_SET_USED(intr_handle); - return 1; -} - -int -rte_intr_cap_multiple(struct rte_intr_handle *intr_handle) -{ - RTE_SET_USED(intr_handle); - return 0; -} - -int -rte_epoll_wait(int epfd, struct rte_epoll_event *events, - int maxevents, int timeout) -{ - RTE_SET_USED(epfd); - 
RTE_SET_USED(events); - RTE_SET_USED(maxevents); - RTE_SET_USED(timeout); - - return -ENOTSUP; -} - -int -rte_epoll_ctl(int epfd, int op, int fd, struct rte_epoll_event *event) -{ - RTE_SET_USED(epfd); - RTE_SET_USED(op); - RTE_SET_USED(fd); - RTE_SET_USED(event); - - return -ENOTSUP; -} - -int -rte_intr_tls_epfd(void) -{ - return -ENOTSUP; -} - -void -rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle) -{ - RTE_SET_USED(intr_handle); -} - -int rte_thread_is_intr(void) -{ - return pthread_equal(intr_thread, pthread_self()); -} diff --git a/lib/librte_eal/freebsd/eal/eal_lcore.c b/lib/librte_eal/freebsd/eal/eal_lcore.c deleted file mode 100644 index d9ef4bc9c5..0000000000 --- a/lib/librte_eal/freebsd/eal/eal_lcore.c +++ /dev/null @@ -1,52 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2014 Intel Corporation - */ - -#include -#include - -#include -#include -#include -#include -#include - -#include "eal_private.h" -#include "eal_thread.h" - -/* No topology information available on FreeBSD including NUMA info */ -unsigned -eal_cpu_core_id(__rte_unused unsigned lcore_id) -{ - return 0; -} - -static int -eal_get_ncpus(void) -{ - static int ncpu = -1; - int mib[2] = {CTL_HW, HW_NCPU}; - size_t len = sizeof(ncpu); - - if (ncpu < 0) { - sysctl(mib, 2, &ncpu, &len, NULL, 0); - RTE_LOG(INFO, EAL, "Sysctl reports %d cpus\n", ncpu); - } - return ncpu; -} - -unsigned -eal_cpu_socket_id(__rte_unused unsigned cpu_id) -{ - return 0; -} - -/* Check if a cpu is present by the presence of the - * cpu information for it. - */ -int -eal_cpu_detected(unsigned lcore_id) -{ - const unsigned ncpus = eal_get_ncpus(); - return lcore_id < ncpus; -} diff --git a/lib/librte_eal/freebsd/eal/eal_memalloc.c b/lib/librte_eal/freebsd/eal/eal_memalloc.c deleted file mode 100644 index 6893448db7..0000000000 --- a/lib/librte_eal/freebsd/eal/eal_memalloc.c +++ /dev/null @@ -1,81 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2017-2018 Intel Corporation - */ - -#include - -#include -#include -#include - -#include "eal_memalloc.h" - -int -eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms __rte_unused, - int __rte_unused n_segs, size_t __rte_unused page_sz, - int __rte_unused socket, bool __rte_unused exact) -{ - RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n"); - return -1; -} - -struct rte_memseg * -eal_memalloc_alloc_seg(size_t __rte_unused page_sz, int __rte_unused socket) -{ - RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n"); - return NULL; -} - -int -eal_memalloc_free_seg(struct rte_memseg *ms __rte_unused) -{ - RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n"); - return -1; -} - -int -eal_memalloc_free_seg_bulk(struct rte_memseg **ms __rte_unused, - int n_segs __rte_unused) -{ - RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n"); - return -1; -} - -int -eal_memalloc_sync_with_primary(void) -{ - RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n"); - return -1; -} - -int -eal_memalloc_get_seg_fd(int list_idx __rte_unused, int seg_idx __rte_unused) -{ - return -ENOTSUP; -} - -int -eal_memalloc_set_seg_fd(int list_idx __rte_unused, int seg_idx __rte_unused, - int fd __rte_unused) -{ - return -ENOTSUP; -} - -int -eal_memalloc_set_seg_list_fd(int list_idx __rte_unused, int fd __rte_unused) -{ - return -ENOTSUP; -} - -int -eal_memalloc_get_seg_fd_offset(int list_idx __rte_unused, - int seg_idx __rte_unused, size_t *offset __rte_unused) -{ - return -ENOTSUP; -} - -int -eal_memalloc_init(void) -{ - return 0; -} diff 
--git a/lib/librte_eal/freebsd/eal/eal_memory.c b/lib/librte_eal/freebsd/eal/eal_memory.c deleted file mode 100644 index a97d8f0f0c..0000000000 --- a/lib/librte_eal/freebsd/eal/eal_memory.c +++ /dev/null @@ -1,536 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2014 Intel Corporation - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "eal_private.h" -#include "eal_internal_cfg.h" -#include "eal_filesystem.h" -#include "eal_memcfg.h" -#include "eal_options.h" - -#define EAL_PAGE_SIZE (sysconf(_SC_PAGESIZE)) - -uint64_t eal_get_baseaddr(void) -{ - /* - * FreeBSD may allocate something in the space we will be mapping things - * before we get a chance to do that, so use a base address that's far - * away from where malloc() et al usually map things. - */ - return 0x1000000000ULL; -} - -/* - * Get physical address of any mapped virtual address in the current process. - */ -phys_addr_t -rte_mem_virt2phy(const void *virtaddr) -{ - /* XXX not implemented. This function is only used by - * rte_mempool_virt2iova() when hugepages are disabled. */ - (void)virtaddr; - return RTE_BAD_IOVA; -} -rte_iova_t -rte_mem_virt2iova(const void *virtaddr) -{ - return rte_mem_virt2phy(virtaddr); -} - -int -rte_eal_hugepage_init(void) -{ - struct rte_mem_config *mcfg; - uint64_t total_mem = 0; - void *addr; - unsigned int i, j, seg_idx = 0; - - /* get pointer to global configuration */ - mcfg = rte_eal_get_configuration()->mem_config; - - /* for debug purposes, hugetlbfs can be disabled */ - if (internal_config.no_hugetlbfs) { - struct rte_memseg_list *msl; - struct rte_fbarray *arr; - struct rte_memseg *ms; - uint64_t page_sz; - int n_segs, cur_seg; - - /* create a memseg list */ - msl = &mcfg->memsegs[0]; - - page_sz = RTE_PGSIZE_4K; - n_segs = internal_config.memory / page_sz; - - if (rte_fbarray_init(&msl->memseg_arr, "nohugemem", n_segs, - sizeof(struct rte_memseg))) { - RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n"); - return -1; - } - - addr = mmap(NULL, internal_config.memory, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (addr == MAP_FAILED) { - RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__, - strerror(errno)); - return -1; - } - msl->base_va = addr; - msl->page_sz = page_sz; - msl->len = internal_config.memory; - msl->socket_id = 0; - msl->heap = 1; - - /* populate memsegs. 
each memseg is 1 page long */ - for (cur_seg = 0; cur_seg < n_segs; cur_seg++) { - arr = &msl->memseg_arr; - - ms = rte_fbarray_get(arr, cur_seg); - if (rte_eal_iova_mode() == RTE_IOVA_VA) - ms->iova = (uintptr_t)addr; - else - ms->iova = RTE_BAD_IOVA; - ms->addr = addr; - ms->hugepage_sz = page_sz; - ms->len = page_sz; - ms->socket_id = 0; - - rte_fbarray_set_used(arr, cur_seg); - - addr = RTE_PTR_ADD(addr, page_sz); - } - return 0; - } - - /* map all hugepages and sort them */ - for (i = 0; i < internal_config.num_hugepage_sizes; i ++){ - struct hugepage_info *hpi; - rte_iova_t prev_end = 0; - int prev_ms_idx = -1; - uint64_t page_sz, mem_needed; - unsigned int n_pages, max_pages; - - hpi = &internal_config.hugepage_info[i]; - page_sz = hpi->hugepage_sz; - max_pages = hpi->num_pages[0]; - mem_needed = RTE_ALIGN_CEIL(internal_config.memory - total_mem, - page_sz); - - n_pages = RTE_MIN(mem_needed / page_sz, max_pages); - - for (j = 0; j < n_pages; j++) { - struct rte_memseg_list *msl; - struct rte_fbarray *arr; - struct rte_memseg *seg; - int msl_idx, ms_idx; - rte_iova_t physaddr; - int error; - size_t sysctl_size = sizeof(physaddr); - char physaddr_str[64]; - bool is_adjacent; - - /* first, check if this segment is IOVA-adjacent to - * the previous one. - */ - snprintf(physaddr_str, sizeof(physaddr_str), - "hw.contigmem.physaddr.%d", j); - error = sysctlbyname(physaddr_str, &physaddr, - &sysctl_size, NULL, 0); - if (error < 0) { - RTE_LOG(ERR, EAL, "Failed to get physical addr for buffer %u " - "from %s\n", j, hpi->hugedir); - return -1; - } - - is_adjacent = prev_end != 0 && physaddr == prev_end; - prev_end = physaddr + hpi->hugepage_sz; - - for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; - msl_idx++) { - bool empty, need_hole; - msl = &mcfg->memsegs[msl_idx]; - arr = &msl->memseg_arr; - - if (msl->page_sz != page_sz) - continue; - - empty = arr->count == 0; - - /* we need a hole if this isn't an empty memseg - * list, and if previous segment was not - * adjacent to current one. - */ - need_hole = !empty && !is_adjacent; - - /* we need 1, plus hole if not adjacent */ - ms_idx = rte_fbarray_find_next_n_free(arr, - 0, 1 + (need_hole ? 1 : 0)); - - /* memseg list is full? */ - if (ms_idx < 0) - continue; - - if (need_hole && prev_ms_idx == ms_idx - 1) - ms_idx++; - prev_ms_idx = ms_idx; - - break; - } - if (msl_idx == RTE_MAX_MEMSEG_LISTS) { - RTE_LOG(ERR, EAL, "Could not find space for memseg. Please increase %s and/or %s in configuration.\n", - RTE_STR(CONFIG_RTE_MAX_MEMSEG_PER_TYPE), - RTE_STR(CONFIG_RTE_MAX_MEM_PER_TYPE)); - return -1; - } - arr = &msl->memseg_arr; - seg = rte_fbarray_get(arr, ms_idx); - - addr = RTE_PTR_ADD(msl->base_va, - (size_t)msl->page_sz * ms_idx); - - /* address is already mapped in memseg list, so using - * MAP_FIXED here is safe. 
- */ - addr = mmap(addr, page_sz, PROT_READ|PROT_WRITE, - MAP_SHARED | MAP_FIXED, - hpi->lock_descriptor, - j * EAL_PAGE_SIZE); - if (addr == MAP_FAILED) { - RTE_LOG(ERR, EAL, "Failed to mmap buffer %u from %s\n", - j, hpi->hugedir); - return -1; - } - - seg->addr = addr; - seg->iova = physaddr; - seg->hugepage_sz = page_sz; - seg->len = page_sz; - seg->nchannel = mcfg->nchannel; - seg->nrank = mcfg->nrank; - seg->socket_id = 0; - - rte_fbarray_set_used(arr, ms_idx); - - RTE_LOG(INFO, EAL, "Mapped memory segment %u @ %p: physaddr:0x%" - PRIx64", len %zu\n", - seg_idx++, addr, physaddr, page_sz); - - total_mem += seg->len; - } - if (total_mem >= internal_config.memory) - break; - } - if (total_mem < internal_config.memory) { - RTE_LOG(ERR, EAL, "Couldn't reserve requested memory, " - "requested: %" PRIu64 "M " - "available: %" PRIu64 "M\n", - internal_config.memory >> 20, total_mem >> 20); - return -1; - } - return 0; -} - -struct attach_walk_args { - int fd_hugepage; - int seg_idx; -}; -static int -attach_segment(const struct rte_memseg_list *msl, const struct rte_memseg *ms, - void *arg) -{ - struct attach_walk_args *wa = arg; - void *addr; - - if (msl->external) - return 0; - - addr = mmap(ms->addr, ms->len, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, wa->fd_hugepage, - wa->seg_idx * EAL_PAGE_SIZE); - if (addr == MAP_FAILED || addr != ms->addr) - return -1; - wa->seg_idx++; - - return 0; -} - -int -rte_eal_hugepage_attach(void) -{ - const struct hugepage_info *hpi; - int fd_hugepage = -1; - unsigned int i; - - hpi = &internal_config.hugepage_info[0]; - - for (i = 0; i < internal_config.num_hugepage_sizes; i++) { - const struct hugepage_info *cur_hpi = &hpi[i]; - struct attach_walk_args wa; - - memset(&wa, 0, sizeof(wa)); - - /* Obtain a file descriptor for contiguous memory */ - fd_hugepage = open(cur_hpi->hugedir, O_RDWR); - if (fd_hugepage < 0) { - RTE_LOG(ERR, EAL, "Could not open %s\n", - cur_hpi->hugedir); - goto error; - } - wa.fd_hugepage = fd_hugepage; - wa.seg_idx = 0; - - /* Map the contiguous memory into each memory segment */ - if (rte_memseg_walk(attach_segment, &wa) < 0) { - RTE_LOG(ERR, EAL, "Failed to mmap buffer %u from %s\n", - wa.seg_idx, cur_hpi->hugedir); - goto error; - } - - close(fd_hugepage); - fd_hugepage = -1; - } - - /* hugepage_info is no longer required */ - return 0; - -error: - if (fd_hugepage >= 0) - close(fd_hugepage); - return -1; -} - -int -rte_eal_using_phys_addrs(void) -{ - return 0; -} - -static uint64_t -get_mem_amount(uint64_t page_sz, uint64_t max_mem) -{ - uint64_t area_sz, max_pages; - - /* limit to RTE_MAX_MEMSEG_PER_LIST pages or RTE_MAX_MEM_MB_PER_LIST */ - max_pages = RTE_MAX_MEMSEG_PER_LIST; - max_mem = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20, max_mem); - - area_sz = RTE_MIN(page_sz * max_pages, max_mem); - - /* make sure the list isn't smaller than the page size */ - area_sz = RTE_MAX(area_sz, page_sz); - - return RTE_ALIGN(area_sz, page_sz); -} - -#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i" -static int -alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz, - int n_segs, int socket_id, int type_msl_idx) -{ - char name[RTE_FBARRAY_NAME_LEN]; - - snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id, - type_msl_idx); - if (rte_fbarray_init(&msl->memseg_arr, name, n_segs, - sizeof(struct rte_memseg))) { - RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n", - rte_strerror(rte_errno)); - return -1; - } - - msl->page_sz = page_sz; - msl->socket_id = socket_id; - msl->base_va = NULL; - - 
RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n", - (size_t)page_sz >> 10, socket_id); - - return 0; -} - -static int -alloc_va_space(struct rte_memseg_list *msl) -{ - uint64_t page_sz; - size_t mem_sz; - void *addr; - int flags = 0; - -#ifdef RTE_ARCH_PPC_64 - flags |= MAP_HUGETLB; -#endif - - page_sz = msl->page_sz; - mem_sz = page_sz * msl->memseg_arr.len; - - addr = eal_get_virtual_area(msl->base_va, &mem_sz, page_sz, 0, flags); - if (addr == NULL) { - if (rte_errno == EADDRNOTAVAIL) - RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - " - "please use '--" OPT_BASE_VIRTADDR "' option\n", - (unsigned long long)mem_sz, msl->base_va); - else - RTE_LOG(ERR, EAL, "Cannot reserve memory\n"); - return -1; - } - msl->base_va = addr; - msl->len = mem_sz; - - return 0; -} - - -static int -memseg_primary_init(void) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - int hpi_idx, msl_idx = 0; - struct rte_memseg_list *msl; - uint64_t max_mem, total_mem; - - /* no-huge does not need this at all */ - if (internal_config.no_hugetlbfs) - return 0; - - /* FreeBSD has an issue where core dump will dump the entire memory - * contents, including anonymous zero-page memory. Therefore, while we - * will be limiting total amount of memory to RTE_MAX_MEM_MB, we will - * also be further limiting total memory amount to whatever memory is - * available to us through contigmem driver (plus spacing blocks). - * - * so, at each stage, we will be checking how much memory we are - * preallocating, and adjust all the values accordingly. - */ - - max_mem = (uint64_t)RTE_MAX_MEM_MB << 20; - total_mem = 0; - - /* create memseg lists */ - for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes; - hpi_idx++) { - uint64_t max_type_mem, total_type_mem = 0; - uint64_t avail_mem; - int type_msl_idx, max_segs, avail_segs, total_segs = 0; - struct hugepage_info *hpi; - uint64_t hugepage_sz; - - hpi = &internal_config.hugepage_info[hpi_idx]; - hugepage_sz = hpi->hugepage_sz; - - /* no NUMA support on FreeBSD */ - - /* check if we've already exceeded total memory amount */ - if (total_mem >= max_mem) - break; - - /* first, calculate theoretical limits according to config */ - max_type_mem = RTE_MIN(max_mem - total_mem, - (uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20); - max_segs = RTE_MAX_MEMSEG_PER_TYPE; - - /* now, limit all of that to whatever will actually be - * available to us, because without dynamic allocation support, - * all of that extra memory will be sitting there being useless - * and slowing down core dumps in case of a crash. - * - * we need (N*2)-1 segments because we cannot guarantee that - * each segment will be IOVA-contiguous with the previous one, - * so we will allocate more and put spaces inbetween segments - * that are non-contiguous. 
- */ - avail_segs = (hpi->num_pages[0] * 2) - 1; - avail_mem = avail_segs * hugepage_sz; - - max_type_mem = RTE_MIN(avail_mem, max_type_mem); - max_segs = RTE_MIN(avail_segs, max_segs); - - type_msl_idx = 0; - while (total_type_mem < max_type_mem && - total_segs < max_segs) { - uint64_t cur_max_mem, cur_mem; - unsigned int n_segs; - - if (msl_idx >= RTE_MAX_MEMSEG_LISTS) { - RTE_LOG(ERR, EAL, - "No more space in memseg lists, please increase %s\n", - RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); - return -1; - } - - msl = &mcfg->memsegs[msl_idx++]; - - cur_max_mem = max_type_mem - total_type_mem; - - cur_mem = get_mem_amount(hugepage_sz, - cur_max_mem); - n_segs = cur_mem / hugepage_sz; - - if (alloc_memseg_list(msl, hugepage_sz, n_segs, - 0, type_msl_idx)) - return -1; - - total_segs += msl->memseg_arr.len; - total_type_mem = total_segs * hugepage_sz; - type_msl_idx++; - - if (alloc_va_space(msl)) { - RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n"); - return -1; - } - } - total_mem += total_type_mem; - } - return 0; -} - -static int -memseg_secondary_init(void) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - int msl_idx = 0; - struct rte_memseg_list *msl; - - for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) { - - msl = &mcfg->memsegs[msl_idx]; - - /* skip empty memseg lists */ - if (msl->memseg_arr.len == 0) - continue; - - if (rte_fbarray_attach(&msl->memseg_arr)) { - RTE_LOG(ERR, EAL, "Cannot attach to primary process memseg lists\n"); - return -1; - } - - /* preallocate VA space */ - if (alloc_va_space(msl)) { - RTE_LOG(ERR, EAL, "Cannot preallocate VA space for hugepage memory\n"); - return -1; - } - } - - return 0; -} - -int -rte_eal_memseg_init(void) -{ - return rte_eal_process_type() == RTE_PROC_PRIMARY ? - memseg_primary_init() : - memseg_secondary_init(); -} diff --git a/lib/librte_eal/freebsd/eal/eal_thread.c b/lib/librte_eal/freebsd/eal/eal_thread.c deleted file mode 100644 index 309b587266..0000000000 --- a/lib/librte_eal/freebsd/eal/eal_thread.c +++ /dev/null @@ -1,177 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2014 Intel Corporation - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "eal_private.h" -#include "eal_thread.h" - -RTE_DEFINE_PER_LCORE(unsigned, _lcore_id) = LCORE_ID_ANY; -RTE_DEFINE_PER_LCORE(unsigned, _socket_id) = (unsigned)SOCKET_ID_ANY; -RTE_DEFINE_PER_LCORE(rte_cpuset_t, _cpuset); - -/* - * Send a message to a slave lcore identified by slave_id to call a - * function f with argument arg. Once the execution is done, the - * remote lcore switch in FINISHED state. 
- */ -int -rte_eal_remote_launch(int (*f)(void *), void *arg, unsigned slave_id) -{ - int n; - char c = 0; - int m2s = lcore_config[slave_id].pipe_master2slave[1]; - int s2m = lcore_config[slave_id].pipe_slave2master[0]; - - if (lcore_config[slave_id].state != WAIT) - return -EBUSY; - - lcore_config[slave_id].f = f; - lcore_config[slave_id].arg = arg; - - /* send message */ - n = 0; - while (n == 0 || (n < 0 && errno == EINTR)) - n = write(m2s, &c, 1); - if (n < 0) - rte_panic("cannot write on configuration pipe\n"); - - /* wait ack */ - do { - n = read(s2m, &c, 1); - } while (n < 0 && errno == EINTR); - - if (n <= 0) - rte_panic("cannot read on configuration pipe\n"); - - return 0; -} - -/* set affinity for current thread */ -static int -eal_thread_set_affinity(void) -{ - unsigned lcore_id = rte_lcore_id(); - - /* acquire system unique id */ - rte_gettid(); - - /* update EAL thread core affinity */ - return rte_thread_set_affinity(&lcore_config[lcore_id].cpuset); -} - -void eal_thread_init_master(unsigned lcore_id) -{ - /* set the lcore ID in per-lcore memory area */ - RTE_PER_LCORE(_lcore_id) = lcore_id; - - /* set CPU affinity */ - if (eal_thread_set_affinity() < 0) - rte_panic("cannot set affinity\n"); -} - -/* main loop of threads */ -__attribute__((noreturn)) void * -eal_thread_loop(__attribute__((unused)) void *arg) -{ - char c; - int n, ret; - unsigned lcore_id; - pthread_t thread_id; - int m2s, s2m; - char cpuset[RTE_CPU_AFFINITY_STR_LEN]; - - thread_id = pthread_self(); - - /* retrieve our lcore_id from the configuration structure */ - RTE_LCORE_FOREACH_SLAVE(lcore_id) { - if (thread_id == lcore_config[lcore_id].thread_id) - break; - } - if (lcore_id == RTE_MAX_LCORE) - rte_panic("cannot retrieve lcore id\n"); - - m2s = lcore_config[lcore_id].pipe_master2slave[0]; - s2m = lcore_config[lcore_id].pipe_slave2master[1]; - - /* set the lcore ID in per-lcore memory area */ - RTE_PER_LCORE(_lcore_id) = lcore_id; - - /* set CPU affinity */ - if (eal_thread_set_affinity() < 0) - rte_panic("cannot set affinity\n"); - - ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset)); - - RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%p;cpuset=[%s%s])\n", - lcore_id, thread_id, cpuset, ret == 0 ? 
"" : "..."); - - /* read on our pipe to get commands */ - while (1) { - void *fct_arg; - - /* wait command */ - do { - n = read(m2s, &c, 1); - } while (n < 0 && errno == EINTR); - - if (n <= 0) - rte_panic("cannot read on configuration pipe\n"); - - lcore_config[lcore_id].state = RUNNING; - - /* send ack */ - n = 0; - while (n == 0 || (n < 0 && errno == EINTR)) - n = write(s2m, &c, 1); - if (n < 0) - rte_panic("cannot write on configuration pipe\n"); - - if (lcore_config[lcore_id].f == NULL) - rte_panic("NULL function pointer\n"); - - /* call the function and store the return value */ - fct_arg = lcore_config[lcore_id].arg; - ret = lcore_config[lcore_id].f(fct_arg); - lcore_config[lcore_id].ret = ret; - rte_wmb(); - lcore_config[lcore_id].state = FINISHED; - } - - /* never reached */ - /* pthread_exit(NULL); */ - /* return NULL; */ -} - -/* require calling thread tid by gettid() */ -int rte_sys_gettid(void) -{ - long lwpid; - thr_self(&lwpid); - return (int)lwpid; -} - -int rte_thread_setname(pthread_t id, const char *name) -{ - /* this BSD function returns no error */ - pthread_set_name_np(id, name); - return 0; -} diff --git a/lib/librte_eal/freebsd/eal/eal_timer.c b/lib/librte_eal/freebsd/eal/eal_timer.c deleted file mode 100644 index beff755a47..0000000000 --- a/lib/librte_eal/freebsd/eal/eal_timer.c +++ /dev/null @@ -1,64 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2014 Intel Corporation - */ -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "eal_private.h" -#include "eal_internal_cfg.h" - -#ifdef RTE_LIBEAL_USE_HPET -#warning HPET is not supported in FreeBSD -#endif - -enum timer_source eal_timer_source = EAL_TIMER_TSC; - -uint64_t -get_tsc_freq(void) -{ - size_t sz; - int tmp; - uint64_t tsc_hz; - - sz = sizeof(tmp); - tmp = 0; - - if (sysctlbyname("kern.timecounter.smp_tsc", &tmp, &sz, NULL, 0)) - RTE_LOG(WARNING, EAL, "%s\n", strerror(errno)); - else if (tmp != 1) - RTE_LOG(WARNING, EAL, "TSC is not safe to use in SMP mode\n"); - - tmp = 0; - - if (sysctlbyname("kern.timecounter.invariant_tsc", &tmp, &sz, NULL, 0)) - RTE_LOG(WARNING, EAL, "%s\n", strerror(errno)); - else if (tmp != 1) - RTE_LOG(WARNING, EAL, "TSC is not invariant\n"); - - sz = sizeof(tsc_hz); - if (sysctlbyname("machdep.tsc_freq", &tsc_hz, &sz, NULL, 0)) { - RTE_LOG(WARNING, EAL, "%s\n", strerror(errno)); - return 0; - } - - return tsc_hz; -} - -int -rte_eal_timer_init(void) -{ - set_tsc_freq(); - return 0; -} diff --git a/lib/librte_eal/freebsd/eal/include/rte_os.h b/lib/librte_eal/freebsd/eal/include/rte_os.h deleted file mode 100644 index eeb750cd81..0000000000 --- a/lib/librte_eal/freebsd/eal/include/rte_os.h +++ /dev/null @@ -1,52 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2019 Intel Corporation - */ - -#ifndef _RTE_OS_H_ -#define _RTE_OS_H_ - -/** - * This is header should contain any function/macro definition - * which are not supported natively or named differently in the - * freebsd OS. Functions will be added in future releases. 
- */ - -#include - -typedef cpuset_t rte_cpuset_t; -#define RTE_CPU_AND(dst, src1, src2) do \ -{ \ - cpuset_t tmp; \ - CPU_COPY(src1, &tmp); \ - CPU_AND(&tmp, src2); \ - CPU_COPY(&tmp, dst); \ -} while (0) -#define RTE_CPU_OR(dst, src1, src2) do \ -{ \ - cpuset_t tmp; \ - CPU_COPY(src1, &tmp); \ - CPU_OR(&tmp, src2); \ - CPU_COPY(&tmp, dst); \ -} while (0) -#define RTE_CPU_FILL(set) CPU_FILL(set) - -/* In FreeBSD 13 CPU_NAND macro is CPU_ANDNOT */ -#ifdef CPU_NAND -#define RTE_CPU_NOT(dst, src) do \ -{ \ - cpuset_t tmp; \ - CPU_FILL(&tmp); \ - CPU_NAND(&tmp, src); \ - CPU_COPY(&tmp, dst); \ -} while (0) -#else -#define RTE_CPU_NOT(dst, src) do \ -{ \ - cpuset_t tmp; \ - CPU_FILL(&tmp); \ - CPU_ANDNOT(&tmp, src); \ - CPU_COPY(&tmp, dst); \ -} while (0) -#endif - -#endif /* _RTE_OS_H_ */ diff --git a/lib/librte_eal/freebsd/eal/meson.build b/lib/librte_eal/freebsd/eal/meson.build deleted file mode 100644 index 1426f7e5f1..0000000000 --- a/lib/librte_eal/freebsd/eal/meson.build +++ /dev/null @@ -1,22 +0,0 @@ -# SPDX-License-Identifier: BSD-3-Clause -# Copyright(c) 2017 Intel Corporation - -env_objs = [] -env_headers = files( - 'include/rte_os.h', -) -env_sources = files('eal_alarm.c', - 'eal_cpuflags.c', - 'eal_debug.c', - 'eal_hugepage_info.c', - 'eal_interrupts.c', - 'eal_lcore.c', - 'eal_memalloc.c', - 'eal_thread.c', - 'eal_timer.c', - 'eal.c', - 'eal_memory.c', - 'eal_dev.c' -) - -deps += ['kvargs'] diff --git a/lib/librte_eal/freebsd/eal_alarm.c b/lib/librte_eal/freebsd/eal_alarm.c new file mode 100644 index 0000000000..51ea4b8c08 --- /dev/null +++ b/lib/librte_eal/freebsd/eal_alarm.c @@ -0,0 +1,314 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_alarm_private.h" + +#define NS_PER_US 1000 + +#ifdef CLOCK_MONOTONIC_RAW /* Defined in glibc bits/time.h */ +#define CLOCK_TYPE_ID CLOCK_MONOTONIC_RAW +#else +#define CLOCK_TYPE_ID CLOCK_MONOTONIC +#endif + +struct alarm_entry { + LIST_ENTRY(alarm_entry) next; + struct rte_intr_handle handle; + struct timespec time; + rte_eal_alarm_callback cb_fn; + void *cb_arg; + volatile uint8_t executing; + volatile pthread_t executing_id; +}; + +static LIST_HEAD(alarm_list, alarm_entry) alarm_list = LIST_HEAD_INITIALIZER(); +static rte_spinlock_t alarm_list_lk = RTE_SPINLOCK_INITIALIZER; + +static struct rte_intr_handle intr_handle = {.fd = -1 }; +static void eal_alarm_callback(void *arg); + +int +rte_eal_alarm_init(void) +{ + intr_handle.type = RTE_INTR_HANDLE_ALARM; + + /* on FreeBSD, timers don't use fd's, and their identifiers are stored + * in separate namespace from fd's, so using any value is OK. however, + * EAL interrupts handler expects fd's to be unique, so use an actual fd + * to guarantee unique timer identifier. 
+ */ + intr_handle.fd = open("/dev/zero", O_RDONLY); + + return 0; +} + +static inline int +timespec_cmp(const struct timespec *now, const struct timespec *at) +{ + if (now->tv_sec < at->tv_sec) + return -1; + if (now->tv_sec > at->tv_sec) + return 1; + if (now->tv_nsec < at->tv_nsec) + return -1; + if (now->tv_nsec > at->tv_nsec) + return 1; + return 0; +} + +static inline uint64_t +diff_ns(struct timespec *now, struct timespec *at) +{ + uint64_t now_ns, at_ns; + + if (timespec_cmp(now, at) >= 0) + return 0; + + now_ns = now->tv_sec * NS_PER_S + now->tv_nsec; + at_ns = at->tv_sec * NS_PER_S + at->tv_nsec; + + return at_ns - now_ns; +} + +int +eal_alarm_get_timeout_ns(uint64_t *val) +{ + struct alarm_entry *ap; + struct timespec now; + + if (clock_gettime(CLOCK_TYPE_ID, &now) < 0) + return -1; + + if (LIST_EMPTY(&alarm_list)) + return -1; + + ap = LIST_FIRST(&alarm_list); + + *val = diff_ns(&now, &ap->time); + + return 0; +} + +static int +unregister_current_callback(void) +{ + struct alarm_entry *ap; + int ret = 0; + + if (!LIST_EMPTY(&alarm_list)) { + ap = LIST_FIRST(&alarm_list); + + do { + ret = rte_intr_callback_unregister(&intr_handle, + eal_alarm_callback, &ap->time); + } while (ret == -EAGAIN); + } + + return ret; +} + +static int +register_first_callback(void) +{ + struct alarm_entry *ap; + int ret = 0; + + if (!LIST_EMPTY(&alarm_list)) { + ap = LIST_FIRST(&alarm_list); + + /* register a new callback */ + ret = rte_intr_callback_register(&intr_handle, + eal_alarm_callback, &ap->time); + } + return ret; +} + +static void +eal_alarm_callback(void *arg __rte_unused) +{ + struct timespec now; + struct alarm_entry *ap; + + rte_spinlock_lock(&alarm_list_lk); + ap = LIST_FIRST(&alarm_list); + + if (clock_gettime(CLOCK_TYPE_ID, &now) < 0) + return; + + while (ap != NULL && timespec_cmp(&now, &ap->time) >= 0) { + ap->executing = 1; + ap->executing_id = pthread_self(); + rte_spinlock_unlock(&alarm_list_lk); + + ap->cb_fn(ap->cb_arg); + + rte_spinlock_lock(&alarm_list_lk); + + LIST_REMOVE(ap, next); + free(ap); + + ap = LIST_FIRST(&alarm_list); + } + + /* timer has been deleted from the kqueue, so recreate it if needed */ + register_first_callback(); + + rte_spinlock_unlock(&alarm_list_lk); +} + + +int +rte_eal_alarm_set(uint64_t us, rte_eal_alarm_callback cb_fn, void *cb_arg) +{ + struct alarm_entry *ap, *new_alarm; + struct timespec now; + uint64_t ns; + int ret = 0; + + /* check parameters, also ensure us won't cause a uint64_t overflow */ + if (us < 1 || us > (UINT64_MAX - US_PER_S) || cb_fn == NULL) + return -EINVAL; + + new_alarm = calloc(1, sizeof(*new_alarm)); + if (new_alarm == NULL) + return -ENOMEM; + + /* use current time to calculate absolute time of alarm */ + clock_gettime(CLOCK_TYPE_ID, &now); + + ns = us * NS_PER_US; + + new_alarm->cb_fn = cb_fn; + new_alarm->cb_arg = cb_arg; + new_alarm->time.tv_nsec = (now.tv_nsec + ns) % NS_PER_S; + new_alarm->time.tv_sec = now.tv_sec + ((now.tv_nsec + ns) / NS_PER_S); + + rte_spinlock_lock(&alarm_list_lk); + + if (LIST_EMPTY(&alarm_list)) + LIST_INSERT_HEAD(&alarm_list, new_alarm, next); + else { + LIST_FOREACH(ap, &alarm_list, next) { + if (timespec_cmp(&new_alarm->time, &ap->time) < 0) { + LIST_INSERT_BEFORE(ap, new_alarm, next); + break; + } + if (LIST_NEXT(ap, next) == NULL) { + LIST_INSERT_AFTER(ap, new_alarm, next); + break; + } + } + } + + /* re-register first callback just in case */ + register_first_callback(); + + rte_spinlock_unlock(&alarm_list_lk); + + return ret; +} + +int +rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn, 
void *cb_arg) +{ + struct alarm_entry *ap, *ap_prev; + int count = 0; + int err = 0; + int executing; + + if (!cb_fn) { + rte_errno = EINVAL; + return -1; + } + + do { + executing = 0; + rte_spinlock_lock(&alarm_list_lk); + /* remove any matches at the start of the list */ + while (1) { + ap = LIST_FIRST(&alarm_list); + if (ap == NULL) + break; + if (cb_fn != ap->cb_fn) + break; + if (cb_arg != ap->cb_arg && cb_arg != (void *) -1) + break; + if (ap->executing == 0) { + LIST_REMOVE(ap, next); + free(ap); + count++; + } else { + /* If calling from other context, mark that + * alarm is executing so loop can spin till it + * finish. Otherwise we are trying to cancel + * ourselves - mark it by EINPROGRESS. + */ + if (pthread_equal(ap->executing_id, + pthread_self()) == 0) + executing++; + else + err = EINPROGRESS; + + break; + } + } + ap_prev = ap; + + /* now go through list, removing entries not at start */ + LIST_FOREACH(ap, &alarm_list, next) { + /* this won't be true first time through */ + if (cb_fn == ap->cb_fn && + (cb_arg == (void *)-1 || + cb_arg == ap->cb_arg)) { + if (ap->executing == 0) { + LIST_REMOVE(ap, next); + free(ap); + count++; + ap = ap_prev; + } else if (pthread_equal(ap->executing_id, + pthread_self()) == 0) { + executing++; + } else { + err = EINPROGRESS; + } + } + ap_prev = ap; + } + rte_spinlock_unlock(&alarm_list_lk); + } while (executing != 0); + + if (count == 0 && err == 0) + rte_errno = ENOENT; + else if (err) + rte_errno = err; + + rte_spinlock_lock(&alarm_list_lk); + + /* unregister if no alarms left, otherwise re-register first */ + if (LIST_EMPTY(&alarm_list)) + unregister_current_callback(); + else + register_first_callback(); + + rte_spinlock_unlock(&alarm_list_lk); + + return count; +} diff --git a/lib/librte_eal/freebsd/eal_alarm_private.h b/lib/librte_eal/freebsd/eal_alarm_private.h new file mode 100644 index 0000000000..65c7115184 --- /dev/null +++ b/lib/librte_eal/freebsd/eal_alarm_private.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#ifndef EAL_ALARM_PRIVATE_H +#define EAL_ALARM_PRIVATE_H + +#include + +/* + * FreeBSD needs a back-channel communication mechanism between interrupt and + * alarm thread, because on FreeBSD, timer period is set up inside the interrupt + * API and not inside alarm API like on Linux. 
+ */ + +int +eal_alarm_get_timeout_ns(uint64_t *val); + +#endif // EAL_ALARM_PRIVATE_H diff --git a/lib/librte_eal/freebsd/eal_cpuflags.c b/lib/librte_eal/freebsd/eal_cpuflags.c new file mode 100644 index 0000000000..69b161ea65 --- /dev/null +++ b/lib/librte_eal/freebsd/eal_cpuflags.c @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 Mellanox Technologies, Ltd + */ + +#include +#include + +unsigned long +rte_cpu_getauxval(unsigned long type __rte_unused) +{ + /* not implemented */ + return 0; +} + +int +rte_cpu_strcmp_auxval(unsigned long type __rte_unused, + const char *str __rte_unused) +{ + /* not implemented */ + return -1; +} diff --git a/lib/librte_eal/freebsd/eal_debug.c b/lib/librte_eal/freebsd/eal_debug.c new file mode 100644 index 0000000000..5d92500bf5 --- /dev/null +++ b/lib/librte_eal/freebsd/eal_debug.c @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifdef RTE_BACKTRACE +#include +#endif +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define BACKTRACE_SIZE 256 + +/* dump the stack of the calling core */ +void rte_dump_stack(void) +{ +#ifdef RTE_BACKTRACE + void *func[BACKTRACE_SIZE]; + char **symb = NULL; + int size; + + size = backtrace(func, BACKTRACE_SIZE); + symb = backtrace_symbols(func, size); + + if (symb == NULL) + return; + + while (size > 0) { + rte_log(RTE_LOG_ERR, RTE_LOGTYPE_EAL, + "%d: [%s]\n", size, symb[size - 1]); + size --; + } + + free(symb); +#endif /* RTE_BACKTRACE */ +} + +/* not implemented in this environment */ +void rte_dump_registers(void) +{ + return; +} + +/* call abort(), it will generate a coredump if enabled */ +void __rte_panic(const char *funcname, const char *format, ...) +{ + va_list ap; + + rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, "PANIC in %s():\n", funcname); + va_start(ap, format); + rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap); + va_end(ap); + rte_dump_stack(); + rte_dump_registers(); + abort(); +} + +/* + * Like rte_panic this terminates the application. However, no traceback is + * provided and no core-dump is generated. + */ +void +rte_exit(int exit_code, const char *format, ...) 
+{ + va_list ap; + + if (exit_code != 0) + RTE_LOG(CRIT, EAL, "Error - exiting with code: %d\n" + " Cause: ", exit_code); + + va_start(ap, format); + rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap); + va_end(ap); + +#ifndef RTE_EAL_ALWAYS_PANIC_ON_ERROR + if (rte_eal_cleanup() != 0) + RTE_LOG(CRIT, EAL, + "EAL could not release all resources\n"); + exit(exit_code); +#else + rte_dump_stack(); + rte_dump_registers(); + abort(); +#endif +} diff --git a/lib/librte_eal/freebsd/eal_dev.c b/lib/librte_eal/freebsd/eal_dev.c new file mode 100644 index 0000000000..8e06e70890 --- /dev/null +++ b/lib/librte_eal/freebsd/eal_dev.c @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include +#include +#include + +int +rte_dev_event_monitor_start(void) +{ + RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n"); + return -1; +} + +int +rte_dev_event_monitor_stop(void) +{ + RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n"); + return -1; +} + +int +rte_dev_hotplug_handle_enable(void) +{ + RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n"); + return -1; +} + +int +rte_dev_hotplug_handle_disable(void) +{ + RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n"); + return -1; +} diff --git a/lib/librte_eal/freebsd/eal_hugepage_info.c b/lib/librte_eal/freebsd/eal_hugepage_info.c new file mode 100644 index 0000000000..32012e1427 --- /dev/null +++ b/lib/librte_eal/freebsd/eal_hugepage_info.c @@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#include +#include +#include +#include + +#include +#include +#include "eal_hugepages.h" +#include "eal_internal_cfg.h" +#include "eal_filesystem.h" + +#define CONTIGMEM_DEV "/dev/contigmem" + +/* + * Uses mmap to create a shared memory area for storage of data + * Used in this file to store the hugepage file map on disk + */ +static void * +map_shared_memory(const char *filename, const size_t mem_size, int flags) +{ + void *retval; + int fd = open(filename, flags, 0600); + if (fd < 0) + return NULL; + if (ftruncate(fd, mem_size) < 0) { + close(fd); + return NULL; + } + retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + close(fd); + return retval; +} + +static void * +open_shared_memory(const char *filename, const size_t mem_size) +{ + return map_shared_memory(filename, mem_size, O_RDWR); +} + +static void * +create_shared_memory(const char *filename, const size_t mem_size) +{ + return map_shared_memory(filename, mem_size, O_RDWR | O_CREAT); +} + +/* + * No hugepage support on freebsd, but we dummy it, using contigmem driver + */ +int +eal_hugepage_info_init(void) +{ + size_t sysctl_size; + int num_buffers, fd, error; + int64_t buffer_size; + /* re-use the linux "internal config" structure for our memory data */ + struct hugepage_info *hpi = &internal_config.hugepage_info[0]; + struct hugepage_info *tmp_hpi; + unsigned int i; + + internal_config.num_hugepage_sizes = 1; + + sysctl_size = sizeof(num_buffers); + error = sysctlbyname("hw.contigmem.num_buffers", &num_buffers, + &sysctl_size, NULL, 0); + + if (error != 0) { + RTE_LOG(ERR, EAL, "could not read sysctl hw.contigmem.num_buffers\n"); + return -1; + } + + sysctl_size = sizeof(buffer_size); + error = sysctlbyname("hw.contigmem.buffer_size", &buffer_size, + &sysctl_size, NULL, 0); + + if (error != 0) { + RTE_LOG(ERR, EAL, "could not read sysctl hw.contigmem.buffer_size\n"); + return -1; + } + + fd = open(CONTIGMEM_DEV, O_RDWR); + if 
(fd < 0) { + RTE_LOG(ERR, EAL, "could not open "CONTIGMEM_DEV"\n"); + return -1; + } + + if (buffer_size >= 1<<30) + RTE_LOG(INFO, EAL, "Contigmem driver has %d buffers, each of size %dGB\n", + num_buffers, (int)(buffer_size>>30)); + else if (buffer_size >= 1<<20) + RTE_LOG(INFO, EAL, "Contigmem driver has %d buffers, each of size %dMB\n", + num_buffers, (int)(buffer_size>>20)); + else + RTE_LOG(INFO, EAL, "Contigmem driver has %d buffers, each of size %dKB\n", + num_buffers, (int)(buffer_size>>10)); + + strlcpy(hpi->hugedir, CONTIGMEM_DEV, sizeof(hpi->hugedir)); + hpi->hugepage_sz = buffer_size; + hpi->num_pages[0] = num_buffers; + hpi->lock_descriptor = fd; + + /* for no shared files mode, do not create shared memory config */ + if (internal_config.no_shconf) + return 0; + + tmp_hpi = create_shared_memory(eal_hugepage_info_path(), + sizeof(internal_config.hugepage_info)); + if (tmp_hpi == NULL ) { + RTE_LOG(ERR, EAL, "Failed to create shared memory!\n"); + return -1; + } + + memcpy(tmp_hpi, hpi, sizeof(internal_config.hugepage_info)); + + /* we've copied file descriptors along with everything else, but they + * will be invalid in secondary process, so overwrite them + */ + for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) { + struct hugepage_info *tmp = &tmp_hpi[i]; + tmp->lock_descriptor = -1; + } + + if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) { + RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n"); + return -1; + } + + return 0; +} + +/* copy stuff from shared info into internal config */ +int +eal_hugepage_info_read(void) +{ + struct hugepage_info *hpi = &internal_config.hugepage_info[0]; + struct hugepage_info *tmp_hpi; + + internal_config.num_hugepage_sizes = 1; + + tmp_hpi = open_shared_memory(eal_hugepage_info_path(), + sizeof(internal_config.hugepage_info)); + if (tmp_hpi == NULL) { + RTE_LOG(ERR, EAL, "Failed to open shared memory!\n"); + return -1; + } + + memcpy(hpi, tmp_hpi, sizeof(internal_config.hugepage_info)); + + if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) { + RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n"); + return -1; + } + return 0; +} diff --git a/lib/librte_eal/freebsd/eal_interrupts.c b/lib/librte_eal/freebsd/eal_interrupts.c new file mode 100644 index 0000000000..00991f26a9 --- /dev/null +++ b/lib/librte_eal/freebsd/eal_interrupts.c @@ -0,0 +1,685 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_alarm_private.h" + +#define MAX_INTR_EVENTS 16 + +/** + * union buffer for reading on different devices + */ +union rte_intr_read_buffer { + char charbuf[16]; /* for others */ +}; + +TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback); +TAILQ_HEAD(rte_intr_source_list, rte_intr_source); + +struct rte_intr_callback { + TAILQ_ENTRY(rte_intr_callback) next; + rte_intr_callback_fn cb_fn; /**< callback address */ + void *cb_arg; /**< parameter for callback */ + uint8_t pending_delete; /**< delete after callback is called */ + rte_intr_unregister_callback_fn ucb_fn; /**< fn to call before cb is deleted */ +}; + +struct rte_intr_source { + TAILQ_ENTRY(rte_intr_source) next; + struct rte_intr_handle intr_handle; /**< interrupt handle */ + struct rte_intr_cb_list callbacks; /**< user callbacks */ + uint32_t active; +}; + +/* global spinlock for interrupt data operation */ +static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER; + 
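
A minimal standalone sketch (illustration only, not part of this patch) of the kqueue pattern the FreeBSD files above rely on: alarms are registered as one-shot EVFILT_TIMER events and device file descriptors as EVFILT_READ events, and a single kevent() loop dispatches both. The sketch assumes only the stock FreeBSD kqueue API and uses the default millisecond timer units rather than the NOTE_NSECONDS flag the EAL sets:

	#include <sys/types.h>
	#include <sys/event.h>
	#include <sys/time.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		struct kevent change, event;
		int kq = kqueue();	/* one queue, as in rte_eal_intr_init() */

		if (kq < 0)
			return 1;

		/* register a one-shot 500 ms timer, the same shape the EAL
		 * uses for alarm callbacks (EV_ADD | EV_ONESHOT on
		 * EVFILT_TIMER)
		 */
		EV_SET(&change, 1, EVFILT_TIMER, EV_ADD | EV_ONESHOT,
		       0, 500, NULL);
		if (kevent(kq, &change, 1, NULL, 0, NULL) < 0)
			return 1;

		/* block until the event fires, like eal_intr_thread_main() */
		if (kevent(kq, NULL, 0, &event, 1, NULL) == 1)
			printf("timer fired, ident=%ju\n",
			       (uintmax_t)event.ident);

		close(kq);
		return 0;
	}
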
+/* interrupt sources list */ +static struct rte_intr_source_list intr_sources; + +/* interrupt handling thread */ +static pthread_t intr_thread; + +static volatile int kq = -1; + +static int +intr_source_to_kevent(const struct rte_intr_handle *ih, struct kevent *ke) +{ + /* alarm callbacks are special case */ + if (ih->type == RTE_INTR_HANDLE_ALARM) { + uint64_t timeout_ns; + + /* get soonest alarm timeout */ + if (eal_alarm_get_timeout_ns(&timeout_ns) < 0) + return -1; + + ke->filter = EVFILT_TIMER; + /* timers are one shot */ + ke->flags |= EV_ONESHOT; + ke->fflags = NOTE_NSECONDS; + ke->data = timeout_ns; + } else { + ke->filter = EVFILT_READ; + } + ke->ident = ih->fd; + + return 0; +} + +int +rte_intr_callback_register(const struct rte_intr_handle *intr_handle, + rte_intr_callback_fn cb, void *cb_arg) +{ + struct rte_intr_callback *callback; + struct rte_intr_source *src; + int ret, add_event = 0; + + /* first do parameter checking */ + if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) { + RTE_LOG(ERR, EAL, + "Registering with invalid input parameter\n"); + return -EINVAL; + } + if (kq < 0) { + RTE_LOG(ERR, EAL, "Kqueue is not active: %d\n", kq); + return -ENODEV; + } + + rte_spinlock_lock(&intr_lock); + + /* find the source for this intr_handle */ + TAILQ_FOREACH(src, &intr_sources, next) { + if (src->intr_handle.fd == intr_handle->fd) + break; + } + + /* if this is an alarm interrupt and it already has a callback, + * then we don't want to create a new callback because the only + * thing on the list should be eal_alarm_callback() and we may + * be called just to reset the timer. + */ + if (src != NULL && src->intr_handle.type == RTE_INTR_HANDLE_ALARM && + !TAILQ_EMPTY(&src->callbacks)) { + callback = NULL; + } else { + /* allocate a new interrupt callback entity */ + callback = calloc(1, sizeof(*callback)); + if (callback == NULL) { + RTE_LOG(ERR, EAL, "Can not allocate memory\n"); + ret = -ENOMEM; + goto fail; + } + callback->cb_fn = cb; + callback->cb_arg = cb_arg; + callback->pending_delete = 0; + callback->ucb_fn = NULL; + + if (src == NULL) { + src = calloc(1, sizeof(*src)); + if (src == NULL) { + RTE_LOG(ERR, EAL, "Can not allocate memory\n"); + ret = -ENOMEM; + goto fail; + } else { + src->intr_handle = *intr_handle; + TAILQ_INIT(&src->callbacks); + TAILQ_INSERT_TAIL(&intr_sources, src, next); + } + } + + /* we had no interrupts for this */ + if (TAILQ_EMPTY(&src->callbacks)) + add_event = 1; + + TAILQ_INSERT_TAIL(&(src->callbacks), callback, next); + } + + /* add events to the queue. timer events are special as we need to + * re-set the timer. + */ + if (add_event || src->intr_handle.type == RTE_INTR_HANDLE_ALARM) { + struct kevent ke; + + memset(&ke, 0, sizeof(ke)); + ke.flags = EV_ADD; /* mark for addition to the queue */ + + if (intr_source_to_kevent(intr_handle, &ke) < 0) { + RTE_LOG(ERR, EAL, "Cannot convert interrupt handle to kevent\n"); + ret = -ENODEV; + goto fail; + } + + /** + * add the intr file descriptor into wait list. + */ + if (kevent(kq, &ke, 1, NULL, 0, NULL) < 0) { + /* currently, nic_uio does not support interrupts, so + * this error will always be triggered and output to the + * user. so, don't output it unless debug log level set. 
+ */ + if (errno == ENODEV) + RTE_LOG(DEBUG, EAL, "Interrupt handle %d not supported\n", + src->intr_handle.fd); + else + RTE_LOG(ERR, EAL, "Error adding fd %d " + "kevent, %s\n", + src->intr_handle.fd, + strerror(errno)); + ret = -errno; + goto fail; + } + } + rte_spinlock_unlock(&intr_lock); + + return 0; +fail: + /* clean up */ + if (src != NULL) { + if (callback != NULL) + TAILQ_REMOVE(&(src->callbacks), callback, next); + if (TAILQ_EMPTY(&(src->callbacks))) { + TAILQ_REMOVE(&intr_sources, src, next); + free(src); + } + } + free(callback); + rte_spinlock_unlock(&intr_lock); + return ret; +} + +int +rte_intr_callback_unregister_pending(const struct rte_intr_handle *intr_handle, + rte_intr_callback_fn cb_fn, void *cb_arg, + rte_intr_unregister_callback_fn ucb_fn) +{ + int ret; + struct rte_intr_source *src; + struct rte_intr_callback *cb, *next; + + /* do parameter checking first */ + if (intr_handle == NULL || intr_handle->fd < 0) { + RTE_LOG(ERR, EAL, + "Unregistering with invalid input parameter\n"); + return -EINVAL; + } + + if (kq < 0) { + RTE_LOG(ERR, EAL, "Kqueue is not active\n"); + return -ENODEV; + } + + rte_spinlock_lock(&intr_lock); + + /* check if the insterrupt source for the fd is existent */ + TAILQ_FOREACH(src, &intr_sources, next) + if (src->intr_handle.fd == intr_handle->fd) + break; + + /* No interrupt source registered for the fd */ + if (src == NULL) { + ret = -ENOENT; + + /* only usable if the source is active */ + } else if (src->active == 0) { + ret = -EAGAIN; + + } else { + ret = 0; + + /* walk through the callbacks and mark all that match. */ + for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) { + next = TAILQ_NEXT(cb, next); + if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 || + cb->cb_arg == cb_arg)) { + cb->pending_delete = 1; + cb->ucb_fn = ucb_fn; + ret++; + } + } + } + + rte_spinlock_unlock(&intr_lock); + + return ret; +} + +int +rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle, + rte_intr_callback_fn cb_fn, void *cb_arg) +{ + int ret; + struct rte_intr_source *src; + struct rte_intr_callback *cb, *next; + + /* do parameter checking first */ + if (intr_handle == NULL || intr_handle->fd < 0) { + RTE_LOG(ERR, EAL, + "Unregistering with invalid input parameter\n"); + return -EINVAL; + } + if (kq < 0) { + RTE_LOG(ERR, EAL, "Kqueue is not active\n"); + return -ENODEV; + } + + rte_spinlock_lock(&intr_lock); + + /* check if the insterrupt source for the fd is existent */ + TAILQ_FOREACH(src, &intr_sources, next) + if (src->intr_handle.fd == intr_handle->fd) + break; + + /* No interrupt source registered for the fd */ + if (src == NULL) { + ret = -ENOENT; + + /* interrupt source has some active callbacks right now. */ + } else if (src->active != 0) { + ret = -EAGAIN; + + /* ok to remove. */ + } else { + struct kevent ke; + + ret = 0; + + /* remove it from the kqueue */ + memset(&ke, 0, sizeof(ke)); + ke.flags = EV_DELETE; /* mark for deletion from the queue */ + + if (intr_source_to_kevent(intr_handle, &ke) < 0) { + RTE_LOG(ERR, EAL, "Cannot convert to kevent\n"); + ret = -ENODEV; + goto out; + } + + /** + * remove intr file descriptor from wait list. + */ + if (kevent(kq, &ke, 1, NULL, 0, NULL) < 0) { + RTE_LOG(ERR, EAL, "Error removing fd %d kevent, %s\n", + src->intr_handle.fd, strerror(errno)); + /* removing non-existent even is an expected condition + * in some circumstances (e.g. oneshot events). + */ + } + + /*walk through the callbacks and remove all that match. 
*/ + for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) { + next = TAILQ_NEXT(cb, next); + if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 || + cb->cb_arg == cb_arg)) { + TAILQ_REMOVE(&src->callbacks, cb, next); + free(cb); + ret++; + } + } + + /* all callbacks for that source are removed. */ + if (TAILQ_EMPTY(&src->callbacks)) { + TAILQ_REMOVE(&intr_sources, src, next); + free(src); + } + } +out: + rte_spinlock_unlock(&intr_lock); + + return ret; +} + +int +rte_intr_enable(const struct rte_intr_handle *intr_handle) +{ + if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV) + return 0; + + if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) + return -1; + + switch (intr_handle->type) { + /* not used at this moment */ + case RTE_INTR_HANDLE_ALARM: + return -1; + /* not used at this moment */ + case RTE_INTR_HANDLE_DEV_EVENT: + return -1; + /* unknown handle type */ + default: + RTE_LOG(ERR, EAL, + "Unknown handle type of fd %d\n", + intr_handle->fd); + return -1; + } + + return 0; +} + +int +rte_intr_disable(const struct rte_intr_handle *intr_handle) +{ + if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV) + return 0; + + if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) + return -1; + + switch (intr_handle->type) { + /* not used at this moment */ + case RTE_INTR_HANDLE_ALARM: + return -1; + /* not used at this moment */ + case RTE_INTR_HANDLE_DEV_EVENT: + return -1; + /* unknown handle type */ + default: + RTE_LOG(ERR, EAL, + "Unknown handle type of fd %d\n", + intr_handle->fd); + return -1; + } + + return 0; +} + +int +rte_intr_ack(const struct rte_intr_handle *intr_handle) +{ + if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV) + return 0; + + return -1; +} + +static void +eal_intr_process_interrupts(struct kevent *events, int nfds) +{ + struct rte_intr_callback active_cb; + union rte_intr_read_buffer buf; + struct rte_intr_callback *cb, *next; + struct rte_intr_source *src; + bool call = false; + int n, bytes_read; + struct kevent ke; + + for (n = 0; n < nfds; n++) { + int event_fd = events[n].ident; + + rte_spinlock_lock(&intr_lock); + TAILQ_FOREACH(src, &intr_sources, next) + if (src->intr_handle.fd == event_fd) + break; + if (src == NULL) { + rte_spinlock_unlock(&intr_lock); + continue; + } + + /* mark this interrupt source as active and release the lock. */ + src->active = 1; + rte_spinlock_unlock(&intr_lock); + + /* set the length to be read dor different handle type */ + switch (src->intr_handle.type) { + case RTE_INTR_HANDLE_ALARM: + bytes_read = 0; + call = true; + break; + case RTE_INTR_HANDLE_VDEV: + case RTE_INTR_HANDLE_EXT: + bytes_read = 0; + call = true; + break; + case RTE_INTR_HANDLE_DEV_EVENT: + bytes_read = 0; + call = true; + break; + default: + bytes_read = 1; + break; + } + + if (bytes_read > 0) { + /** + * read out to clear the ready-to-be-read flag + * for epoll_wait. + */ + bytes_read = read(event_fd, &buf, bytes_read); + if (bytes_read < 0) { + if (errno == EINTR || errno == EWOULDBLOCK) + continue; + + RTE_LOG(ERR, EAL, "Error reading from file " + "descriptor %d: %s\n", + event_fd, + strerror(errno)); + } else if (bytes_read == 0) + RTE_LOG(ERR, EAL, "Read nothing from file " + "descriptor %d\n", event_fd); + else + call = true; + } + + /* grab a lock, again to call callbacks and update status. */ + rte_spinlock_lock(&intr_lock); + + if (call) { + /* Finally, call all callbacks. */ + TAILQ_FOREACH(cb, &src->callbacks, next) { + + /* make a copy and unlock. 
*/ + active_cb = *cb; + rte_spinlock_unlock(&intr_lock); + + /* call the actual callback */ + active_cb.cb_fn(active_cb.cb_arg); + + /*get the lock back. */ + rte_spinlock_lock(&intr_lock); + } + } + + /* we done with that interrupt source, release it. */ + src->active = 0; + + /* check if any callback are supposed to be removed */ + for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) { + next = TAILQ_NEXT(cb, next); + if (cb->pending_delete) { + /* remove it from the kqueue */ + memset(&ke, 0, sizeof(ke)); + /* mark for deletion from the queue */ + ke.flags = EV_DELETE; + + if (intr_source_to_kevent(&src->intr_handle, &ke) < 0) { + RTE_LOG(ERR, EAL, "Cannot convert to kevent\n"); + rte_spinlock_unlock(&intr_lock); + return; + } + + /** + * remove intr file descriptor from wait list. + */ + if (kevent(kq, &ke, 1, NULL, 0, NULL) < 0) { + RTE_LOG(ERR, EAL, "Error removing fd %d kevent, " + "%s\n", src->intr_handle.fd, + strerror(errno)); + /* removing non-existent even is an expected + * condition in some circumstances + * (e.g. oneshot events). + */ + } + + TAILQ_REMOVE(&src->callbacks, cb, next); + if (cb->ucb_fn) + cb->ucb_fn(&src->intr_handle, cb->cb_arg); + free(cb); + } + } + + /* all callbacks for that source are removed. */ + if (TAILQ_EMPTY(&src->callbacks)) { + TAILQ_REMOVE(&intr_sources, src, next); + free(src); + } + + rte_spinlock_unlock(&intr_lock); + } +} + +static void * +eal_intr_thread_main(void *arg __rte_unused) +{ + struct kevent events[MAX_INTR_EVENTS]; + int nfds; + + /* host thread, never break out */ + for (;;) { + /* do not change anything, just wait */ + nfds = kevent(kq, NULL, 0, events, MAX_INTR_EVENTS, NULL); + + /* kevent fail */ + if (nfds < 0) { + if (errno == EINTR) + continue; + RTE_LOG(ERR, EAL, + "kevent returns with fail\n"); + break; + } + /* kevent timeout, will never happen here */ + else if (nfds == 0) + continue; + + /* kevent has at least one fd ready to read */ + eal_intr_process_interrupts(events, nfds); + } + close(kq); + kq = -1; + return NULL; +} + +int +rte_eal_intr_init(void) +{ + int ret = 0; + + /* init the global interrupt source head */ + TAILQ_INIT(&intr_sources); + + kq = kqueue(); + if (kq < 0) { + RTE_LOG(ERR, EAL, "Cannot create kqueue instance\n"); + return -1; + } + + /* create the host thread to wait/handle the interrupt */ + ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL, + eal_intr_thread_main, NULL); + if (ret != 0) { + rte_errno = -ret; + RTE_LOG(ERR, EAL, + "Failed to create thread for interrupt handling\n"); + } + + return ret; +} + +int +rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, + int epfd, int op, unsigned int vec, void *data) +{ + RTE_SET_USED(intr_handle); + RTE_SET_USED(epfd); + RTE_SET_USED(op); + RTE_SET_USED(vec); + RTE_SET_USED(data); + + return -ENOTSUP; +} + +int +rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd) +{ + RTE_SET_USED(intr_handle); + RTE_SET_USED(nb_efd); + + return 0; +} + +void +rte_intr_efd_disable(struct rte_intr_handle *intr_handle) +{ + RTE_SET_USED(intr_handle); +} + +int +rte_intr_dp_is_en(struct rte_intr_handle *intr_handle) +{ + RTE_SET_USED(intr_handle); + return 0; +} + +int +rte_intr_allow_others(struct rte_intr_handle *intr_handle) +{ + RTE_SET_USED(intr_handle); + return 1; +} + +int +rte_intr_cap_multiple(struct rte_intr_handle *intr_handle) +{ + RTE_SET_USED(intr_handle); + return 0; +} + +int +rte_epoll_wait(int epfd, struct rte_epoll_event *events, + int maxevents, int timeout) +{ + RTE_SET_USED(epfd); + 
RTE_SET_USED(events); + RTE_SET_USED(maxevents); + RTE_SET_USED(timeout); + + return -ENOTSUP; +} + +int +rte_epoll_ctl(int epfd, int op, int fd, struct rte_epoll_event *event) +{ + RTE_SET_USED(epfd); + RTE_SET_USED(op); + RTE_SET_USED(fd); + RTE_SET_USED(event); + + return -ENOTSUP; +} + +int +rte_intr_tls_epfd(void) +{ + return -ENOTSUP; +} + +void +rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle) +{ + RTE_SET_USED(intr_handle); +} + +int rte_thread_is_intr(void) +{ + return pthread_equal(intr_thread, pthread_self()); +} diff --git a/lib/librte_eal/freebsd/eal_lcore.c b/lib/librte_eal/freebsd/eal_lcore.c new file mode 100644 index 0000000000..d9ef4bc9c5 --- /dev/null +++ b/lib/librte_eal/freebsd/eal_lcore.c @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include + +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_thread.h" + +/* No topology information available on FreeBSD including NUMA info */ +unsigned +eal_cpu_core_id(__rte_unused unsigned lcore_id) +{ + return 0; +} + +static int +eal_get_ncpus(void) +{ + static int ncpu = -1; + int mib[2] = {CTL_HW, HW_NCPU}; + size_t len = sizeof(ncpu); + + if (ncpu < 0) { + sysctl(mib, 2, &ncpu, &len, NULL, 0); + RTE_LOG(INFO, EAL, "Sysctl reports %d cpus\n", ncpu); + } + return ncpu; +} + +unsigned +eal_cpu_socket_id(__rte_unused unsigned cpu_id) +{ + return 0; +} + +/* Check if a cpu is present by the presence of the + * cpu information for it. + */ +int +eal_cpu_detected(unsigned lcore_id) +{ + const unsigned ncpus = eal_get_ncpus(); + return lcore_id < ncpus; +} diff --git a/lib/librte_eal/freebsd/eal_memalloc.c b/lib/librte_eal/freebsd/eal_memalloc.c new file mode 100644 index 0000000000..6893448db7 --- /dev/null +++ b/lib/librte_eal/freebsd/eal_memalloc.c @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ + +#include + +#include +#include +#include + +#include "eal_memalloc.h" + +int +eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms __rte_unused, + int __rte_unused n_segs, size_t __rte_unused page_sz, + int __rte_unused socket, bool __rte_unused exact) +{ + RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n"); + return -1; +} + +struct rte_memseg * +eal_memalloc_alloc_seg(size_t __rte_unused page_sz, int __rte_unused socket) +{ + RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n"); + return NULL; +} + +int +eal_memalloc_free_seg(struct rte_memseg *ms __rte_unused) +{ + RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n"); + return -1; +} + +int +eal_memalloc_free_seg_bulk(struct rte_memseg **ms __rte_unused, + int n_segs __rte_unused) +{ + RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n"); + return -1; +} + +int +eal_memalloc_sync_with_primary(void) +{ + RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n"); + return -1; +} + +int +eal_memalloc_get_seg_fd(int list_idx __rte_unused, int seg_idx __rte_unused) +{ + return -ENOTSUP; +} + +int +eal_memalloc_set_seg_fd(int list_idx __rte_unused, int seg_idx __rte_unused, + int fd __rte_unused) +{ + return -ENOTSUP; +} + +int +eal_memalloc_set_seg_list_fd(int list_idx __rte_unused, int fd __rte_unused) +{ + return -ENOTSUP; +} + +int +eal_memalloc_get_seg_fd_offset(int list_idx __rte_unused, + int seg_idx __rte_unused, size_t *offset __rte_unused) +{ + return -ENOTSUP; +} + +int +eal_memalloc_init(void) +{ + return 0; +} diff --git 
a/lib/librte_eal/freebsd/eal_memory.c b/lib/librte_eal/freebsd/eal_memory.c new file mode 100644 index 0000000000..a97d8f0f0c --- /dev/null +++ b/lib/librte_eal/freebsd/eal_memory.c @@ -0,0 +1,536 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_internal_cfg.h" +#include "eal_filesystem.h" +#include "eal_memcfg.h" +#include "eal_options.h" + +#define EAL_PAGE_SIZE (sysconf(_SC_PAGESIZE)) + +uint64_t eal_get_baseaddr(void) +{ + /* + * FreeBSD may allocate something in the space we will be mapping things + * before we get a chance to do that, so use a base address that's far + * away from where malloc() et al usually map things. + */ + return 0x1000000000ULL; +} + +/* + * Get physical address of any mapped virtual address in the current process. + */ +phys_addr_t +rte_mem_virt2phy(const void *virtaddr) +{ + /* XXX not implemented. This function is only used by + * rte_mempool_virt2iova() when hugepages are disabled. */ + (void)virtaddr; + return RTE_BAD_IOVA; +} +rte_iova_t +rte_mem_virt2iova(const void *virtaddr) +{ + return rte_mem_virt2phy(virtaddr); +} + +int +rte_eal_hugepage_init(void) +{ + struct rte_mem_config *mcfg; + uint64_t total_mem = 0; + void *addr; + unsigned int i, j, seg_idx = 0; + + /* get pointer to global configuration */ + mcfg = rte_eal_get_configuration()->mem_config; + + /* for debug purposes, hugetlbfs can be disabled */ + if (internal_config.no_hugetlbfs) { + struct rte_memseg_list *msl; + struct rte_fbarray *arr; + struct rte_memseg *ms; + uint64_t page_sz; + int n_segs, cur_seg; + + /* create a memseg list */ + msl = &mcfg->memsegs[0]; + + page_sz = RTE_PGSIZE_4K; + n_segs = internal_config.memory / page_sz; + + if (rte_fbarray_init(&msl->memseg_arr, "nohugemem", n_segs, + sizeof(struct rte_memseg))) { + RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n"); + return -1; + } + + addr = mmap(NULL, internal_config.memory, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__, + strerror(errno)); + return -1; + } + msl->base_va = addr; + msl->page_sz = page_sz; + msl->len = internal_config.memory; + msl->socket_id = 0; + msl->heap = 1; + + /* populate memsegs. 
each memseg is 1 page long */ + for (cur_seg = 0; cur_seg < n_segs; cur_seg++) { + arr = &msl->memseg_arr; + + ms = rte_fbarray_get(arr, cur_seg); + if (rte_eal_iova_mode() == RTE_IOVA_VA) + ms->iova = (uintptr_t)addr; + else + ms->iova = RTE_BAD_IOVA; + ms->addr = addr; + ms->hugepage_sz = page_sz; + ms->len = page_sz; + ms->socket_id = 0; + + rte_fbarray_set_used(arr, cur_seg); + + addr = RTE_PTR_ADD(addr, page_sz); + } + return 0; + } + + /* map all hugepages and sort them */ + for (i = 0; i < internal_config.num_hugepage_sizes; i ++){ + struct hugepage_info *hpi; + rte_iova_t prev_end = 0; + int prev_ms_idx = -1; + uint64_t page_sz, mem_needed; + unsigned int n_pages, max_pages; + + hpi = &internal_config.hugepage_info[i]; + page_sz = hpi->hugepage_sz; + max_pages = hpi->num_pages[0]; + mem_needed = RTE_ALIGN_CEIL(internal_config.memory - total_mem, + page_sz); + + n_pages = RTE_MIN(mem_needed / page_sz, max_pages); + + for (j = 0; j < n_pages; j++) { + struct rte_memseg_list *msl; + struct rte_fbarray *arr; + struct rte_memseg *seg; + int msl_idx, ms_idx; + rte_iova_t physaddr; + int error; + size_t sysctl_size = sizeof(physaddr); + char physaddr_str[64]; + bool is_adjacent; + + /* first, check if this segment is IOVA-adjacent to + * the previous one. + */ + snprintf(physaddr_str, sizeof(physaddr_str), + "hw.contigmem.physaddr.%d", j); + error = sysctlbyname(physaddr_str, &physaddr, + &sysctl_size, NULL, 0); + if (error < 0) { + RTE_LOG(ERR, EAL, "Failed to get physical addr for buffer %u " + "from %s\n", j, hpi->hugedir); + return -1; + } + + is_adjacent = prev_end != 0 && physaddr == prev_end; + prev_end = physaddr + hpi->hugepage_sz; + + for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; + msl_idx++) { + bool empty, need_hole; + msl = &mcfg->memsegs[msl_idx]; + arr = &msl->memseg_arr; + + if (msl->page_sz != page_sz) + continue; + + empty = arr->count == 0; + + /* we need a hole if this isn't an empty memseg + * list, and if previous segment was not + * adjacent to current one. + */ + need_hole = !empty && !is_adjacent; + + /* we need 1, plus hole if not adjacent */ + ms_idx = rte_fbarray_find_next_n_free(arr, + 0, 1 + (need_hole ? 1 : 0)); + + /* memseg list is full? */ + if (ms_idx < 0) + continue; + + if (need_hole && prev_ms_idx == ms_idx - 1) + ms_idx++; + prev_ms_idx = ms_idx; + + break; + } + if (msl_idx == RTE_MAX_MEMSEG_LISTS) { + RTE_LOG(ERR, EAL, "Could not find space for memseg. Please increase %s and/or %s in configuration.\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_PER_TYPE), + RTE_STR(CONFIG_RTE_MAX_MEM_PER_TYPE)); + return -1; + } + arr = &msl->memseg_arr; + seg = rte_fbarray_get(arr, ms_idx); + + addr = RTE_PTR_ADD(msl->base_va, + (size_t)msl->page_sz * ms_idx); + + /* address is already mapped in memseg list, so using + * MAP_FIXED here is safe. 
+ */ + addr = mmap(addr, page_sz, PROT_READ|PROT_WRITE, + MAP_SHARED | MAP_FIXED, + hpi->lock_descriptor, + j * EAL_PAGE_SIZE); + if (addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "Failed to mmap buffer %u from %s\n", + j, hpi->hugedir); + return -1; + } + + seg->addr = addr; + seg->iova = physaddr; + seg->hugepage_sz = page_sz; + seg->len = page_sz; + seg->nchannel = mcfg->nchannel; + seg->nrank = mcfg->nrank; + seg->socket_id = 0; + + rte_fbarray_set_used(arr, ms_idx); + + RTE_LOG(INFO, EAL, "Mapped memory segment %u @ %p: physaddr:0x%" + PRIx64", len %zu\n", + seg_idx++, addr, physaddr, page_sz); + + total_mem += seg->len; + } + if (total_mem >= internal_config.memory) + break; + } + if (total_mem < internal_config.memory) { + RTE_LOG(ERR, EAL, "Couldn't reserve requested memory, " + "requested: %" PRIu64 "M " + "available: %" PRIu64 "M\n", + internal_config.memory >> 20, total_mem >> 20); + return -1; + } + return 0; +} + +struct attach_walk_args { + int fd_hugepage; + int seg_idx; +}; +static int +attach_segment(const struct rte_memseg_list *msl, const struct rte_memseg *ms, + void *arg) +{ + struct attach_walk_args *wa = arg; + void *addr; + + if (msl->external) + return 0; + + addr = mmap(ms->addr, ms->len, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED, wa->fd_hugepage, + wa->seg_idx * EAL_PAGE_SIZE); + if (addr == MAP_FAILED || addr != ms->addr) + return -1; + wa->seg_idx++; + + return 0; +} + +int +rte_eal_hugepage_attach(void) +{ + const struct hugepage_info *hpi; + int fd_hugepage = -1; + unsigned int i; + + hpi = &internal_config.hugepage_info[0]; + + for (i = 0; i < internal_config.num_hugepage_sizes; i++) { + const struct hugepage_info *cur_hpi = &hpi[i]; + struct attach_walk_args wa; + + memset(&wa, 0, sizeof(wa)); + + /* Obtain a file descriptor for contiguous memory */ + fd_hugepage = open(cur_hpi->hugedir, O_RDWR); + if (fd_hugepage < 0) { + RTE_LOG(ERR, EAL, "Could not open %s\n", + cur_hpi->hugedir); + goto error; + } + wa.fd_hugepage = fd_hugepage; + wa.seg_idx = 0; + + /* Map the contiguous memory into each memory segment */ + if (rte_memseg_walk(attach_segment, &wa) < 0) { + RTE_LOG(ERR, EAL, "Failed to mmap buffer %u from %s\n", + wa.seg_idx, cur_hpi->hugedir); + goto error; + } + + close(fd_hugepage); + fd_hugepage = -1; + } + + /* hugepage_info is no longer required */ + return 0; + +error: + if (fd_hugepage >= 0) + close(fd_hugepage); + return -1; +} + +int +rte_eal_using_phys_addrs(void) +{ + return 0; +} + +static uint64_t +get_mem_amount(uint64_t page_sz, uint64_t max_mem) +{ + uint64_t area_sz, max_pages; + + /* limit to RTE_MAX_MEMSEG_PER_LIST pages or RTE_MAX_MEM_MB_PER_LIST */ + max_pages = RTE_MAX_MEMSEG_PER_LIST; + max_mem = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20, max_mem); + + area_sz = RTE_MIN(page_sz * max_pages, max_mem); + + /* make sure the list isn't smaller than the page size */ + area_sz = RTE_MAX(area_sz, page_sz); + + return RTE_ALIGN(area_sz, page_sz); +} + +#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i" +static int +alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz, + int n_segs, int socket_id, int type_msl_idx) +{ + char name[RTE_FBARRAY_NAME_LEN]; + + snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id, + type_msl_idx); + if (rte_fbarray_init(&msl->memseg_arr, name, n_segs, + sizeof(struct rte_memseg))) { + RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n", + rte_strerror(rte_errno)); + return -1; + } + + msl->page_sz = page_sz; + msl->socket_id = socket_id; + msl->base_va = NULL; + + 
RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n", + (size_t)page_sz >> 10, socket_id); + + return 0; +} + +static int +alloc_va_space(struct rte_memseg_list *msl) +{ + uint64_t page_sz; + size_t mem_sz; + void *addr; + int flags = 0; + +#ifdef RTE_ARCH_PPC_64 + flags |= MAP_HUGETLB; +#endif + + page_sz = msl->page_sz; + mem_sz = page_sz * msl->memseg_arr.len; + + addr = eal_get_virtual_area(msl->base_va, &mem_sz, page_sz, 0, flags); + if (addr == NULL) { + if (rte_errno == EADDRNOTAVAIL) + RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - " + "please use '--" OPT_BASE_VIRTADDR "' option\n", + (unsigned long long)mem_sz, msl->base_va); + else + RTE_LOG(ERR, EAL, "Cannot reserve memory\n"); + return -1; + } + msl->base_va = addr; + msl->len = mem_sz; + + return 0; +} + + +static int +memseg_primary_init(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int hpi_idx, msl_idx = 0; + struct rte_memseg_list *msl; + uint64_t max_mem, total_mem; + + /* no-huge does not need this at all */ + if (internal_config.no_hugetlbfs) + return 0; + + /* FreeBSD has an issue where core dump will dump the entire memory + * contents, including anonymous zero-page memory. Therefore, while we + * will be limiting total amount of memory to RTE_MAX_MEM_MB, we will + * also be further limiting total memory amount to whatever memory is + * available to us through contigmem driver (plus spacing blocks). + * + * so, at each stage, we will be checking how much memory we are + * preallocating, and adjust all the values accordingly. + */ + + max_mem = (uint64_t)RTE_MAX_MEM_MB << 20; + total_mem = 0; + + /* create memseg lists */ + for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes; + hpi_idx++) { + uint64_t max_type_mem, total_type_mem = 0; + uint64_t avail_mem; + int type_msl_idx, max_segs, avail_segs, total_segs = 0; + struct hugepage_info *hpi; + uint64_t hugepage_sz; + + hpi = &internal_config.hugepage_info[hpi_idx]; + hugepage_sz = hpi->hugepage_sz; + + /* no NUMA support on FreeBSD */ + + /* check if we've already exceeded total memory amount */ + if (total_mem >= max_mem) + break; + + /* first, calculate theoretical limits according to config */ + max_type_mem = RTE_MIN(max_mem - total_mem, + (uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20); + max_segs = RTE_MAX_MEMSEG_PER_TYPE; + + /* now, limit all of that to whatever will actually be + * available to us, because without dynamic allocation support, + * all of that extra memory will be sitting there being useless + * and slowing down core dumps in case of a crash. + * + * we need (N*2)-1 segments because we cannot guarantee that + * each segment will be IOVA-contiguous with the previous one, + * so we will allocate more and put spaces inbetween segments + * that are non-contiguous. 
+ */ + avail_segs = (hpi->num_pages[0] * 2) - 1; + avail_mem = avail_segs * hugepage_sz; + + max_type_mem = RTE_MIN(avail_mem, max_type_mem); + max_segs = RTE_MIN(avail_segs, max_segs); + + type_msl_idx = 0; + while (total_type_mem < max_type_mem && + total_segs < max_segs) { + uint64_t cur_max_mem, cur_mem; + unsigned int n_segs; + + if (msl_idx >= RTE_MAX_MEMSEG_LISTS) { + RTE_LOG(ERR, EAL, + "No more space in memseg lists, please increase %s\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); + return -1; + } + + msl = &mcfg->memsegs[msl_idx++]; + + cur_max_mem = max_type_mem - total_type_mem; + + cur_mem = get_mem_amount(hugepage_sz, + cur_max_mem); + n_segs = cur_mem / hugepage_sz; + + if (alloc_memseg_list(msl, hugepage_sz, n_segs, + 0, type_msl_idx)) + return -1; + + total_segs += msl->memseg_arr.len; + total_type_mem = total_segs * hugepage_sz; + type_msl_idx++; + + if (alloc_va_space(msl)) { + RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n"); + return -1; + } + } + total_mem += total_type_mem; + } + return 0; +} + +static int +memseg_secondary_init(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int msl_idx = 0; + struct rte_memseg_list *msl; + + for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) { + + msl = &mcfg->memsegs[msl_idx]; + + /* skip empty memseg lists */ + if (msl->memseg_arr.len == 0) + continue; + + if (rte_fbarray_attach(&msl->memseg_arr)) { + RTE_LOG(ERR, EAL, "Cannot attach to primary process memseg lists\n"); + return -1; + } + + /* preallocate VA space */ + if (alloc_va_space(msl)) { + RTE_LOG(ERR, EAL, "Cannot preallocate VA space for hugepage memory\n"); + return -1; + } + } + + return 0; +} + +int +rte_eal_memseg_init(void) +{ + return rte_eal_process_type() == RTE_PROC_PRIMARY ? + memseg_primary_init() : + memseg_secondary_init(); +} diff --git a/lib/librte_eal/freebsd/eal_thread.c b/lib/librte_eal/freebsd/eal_thread.c new file mode 100644 index 0000000000..309b587266 --- /dev/null +++ b/lib/librte_eal/freebsd/eal_thread.c @@ -0,0 +1,177 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_thread.h" + +RTE_DEFINE_PER_LCORE(unsigned, _lcore_id) = LCORE_ID_ANY; +RTE_DEFINE_PER_LCORE(unsigned, _socket_id) = (unsigned)SOCKET_ID_ANY; +RTE_DEFINE_PER_LCORE(rte_cpuset_t, _cpuset); + +/* + * Send a message to a slave lcore identified by slave_id to call a + * function f with argument arg. Once the execution is done, the + * remote lcore switch in FINISHED state. 
+ */ +int +rte_eal_remote_launch(int (*f)(void *), void *arg, unsigned slave_id) +{ + int n; + char c = 0; + int m2s = lcore_config[slave_id].pipe_master2slave[1]; + int s2m = lcore_config[slave_id].pipe_slave2master[0]; + + if (lcore_config[slave_id].state != WAIT) + return -EBUSY; + + lcore_config[slave_id].f = f; + lcore_config[slave_id].arg = arg; + + /* send message */ + n = 0; + while (n == 0 || (n < 0 && errno == EINTR)) + n = write(m2s, &c, 1); + if (n < 0) + rte_panic("cannot write on configuration pipe\n"); + + /* wait ack */ + do { + n = read(s2m, &c, 1); + } while (n < 0 && errno == EINTR); + + if (n <= 0) + rte_panic("cannot read on configuration pipe\n"); + + return 0; +} + +/* set affinity for current thread */ +static int +eal_thread_set_affinity(void) +{ + unsigned lcore_id = rte_lcore_id(); + + /* acquire system unique id */ + rte_gettid(); + + /* update EAL thread core affinity */ + return rte_thread_set_affinity(&lcore_config[lcore_id].cpuset); +} + +void eal_thread_init_master(unsigned lcore_id) +{ + /* set the lcore ID in per-lcore memory area */ + RTE_PER_LCORE(_lcore_id) = lcore_id; + + /* set CPU affinity */ + if (eal_thread_set_affinity() < 0) + rte_panic("cannot set affinity\n"); +} + +/* main loop of threads */ +__attribute__((noreturn)) void * +eal_thread_loop(__attribute__((unused)) void *arg) +{ + char c; + int n, ret; + unsigned lcore_id; + pthread_t thread_id; + int m2s, s2m; + char cpuset[RTE_CPU_AFFINITY_STR_LEN]; + + thread_id = pthread_self(); + + /* retrieve our lcore_id from the configuration structure */ + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + if (thread_id == lcore_config[lcore_id].thread_id) + break; + } + if (lcore_id == RTE_MAX_LCORE) + rte_panic("cannot retrieve lcore id\n"); + + m2s = lcore_config[lcore_id].pipe_master2slave[0]; + s2m = lcore_config[lcore_id].pipe_slave2master[1]; + + /* set the lcore ID in per-lcore memory area */ + RTE_PER_LCORE(_lcore_id) = lcore_id; + + /* set CPU affinity */ + if (eal_thread_set_affinity() < 0) + rte_panic("cannot set affinity\n"); + + ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset)); + + RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%p;cpuset=[%s%s])\n", + lcore_id, thread_id, cpuset, ret == 0 ? 
"" : "..."); + + /* read on our pipe to get commands */ + while (1) { + void *fct_arg; + + /* wait command */ + do { + n = read(m2s, &c, 1); + } while (n < 0 && errno == EINTR); + + if (n <= 0) + rte_panic("cannot read on configuration pipe\n"); + + lcore_config[lcore_id].state = RUNNING; + + /* send ack */ + n = 0; + while (n == 0 || (n < 0 && errno == EINTR)) + n = write(s2m, &c, 1); + if (n < 0) + rte_panic("cannot write on configuration pipe\n"); + + if (lcore_config[lcore_id].f == NULL) + rte_panic("NULL function pointer\n"); + + /* call the function and store the return value */ + fct_arg = lcore_config[lcore_id].arg; + ret = lcore_config[lcore_id].f(fct_arg); + lcore_config[lcore_id].ret = ret; + rte_wmb(); + lcore_config[lcore_id].state = FINISHED; + } + + /* never reached */ + /* pthread_exit(NULL); */ + /* return NULL; */ +} + +/* require calling thread tid by gettid() */ +int rte_sys_gettid(void) +{ + long lwpid; + thr_self(&lwpid); + return (int)lwpid; +} + +int rte_thread_setname(pthread_t id, const char *name) +{ + /* this BSD function returns no error */ + pthread_set_name_np(id, name); + return 0; +} diff --git a/lib/librte_eal/freebsd/eal_timer.c b/lib/librte_eal/freebsd/eal_timer.c new file mode 100644 index 0000000000..beff755a47 --- /dev/null +++ b/lib/librte_eal/freebsd/eal_timer.c @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_internal_cfg.h" + +#ifdef RTE_LIBEAL_USE_HPET +#warning HPET is not supported in FreeBSD +#endif + +enum timer_source eal_timer_source = EAL_TIMER_TSC; + +uint64_t +get_tsc_freq(void) +{ + size_t sz; + int tmp; + uint64_t tsc_hz; + + sz = sizeof(tmp); + tmp = 0; + + if (sysctlbyname("kern.timecounter.smp_tsc", &tmp, &sz, NULL, 0)) + RTE_LOG(WARNING, EAL, "%s\n", strerror(errno)); + else if (tmp != 1) + RTE_LOG(WARNING, EAL, "TSC is not safe to use in SMP mode\n"); + + tmp = 0; + + if (sysctlbyname("kern.timecounter.invariant_tsc", &tmp, &sz, NULL, 0)) + RTE_LOG(WARNING, EAL, "%s\n", strerror(errno)); + else if (tmp != 1) + RTE_LOG(WARNING, EAL, "TSC is not invariant\n"); + + sz = sizeof(tsc_hz); + if (sysctlbyname("machdep.tsc_freq", &tsc_hz, &sz, NULL, 0)) { + RTE_LOG(WARNING, EAL, "%s\n", strerror(errno)); + return 0; + } + + return tsc_hz; +} + +int +rte_eal_timer_init(void) +{ + set_tsc_freq(); + return 0; +} diff --git a/lib/librte_eal/freebsd/include/meson.build b/lib/librte_eal/freebsd/include/meson.build new file mode 100644 index 0000000000..7d18dd52f1 --- /dev/null +++ b/lib/librte_eal/freebsd/include/meson.build @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2020 Mellanox Technologies, Ltd + +includes += include_directories('.') + +headers += files( + 'rte_os.h', +) diff --git a/lib/librte_eal/freebsd/include/rte_os.h b/lib/librte_eal/freebsd/include/rte_os.h new file mode 100644 index 0000000000..eeb750cd81 --- /dev/null +++ b/lib/librte_eal/freebsd/include/rte_os.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2019 Intel Corporation + */ + +#ifndef _RTE_OS_H_ +#define _RTE_OS_H_ + +/** + * This is header should contain any function/macro definition + * which are not supported natively or named differently in the + * freebsd OS. Functions will be added in future releases. 
+ */ + +#include + +typedef cpuset_t rte_cpuset_t; +#define RTE_CPU_AND(dst, src1, src2) do \ +{ \ + cpuset_t tmp; \ + CPU_COPY(src1, &tmp); \ + CPU_AND(&tmp, src2); \ + CPU_COPY(&tmp, dst); \ +} while (0) +#define RTE_CPU_OR(dst, src1, src2) do \ +{ \ + cpuset_t tmp; \ + CPU_COPY(src1, &tmp); \ + CPU_OR(&tmp, src2); \ + CPU_COPY(&tmp, dst); \ +} while (0) +#define RTE_CPU_FILL(set) CPU_FILL(set) + +/* In FreeBSD 13 CPU_NAND macro is CPU_ANDNOT */ +#ifdef CPU_NAND +#define RTE_CPU_NOT(dst, src) do \ +{ \ + cpuset_t tmp; \ + CPU_FILL(&tmp); \ + CPU_NAND(&tmp, src); \ + CPU_COPY(&tmp, dst); \ +} while (0) +#else +#define RTE_CPU_NOT(dst, src) do \ +{ \ + cpuset_t tmp; \ + CPU_FILL(&tmp); \ + CPU_ANDNOT(&tmp, src); \ + CPU_COPY(&tmp, dst); \ +} while (0) +#endif + +#endif /* _RTE_OS_H_ */ diff --git a/lib/librte_eal/freebsd/meson.build b/lib/librte_eal/freebsd/meson.build new file mode 100644 index 0000000000..5e6afd9d34 --- /dev/null +++ b/lib/librte_eal/freebsd/meson.build @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +subdir('include') + +sources += files('eal_alarm.c', + 'eal_cpuflags.c', + 'eal_debug.c', + 'eal_hugepage_info.c', + 'eal_interrupts.c', + 'eal_lcore.c', + 'eal_memalloc.c', + 'eal_thread.c', + 'eal_timer.c', + 'eal.c', + 'eal_memory.c', + 'eal_dev.c' +) + +deps += ['kvargs'] diff --git a/lib/librte_eal/linux/Makefile b/lib/librte_eal/linux/Makefile new file mode 100644 index 0000000000..82c3fc570a --- /dev/null +++ b/lib/librte_eal/linux/Makefile @@ -0,0 +1,101 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2010-2019 Intel Corporation + +include $(RTE_SDK)/mk/rte.vars.mk + +LIB = librte_eal.a + +ARCH_DIR ?= $(RTE_ARCH) + +EXPORT_MAP := ../rte_eal_version.map +VPATH += $(RTE_SDK)/lib/librte_eal/$(ARCH_DIR) + +VPATH += $(RTE_SDK)/lib/librte_eal/common + +CFLAGS += -DALLOW_EXPERIMENTAL_API +CFLAGS += -I$(SRCDIR)/include +CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common +CFLAGS += -I$(RTE_SDK)/lib/librte_eal/include +CFLAGS += $(WERROR_FLAGS) -O3 + +LDLIBS += -ldl +LDLIBS += -lpthread +LDLIBS += -lgcc_s +LDLIBS += -lrt +LDLIBS += -lrte_kvargs +ifeq ($(CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES),y) +LDLIBS += -lnuma +endif + +# specific to linux exec-env +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) := eal.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_cpuflags.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_hugepage_info.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_memory.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_thread.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_log.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_vfio.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_vfio_mp_sync.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_memalloc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_debug.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_lcore.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_timer.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_interrupts.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_alarm.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_dev.c + +# from common dir +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_lcore.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_timer.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_memzone.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_log.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_launch.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_mcfg.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_memalloc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_memory.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += 
eal_common_tailqs.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_errno.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_cpuflags.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_hypervisor.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_string_fns.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_hexdump.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_devargs.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_class.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_bus.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_dev.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_options.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_thread.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_proc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_fbarray.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_uuid.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_malloc.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += hotplug_mp.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += malloc_elem.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += malloc_heap.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += malloc_mp.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_keepalive.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_option.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_service.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_random.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_reciprocal.c + +# from arch dir +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_cpuflags.c +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_hypervisor.c +SRCS-$(CONFIG_RTE_ARCH_X86) += rte_spinlock.c +SRCS-y += rte_cycles.c + +CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST) + +# workaround for a gcc bug with noreturn attribute +# http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603 +ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y) +CFLAGS_eal_thread.o += -Wno-return-type +endif + +INC := rte_kni_common.h +INC += rte_os.h + +SYMLINK-$(CONFIG_RTE_EXEC_ENV_LINUX)-include := $(addprefix include/,$(INC)) + +include $(RTE_SDK)/mk/rte.lib.mk diff --git a/lib/librte_eal/linux/eal.c b/lib/librte_eal/linux/eal.c new file mode 100644 index 0000000000..9530ee55f8 --- /dev/null +++ b/lib/librte_eal/linux/eal.c @@ -0,0 +1,1393 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation. + * Copyright(c) 2012-2014 6WIND S.A. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(RTE_ARCH_X86) +#include +#endif +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_thread.h" +#include "eal_internal_cfg.h" +#include "eal_filesystem.h" +#include "eal_hugepages.h" +#include "eal_memcfg.h" +#include "eal_options.h" +#include "eal_vfio.h" +#include "hotplug_mp.h" + +#define MEMSIZE_IF_NO_HUGE_PAGE (64ULL * 1024ULL * 1024ULL) + +#define SOCKET_MEM_STRLEN (RTE_MAX_NUMA_NODES * 10) + +#define KERNEL_IOMMU_GROUPS_PATH "/sys/kernel/iommu_groups" + +/* Allow the application to print its usage message too if set */ +static rte_usage_hook_t rte_application_usage_hook = NULL; + +/* early configuration structure, when memory config is not mmapped */ +static struct rte_mem_config early_mem_config; + +/* define fd variable here, because file needs to be kept open for the + * duration of the program, as we hold a write lock on it in the primary proc */ +static int mem_cfg_fd = -1; + +static struct flock wr_lock = { + .l_type = F_WRLCK, + .l_whence = SEEK_SET, + .l_start = offsetof(struct rte_mem_config, memsegs), + .l_len = sizeof(early_mem_config.memsegs), +}; + +/* Address of global and public configuration */ +static struct rte_config rte_config = { + .mem_config = &early_mem_config, +}; + +/* internal configuration (per-core) */ +struct lcore_config lcore_config[RTE_MAX_LCORE]; + +/* internal configuration */ +struct internal_config internal_config; + +/* used by rte_rdtsc() */ +int rte_cycles_vmware_tsc_map; + +/* platform-specific runtime dir */ +static char runtime_dir[PATH_MAX]; + +static const char *default_runtime_dir = "/var/run"; + +int +eal_create_runtime_dir(void) +{ + const char *directory = default_runtime_dir; + const char *xdg_runtime_dir = getenv("XDG_RUNTIME_DIR"); + const char *fallback = "/tmp"; + char tmp[PATH_MAX]; + int ret; + + if (getuid() != 0) { + /* try XDG path first, fall back to /tmp */ + if (xdg_runtime_dir != NULL) + directory = xdg_runtime_dir; + else + directory = fallback; + } + /* create DPDK subdirectory under runtime dir */ + ret = snprintf(tmp, sizeof(tmp), "%s/dpdk", directory); + if (ret < 0 || ret == sizeof(tmp)) { + RTE_LOG(ERR, EAL, "Error creating DPDK runtime path name\n"); + return -1; + } + + /* create prefix-specific subdirectory under DPDK runtime dir */ + ret = snprintf(runtime_dir, sizeof(runtime_dir), "%s/%s", + tmp, eal_get_hugefile_prefix()); + if (ret < 0 || ret == sizeof(runtime_dir)) { + RTE_LOG(ERR, EAL, "Error creating prefix-specific runtime path name\n"); + return -1; + } + + /* create the path if it doesn't exist. no "mkdir -p" here, so do it + * step by step. 
+ */ + ret = mkdir(tmp, 0700); + if (ret < 0 && errno != EEXIST) { + RTE_LOG(ERR, EAL, "Error creating '%s': %s\n", + tmp, strerror(errno)); + return -1; + } + + ret = mkdir(runtime_dir, 0700); + if (ret < 0 && errno != EEXIST) { + RTE_LOG(ERR, EAL, "Error creating '%s': %s\n", + runtime_dir, strerror(errno)); + return -1; + } + + return 0; +} + +int +eal_clean_runtime_dir(void) +{ + DIR *dir; + struct dirent *dirent; + int dir_fd, fd, lck_result; + static const char * const filters[] = { + "fbarray_*", + "mp_socket_*" + }; + + /* open directory */ + dir = opendir(runtime_dir); + if (!dir) { + RTE_LOG(ERR, EAL, "Unable to open runtime directory %s\n", + runtime_dir); + goto error; + } + dir_fd = dirfd(dir); + + /* lock the directory before doing anything, to avoid races */ + if (flock(dir_fd, LOCK_EX) < 0) { + RTE_LOG(ERR, EAL, "Unable to lock runtime directory %s\n", + runtime_dir); + goto error; + } + + dirent = readdir(dir); + if (!dirent) { + RTE_LOG(ERR, EAL, "Unable to read runtime directory %s\n", + runtime_dir); + goto error; + } + + while (dirent != NULL) { + unsigned int f_idx; + bool skip = true; + + /* skip files that don't match the patterns */ + for (f_idx = 0; f_idx < RTE_DIM(filters); f_idx++) { + const char *filter = filters[f_idx]; + + if (fnmatch(filter, dirent->d_name, 0) == 0) { + skip = false; + break; + } + } + if (skip) { + dirent = readdir(dir); + continue; + } + + /* try and lock the file */ + fd = openat(dir_fd, dirent->d_name, O_RDONLY); + + /* skip to next file */ + if (fd == -1) { + dirent = readdir(dir); + continue; + } + + /* non-blocking lock */ + lck_result = flock(fd, LOCK_EX | LOCK_NB); + + /* if lock succeeds, remove the file */ + if (lck_result != -1) + unlinkat(dir_fd, dirent->d_name, 0); + close(fd); + dirent = readdir(dir); + } + + /* closedir closes dir_fd and drops the lock */ + closedir(dir); + return 0; + +error: + if (dir) + closedir(dir); + + RTE_LOG(ERR, EAL, "Error while clearing runtime dir: %s\n", + strerror(errno)); + + return -1; +} + +const char * +rte_eal_get_runtime_dir(void) +{ + return runtime_dir; +} + +/* Return user provided mbuf pool ops name */ +const char * +rte_eal_mbuf_user_pool_ops(void) +{ + return internal_config.user_mbuf_pool_ops_name; +} + +/* Return a pointer to the configuration structure */ +struct rte_config * +rte_eal_get_configuration(void) +{ + return &rte_config; +} + +enum rte_iova_mode +rte_eal_iova_mode(void) +{ + return rte_eal_get_configuration()->iova_mode; +} + +/* parse a sysfs (or other) file containing one integer value */ +int +eal_parse_sysfs_value(const char *filename, unsigned long *val) +{ + FILE *f; + char buf[BUFSIZ]; + char *end = NULL; + + if ((f = fopen(filename, "r")) == NULL) { + RTE_LOG(ERR, EAL, "%s(): cannot open sysfs value %s\n", + __func__, filename); + return -1; + } + + if (fgets(buf, sizeof(buf), f) == NULL) { + RTE_LOG(ERR, EAL, "%s(): cannot read sysfs value %s\n", + __func__, filename); + fclose(f); + return -1; + } + *val = strtoul(buf, &end, 0); + if ((buf[0] == '\0') || (end == NULL) || (*end != '\n')) { + RTE_LOG(ERR, EAL, "%s(): cannot parse sysfs value %s\n", + __func__, filename); + fclose(f); + return -1; + } + fclose(f); + return 0; +} + + +/* create memory configuration in shared/mmap memory. Take out + * a write lock on the memsegs, so we can auto-detect primary/secondary. + * This means we never close the file while running (auto-close on exit). + * We also don't lock the whole file, so that in future we can use read-locks + * on other parts, e.g. 
memzones, to detect if there are running secondary + * processes. */ +static int +rte_eal_config_create(void) +{ + size_t page_sz = sysconf(_SC_PAGE_SIZE); + size_t cfg_len = sizeof(*rte_config.mem_config); + size_t cfg_len_aligned = RTE_ALIGN(cfg_len, page_sz); + void *rte_mem_cfg_addr, *mapped_mem_cfg_addr; + int retval; + + const char *pathname = eal_runtime_config_path(); + + if (internal_config.no_shconf) + return 0; + + /* map the config before hugepage address so that we don't waste a page */ + if (internal_config.base_virtaddr != 0) + rte_mem_cfg_addr = (void *) + RTE_ALIGN_FLOOR(internal_config.base_virtaddr - + sizeof(struct rte_mem_config), page_sz); + else + rte_mem_cfg_addr = NULL; + + if (mem_cfg_fd < 0){ + mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0600); + if (mem_cfg_fd < 0) { + RTE_LOG(ERR, EAL, "Cannot open '%s' for rte_mem_config\n", + pathname); + return -1; + } + } + + retval = ftruncate(mem_cfg_fd, cfg_len); + if (retval < 0){ + close(mem_cfg_fd); + mem_cfg_fd = -1; + RTE_LOG(ERR, EAL, "Cannot resize '%s' for rte_mem_config\n", + pathname); + return -1; + } + + retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock); + if (retval < 0){ + close(mem_cfg_fd); + mem_cfg_fd = -1; + RTE_LOG(ERR, EAL, "Cannot create lock on '%s'. Is another primary " + "process running?\n", pathname); + return -1; + } + + /* reserve space for config */ + rte_mem_cfg_addr = eal_get_virtual_area(rte_mem_cfg_addr, + &cfg_len_aligned, page_sz, 0, 0); + if (rte_mem_cfg_addr == NULL) { + RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config\n"); + close(mem_cfg_fd); + mem_cfg_fd = -1; + return -1; + } + + /* remap the actual file into the space we've just reserved */ + mapped_mem_cfg_addr = mmap(rte_mem_cfg_addr, + cfg_len_aligned, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED, mem_cfg_fd, 0); + if (mapped_mem_cfg_addr == MAP_FAILED) { + munmap(rte_mem_cfg_addr, cfg_len); + close(mem_cfg_fd); + mem_cfg_fd = -1; + RTE_LOG(ERR, EAL, "Cannot remap memory for rte_config\n"); + return -1; + } + + memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config)); + rte_config.mem_config = rte_mem_cfg_addr; + + /* store address of the config in the config itself so that secondary + * processes could later map the config into this exact location */ + rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr; + + rte_config.mem_config->dma_maskbits = 0; + + return 0; +} + +/* attach to an existing shared memory config */ +static int +rte_eal_config_attach(void) +{ + struct rte_mem_config *mem_config; + + const char *pathname = eal_runtime_config_path(); + + if (internal_config.no_shconf) + return 0; + + if (mem_cfg_fd < 0){ + mem_cfg_fd = open(pathname, O_RDWR); + if (mem_cfg_fd < 0) { + RTE_LOG(ERR, EAL, "Cannot open '%s' for rte_mem_config\n", + pathname); + return -1; + } + } + + /* map it as read-only first */ + mem_config = (struct rte_mem_config *) mmap(NULL, sizeof(*mem_config), + PROT_READ, MAP_SHARED, mem_cfg_fd, 0); + if (mem_config == MAP_FAILED) { + close(mem_cfg_fd); + mem_cfg_fd = -1; + RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config! 
error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + + rte_config.mem_config = mem_config; + + return 0; +} + +/* reattach the shared config at exact memory location primary process has it */ +static int +rte_eal_config_reattach(void) +{ + struct rte_mem_config *mem_config; + void *rte_mem_cfg_addr; + + if (internal_config.no_shconf) + return 0; + + /* save the address primary process has mapped shared config to */ + rte_mem_cfg_addr = (void *) (uintptr_t) rte_config.mem_config->mem_cfg_addr; + + /* unmap original config */ + munmap(rte_config.mem_config, sizeof(struct rte_mem_config)); + + /* remap the config at proper address */ + mem_config = (struct rte_mem_config *) mmap(rte_mem_cfg_addr, + sizeof(*mem_config), PROT_READ | PROT_WRITE, MAP_SHARED, + mem_cfg_fd, 0); + + close(mem_cfg_fd); + mem_cfg_fd = -1; + + if (mem_config == MAP_FAILED || mem_config != rte_mem_cfg_addr) { + if (mem_config != MAP_FAILED) { + /* errno is stale, don't use */ + RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config at [%p], got [%p]" + " - please use '--" OPT_BASE_VIRTADDR + "' option\n", rte_mem_cfg_addr, mem_config); + munmap(mem_config, sizeof(struct rte_mem_config)); + return -1; + } + RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config! error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + + rte_config.mem_config = mem_config; + + return 0; +} + +/* Detect if we are a primary or a secondary process */ +enum rte_proc_type_t +eal_proc_type_detect(void) +{ + enum rte_proc_type_t ptype = RTE_PROC_PRIMARY; + const char *pathname = eal_runtime_config_path(); + + /* if there no shared config, there can be no secondary processes */ + if (!internal_config.no_shconf) { + /* if we can open the file but not get a write-lock we are a + * secondary process. NOTE: if we get a file handle back, we + * keep that open and don't close it to prevent a race condition + * between multiple opens. + */ + if (((mem_cfg_fd = open(pathname, O_RDWR)) >= 0) && + (fcntl(mem_cfg_fd, F_SETLK, &wr_lock) < 0)) + ptype = RTE_PROC_SECONDARY; + } + + RTE_LOG(INFO, EAL, "Auto-detected process type: %s\n", + ptype == RTE_PROC_PRIMARY ? 
"PRIMARY" : "SECONDARY"); + + return ptype; +} + +/* Sets up rte_config structure with the pointer to shared memory config.*/ +static int +rte_config_init(void) +{ + rte_config.process_type = internal_config.process_type; + + switch (rte_config.process_type){ + case RTE_PROC_PRIMARY: + if (rte_eal_config_create() < 0) + return -1; + eal_mcfg_update_from_internal(); + break; + case RTE_PROC_SECONDARY: + if (rte_eal_config_attach() < 0) + return -1; + eal_mcfg_wait_complete(); + if (eal_mcfg_check_version() < 0) { + RTE_LOG(ERR, EAL, "Primary and secondary process DPDK version mismatch\n"); + return -1; + } + if (rte_eal_config_reattach() < 0) + return -1; + eal_mcfg_update_internal(); + break; + case RTE_PROC_AUTO: + case RTE_PROC_INVALID: + RTE_LOG(ERR, EAL, "Invalid process type %d\n", + rte_config.process_type); + return -1; + } + + return 0; +} + +/* Unlocks hugepage directories that were locked by eal_hugepage_info_init */ +static void +eal_hugedirs_unlock(void) +{ + int i; + + for (i = 0; i < MAX_HUGEPAGE_SIZES; i++) + { + /* skip uninitialized */ + if (internal_config.hugepage_info[i].lock_descriptor < 0) + continue; + /* unlock hugepage file */ + flock(internal_config.hugepage_info[i].lock_descriptor, LOCK_UN); + close(internal_config.hugepage_info[i].lock_descriptor); + /* reset the field */ + internal_config.hugepage_info[i].lock_descriptor = -1; + } +} + +/* display usage */ +static void +eal_usage(const char *prgname) +{ + printf("\nUsage: %s ", prgname); + eal_common_usage(); + printf("EAL Linux options:\n" + " --"OPT_SOCKET_MEM" Memory to allocate on sockets (comma separated values)\n" + " --"OPT_SOCKET_LIMIT" Limit memory allocation on sockets (comma separated values)\n" + " --"OPT_HUGE_DIR" Directory where hugetlbfs is mounted\n" + " --"OPT_FILE_PREFIX" Prefix for hugepage filenames\n" + " --"OPT_CREATE_UIO_DEV" Create /dev/uioX (usually done by hotplug)\n" + " --"OPT_VFIO_INTR" Interrupt mode for VFIO (legacy|msi|msix)\n" + " --"OPT_LEGACY_MEM" Legacy memory mode (no dynamic allocation, contiguous segments)\n" + " --"OPT_SINGLE_FILE_SEGMENTS" Put all hugepage memory in single files\n" + " --"OPT_MATCH_ALLOCATIONS" Free hugepages exactly as allocated\n" + "\n"); + /* Allow the application to print its usage message too if hook is set */ + if ( rte_application_usage_hook ) { + printf("===== Application Usage =====\n\n"); + rte_application_usage_hook(prgname); + } +} + +/* Set a per-application usage message */ +rte_usage_hook_t +rte_set_application_usage_hook( rte_usage_hook_t usage_func ) +{ + rte_usage_hook_t old_func; + + /* Will be NULL on the first call to denote the last usage routine. 
*/ + old_func = rte_application_usage_hook; + rte_application_usage_hook = usage_func; + + return old_func; +} + +static int +eal_parse_socket_arg(char *strval, volatile uint64_t *socket_arg) +{ + char * arg[RTE_MAX_NUMA_NODES]; + char *end; + int arg_num, i, len; + uint64_t total_mem = 0; + + len = strnlen(strval, SOCKET_MEM_STRLEN); + if (len == SOCKET_MEM_STRLEN) { + RTE_LOG(ERR, EAL, "--socket-mem is too long\n"); + return -1; + } + + /* all other error cases will be caught later */ + if (!isdigit(strval[len-1])) + return -1; + + /* split the optarg into separate socket values */ + arg_num = rte_strsplit(strval, len, + arg, RTE_MAX_NUMA_NODES, ','); + + /* if split failed, or 0 arguments */ + if (arg_num <= 0) + return -1; + + /* parse each defined socket option */ + errno = 0; + for (i = 0; i < arg_num; i++) { + uint64_t val; + end = NULL; + val = strtoull(arg[i], &end, 10); + + /* check for invalid input */ + if ((errno != 0) || + (arg[i][0] == '\0') || (end == NULL) || (*end != '\0')) + return -1; + val <<= 20; + total_mem += val; + socket_arg[i] = val; + } + + return 0; +} + +static int +eal_parse_vfio_intr(const char *mode) +{ + unsigned i; + static struct { + const char *name; + enum rte_intr_mode value; + } map[] = { + { "legacy", RTE_INTR_MODE_LEGACY }, + { "msi", RTE_INTR_MODE_MSI }, + { "msix", RTE_INTR_MODE_MSIX }, + }; + + for (i = 0; i < RTE_DIM(map); i++) { + if (!strcmp(mode, map[i].name)) { + internal_config.vfio_intr_mode = map[i].value; + return 0; + } + } + return -1; +} + +/* Parse the arguments for --log-level only */ +static void +eal_log_level_parse(int argc, char **argv) +{ + int opt; + char **argvopt; + int option_index; + const int old_optind = optind; + const int old_optopt = optopt; + char * const old_optarg = optarg; + + argvopt = argv; + optind = 1; + + while ((opt = getopt_long(argc, argvopt, eal_short_options, + eal_long_options, &option_index)) != EOF) { + + int ret; + + /* getopt is not happy, stop right now */ + if (opt == '?') + break; + + ret = (opt == OPT_LOG_LEVEL_NUM) ? 
+ eal_parse_common_option(opt, optarg, &internal_config) : 0; + + /* common parser is not happy */ + if (ret < 0) + break; + } + + /* restore getopt lib */ + optind = old_optind; + optopt = old_optopt; + optarg = old_optarg; +} + +/* Parse the argument given in the command line of the application */ +static int +eal_parse_args(int argc, char **argv) +{ + int opt, ret; + char **argvopt; + int option_index; + char *prgname = argv[0]; + const int old_optind = optind; + const int old_optopt = optopt; + char * const old_optarg = optarg; + + argvopt = argv; + optind = 1; + opterr = 0; + + while ((opt = getopt_long(argc, argvopt, eal_short_options, + eal_long_options, &option_index)) != EOF) { + + /* + * getopt didn't recognise the option, lets parse the + * registered options to see if the flag is valid + */ + if (opt == '?') { + ret = rte_option_parse(argv[optind-1]); + if (ret == 0) + continue; + + eal_usage(prgname); + ret = -1; + goto out; + } + + ret = eal_parse_common_option(opt, optarg, &internal_config); + /* common parser is not happy */ + if (ret < 0) { + eal_usage(prgname); + ret = -1; + goto out; + } + /* common parser handled this option */ + if (ret == 0) + continue; + + switch (opt) { + case 'h': + eal_usage(prgname); + exit(EXIT_SUCCESS); + + case OPT_HUGE_DIR_NUM: + { + char *hdir = strdup(optarg); + if (hdir == NULL) + RTE_LOG(ERR, EAL, "Could not store hugepage directory\n"); + else { + /* free old hugepage dir */ + if (internal_config.hugepage_dir != NULL) + free(internal_config.hugepage_dir); + internal_config.hugepage_dir = hdir; + } + break; + } + case OPT_FILE_PREFIX_NUM: + { + char *prefix = strdup(optarg); + if (prefix == NULL) + RTE_LOG(ERR, EAL, "Could not store file prefix\n"); + else { + /* free old prefix */ + if (internal_config.hugefile_prefix != NULL) + free(internal_config.hugefile_prefix); + internal_config.hugefile_prefix = prefix; + } + break; + } + case OPT_SOCKET_MEM_NUM: + if (eal_parse_socket_arg(optarg, + internal_config.socket_mem) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_SOCKET_MEM "\n"); + eal_usage(prgname); + ret = -1; + goto out; + } + internal_config.force_sockets = 1; + break; + + case OPT_SOCKET_LIMIT_NUM: + if (eal_parse_socket_arg(optarg, + internal_config.socket_limit) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_SOCKET_LIMIT "\n"); + eal_usage(prgname); + ret = -1; + goto out; + } + internal_config.force_socket_limits = 1; + break; + + case OPT_VFIO_INTR_NUM: + if (eal_parse_vfio_intr(optarg) < 0) { + RTE_LOG(ERR, EAL, "invalid parameters for --" + OPT_VFIO_INTR "\n"); + eal_usage(prgname); + ret = -1; + goto out; + } + break; + + case OPT_CREATE_UIO_DEV_NUM: + internal_config.create_uio_dev = 1; + break; + + case OPT_MBUF_POOL_OPS_NAME_NUM: + { + char *ops_name = strdup(optarg); + if (ops_name == NULL) + RTE_LOG(ERR, EAL, "Could not store mbuf pool ops name\n"); + else { + /* free old ops name */ + if (internal_config.user_mbuf_pool_ops_name != + NULL) + free(internal_config.user_mbuf_pool_ops_name); + + internal_config.user_mbuf_pool_ops_name = + ops_name; + } + break; + } + case OPT_MATCH_ALLOCATIONS_NUM: + internal_config.match_allocations = 1; + break; + + default: + if (opt < OPT_LONG_MIN_NUM && isprint(opt)) { + RTE_LOG(ERR, EAL, "Option %c is not supported " + "on Linux\n", opt); + } else if (opt >= OPT_LONG_MIN_NUM && + opt < OPT_LONG_MAX_NUM) { + RTE_LOG(ERR, EAL, "Option %s is not supported " + "on Linux\n", + eal_long_options[option_index].name); + } else { + RTE_LOG(ERR, EAL, "Option %d is 
not supported " + "on Linux\n", opt); + } + eal_usage(prgname); + ret = -1; + goto out; + } + } + + /* create runtime data directory */ + if (internal_config.no_shconf == 0 && + eal_create_runtime_dir() < 0) { + RTE_LOG(ERR, EAL, "Cannot create runtime directory\n"); + ret = -1; + goto out; + } + + if (eal_adjust_config(&internal_config) != 0) { + ret = -1; + goto out; + } + + /* sanity checks */ + if (eal_check_common_options(&internal_config) != 0) { + eal_usage(prgname); + ret = -1; + goto out; + } + + if (optind >= 0) + argv[optind-1] = prgname; + ret = optind-1; + +out: + /* restore getopt lib */ + optind = old_optind; + optopt = old_optopt; + optarg = old_optarg; + + return ret; +} + +static int +check_socket(const struct rte_memseg_list *msl, void *arg) +{ + int *socket_id = arg; + + if (msl->external) + return 0; + + return *socket_id == msl->socket_id; +} + +static void +eal_check_mem_on_local_socket(void) +{ + int socket_id; + + socket_id = rte_lcore_to_socket_id(rte_config.master_lcore); + + if (rte_memseg_list_walk(check_socket, &socket_id) == 0) + RTE_LOG(WARNING, EAL, "WARNING: Master core has no memory on local socket!\n"); +} + +static int +sync_func(__attribute__((unused)) void *arg) +{ + return 0; +} + +/* + * Request iopl privilege for all RPL, returns 0 on success + * iopl() call is mostly for the i386 architecture. For other architectures, + * return -1 to indicate IO privilege can't be changed in this way. + */ +int +rte_eal_iopl_init(void) +{ +#if defined(RTE_ARCH_X86) + if (iopl(3) != 0) + return -1; +#endif + return 0; +} + +#ifdef VFIO_PRESENT +static int rte_eal_vfio_setup(void) +{ + if (rte_vfio_enable("vfio")) + return -1; + + return 0; +} +#endif + +static void rte_eal_init_alert(const char *msg) +{ + fprintf(stderr, "EAL: FATAL: %s\n", msg); + RTE_LOG(ERR, EAL, "%s\n", msg); +} + +/* + * On Linux 3.6+, even if VFIO is not loaded, whenever IOMMU is enabled in the + * BIOS and in the kernel, /sys/kernel/iommu_groups path will contain kernel + * IOMMU groups. If IOMMU is not enabled, that path would be empty. + * Therefore, checking if the path is empty will tell us if IOMMU is enabled. + */ +static bool +is_iommu_enabled(void) +{ + DIR *dir = opendir(KERNEL_IOMMU_GROUPS_PATH); + struct dirent *d; + int n = 0; + + /* if directory doesn't exist, assume IOMMU is not enabled */ + if (dir == NULL) + return false; + + while ((d = readdir(dir)) != NULL) { + /* skip dot and dot-dot */ + if (++n > 2) + break; + } + closedir(dir); + + return n > 2; +} + +/* Launch threads, called at application init(). */ +int +rte_eal_init(int argc, char **argv) +{ + int i, fctret, ret; + pthread_t thread_id; + static rte_atomic32_t run_once = RTE_ATOMIC32_INIT(0); + const char *p; + static char logid[PATH_MAX]; + char cpuset[RTE_CPU_AFFINITY_STR_LEN]; + char thread_name[RTE_MAX_THREAD_NAME_LEN]; + bool phys_addrs; + + /* checks if the machine is adequate */ + if (!rte_cpu_is_supported()) { + rte_eal_init_alert("unsupported cpu type."); + rte_errno = ENOTSUP; + return -1; + } + + if (!rte_atomic32_test_and_set(&run_once)) { + rte_eal_init_alert("already called initialization."); + rte_errno = EALREADY; + return -1; + } + + p = strrchr(argv[0], '/'); + strlcpy(logid, p ? 
p + 1 : argv[0], sizeof(logid)); + thread_id = pthread_self(); + + eal_reset_internal_config(&internal_config); + + /* set log level as early as possible */ + eal_log_level_parse(argc, argv); + + if (rte_eal_cpu_init() < 0) { + rte_eal_init_alert("Cannot detect lcores."); + rte_errno = ENOTSUP; + return -1; + } + + fctret = eal_parse_args(argc, argv); + if (fctret < 0) { + rte_eal_init_alert("Invalid 'command line' arguments."); + rte_errno = EINVAL; + rte_atomic32_clear(&run_once); + return -1; + } + + if (eal_plugins_init() < 0) { + rte_eal_init_alert("Cannot init plugins"); + rte_errno = EINVAL; + rte_atomic32_clear(&run_once); + return -1; + } + + if (eal_option_device_parse()) { + rte_errno = ENODEV; + rte_atomic32_clear(&run_once); + return -1; + } + + if (rte_config_init() < 0) { + rte_eal_init_alert("Cannot init config"); + return -1; + } + + if (rte_eal_intr_init() < 0) { + rte_eal_init_alert("Cannot init interrupt-handling thread"); + return -1; + } + + if (rte_eal_alarm_init() < 0) { + rte_eal_init_alert("Cannot init alarm"); + /* rte_eal_alarm_init sets rte_errno on failure. */ + return -1; + } + + /* Put mp channel init before bus scan so that we can init the vdev + * bus through mp channel in the secondary process before the bus scan. + */ + if (rte_mp_channel_init() < 0 && rte_errno != ENOTSUP) { + rte_eal_init_alert("failed to init mp channel"); + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + rte_errno = EFAULT; + return -1; + } + } + + /* register multi-process action callbacks for hotplug */ + if (eal_mp_dev_hotplug_init() < 0) { + rte_eal_init_alert("failed to register mp callback for hotplug"); + return -1; + } + + if (rte_bus_scan()) { + rte_eal_init_alert("Cannot scan the buses for devices"); + rte_errno = ENODEV; + rte_atomic32_clear(&run_once); + return -1; + } + + phys_addrs = rte_eal_using_phys_addrs() != 0; + + /* if no EAL option "--iova-mode=", use bus IOVA scheme */ + if (internal_config.iova_mode == RTE_IOVA_DC) { + /* autodetect the IOVA mapping mode */ + enum rte_iova_mode iova_mode = rte_bus_get_iommu_class(); + + if (iova_mode == RTE_IOVA_DC) { + RTE_LOG(DEBUG, EAL, "Buses did not request a specific IOVA mode.\n"); + + if (!phys_addrs) { + /* if we have no access to physical addresses, + * pick IOVA as VA mode. + */ + iova_mode = RTE_IOVA_VA; + RTE_LOG(DEBUG, EAL, "Physical addresses are unavailable, selecting IOVA as VA mode.\n"); +#if defined(RTE_LIBRTE_KNI) && LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) + } else if (rte_eal_check_module("rte_kni") == 1) { + iova_mode = RTE_IOVA_PA; + RTE_LOG(DEBUG, EAL, "KNI is loaded, selecting IOVA as PA mode for better KNI perfomance.\n"); +#endif + } else if (is_iommu_enabled()) { + /* we have an IOMMU, pick IOVA as VA mode */ + iova_mode = RTE_IOVA_VA; + RTE_LOG(DEBUG, EAL, "IOMMU is available, selecting IOVA as VA mode.\n"); + } else { + /* physical addresses available, and no IOMMU + * found, so pick IOVA as PA. 
+ */ + iova_mode = RTE_IOVA_PA; + RTE_LOG(DEBUG, EAL, "IOMMU is not available, selecting IOVA as PA mode.\n"); + } + } +#if defined(RTE_LIBRTE_KNI) && LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) + /* Workaround for KNI which requires physical address to work + * in kernels < 4.10 + */ + if (iova_mode == RTE_IOVA_VA && + rte_eal_check_module("rte_kni") == 1) { + if (phys_addrs) { + iova_mode = RTE_IOVA_PA; + RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because KNI module is loaded\n"); + } else { + RTE_LOG(DEBUG, EAL, "KNI can not work since physical addresses are unavailable\n"); + } + } +#endif + rte_eal_get_configuration()->iova_mode = iova_mode; + } else { + rte_eal_get_configuration()->iova_mode = + internal_config.iova_mode; + } + + if (rte_eal_iova_mode() == RTE_IOVA_PA && !phys_addrs) { + rte_eal_init_alert("Cannot use IOVA as 'PA' since physical addresses are not available"); + rte_errno = EINVAL; + return -1; + } + + RTE_LOG(INFO, EAL, "Selected IOVA mode '%s'\n", + rte_eal_iova_mode() == RTE_IOVA_PA ? "PA" : "VA"); + + if (internal_config.no_hugetlbfs == 0) { + /* rte_config isn't initialized yet */ + ret = internal_config.process_type == RTE_PROC_PRIMARY ? + eal_hugepage_info_init() : + eal_hugepage_info_read(); + if (ret < 0) { + rte_eal_init_alert("Cannot get hugepage information."); + rte_errno = EACCES; + rte_atomic32_clear(&run_once); + return -1; + } + } + + if (internal_config.memory == 0 && internal_config.force_sockets == 0) { + if (internal_config.no_hugetlbfs) + internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE; + } + + if (internal_config.vmware_tsc_map == 1) { +#ifdef RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT + rte_cycles_vmware_tsc_map = 1; + RTE_LOG (DEBUG, EAL, "Using VMWARE TSC MAP, " + "you must have monitor_control.pseudo_perfctr = TRUE\n"); +#else + RTE_LOG (WARNING, EAL, "Ignoring --vmware-tsc-map because " + "RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT is not set\n"); +#endif + } + + if (rte_eal_log_init(logid, internal_config.syslog_facility) < 0) { + rte_eal_init_alert("Cannot init logging."); + rte_errno = ENOMEM; + rte_atomic32_clear(&run_once); + return -1; + } + +#ifdef VFIO_PRESENT + if (rte_eal_vfio_setup() < 0) { + rte_eal_init_alert("Cannot init VFIO"); + rte_errno = EAGAIN; + rte_atomic32_clear(&run_once); + return -1; + } +#endif + /* in secondary processes, memory init may allocate additional fbarrays + * not present in primary processes, so to avoid any potential issues, + * initialize memzones first. + */ + if (rte_eal_memzone_init() < 0) { + rte_eal_init_alert("Cannot init memzone"); + rte_errno = ENODEV; + return -1; + } + + if (rte_eal_memory_init() < 0) { + rte_eal_init_alert("Cannot init memory"); + rte_errno = ENOMEM; + return -1; + } + + /* the directories are locked during eal_hugepage_info_init */ + eal_hugedirs_unlock(); + + if (rte_eal_malloc_heap_init() < 0) { + rte_eal_init_alert("Cannot init malloc heap"); + rte_errno = ENODEV; + return -1; + } + + if (rte_eal_tailqs_init() < 0) { + rte_eal_init_alert("Cannot init tail queues for objects"); + rte_errno = EFAULT; + return -1; + } + + if (rte_eal_timer_init() < 0) { + rte_eal_init_alert("Cannot init HPET or TSC timers"); + rte_errno = ENOTSUP; + return -1; + } + + eal_check_mem_on_local_socket(); + + eal_thread_init_master(rte_config.master_lcore); + + ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset)); + + RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%zx;cpuset=[%s%s])\n", + rte_config.master_lcore, (uintptr_t)thread_id, cpuset, + ret == 0 ? 
"" : "..."); + + RTE_LCORE_FOREACH_SLAVE(i) { + + /* + * create communication pipes between master thread + * and children + */ + if (pipe(lcore_config[i].pipe_master2slave) < 0) + rte_panic("Cannot create pipe\n"); + if (pipe(lcore_config[i].pipe_slave2master) < 0) + rte_panic("Cannot create pipe\n"); + + lcore_config[i].state = WAIT; + + /* create a thread for each lcore */ + ret = pthread_create(&lcore_config[i].thread_id, NULL, + eal_thread_loop, NULL); + if (ret != 0) + rte_panic("Cannot create thread\n"); + + /* Set thread_name for aid in debugging. */ + snprintf(thread_name, sizeof(thread_name), + "lcore-slave-%d", i); + ret = rte_thread_setname(lcore_config[i].thread_id, + thread_name); + if (ret != 0) + RTE_LOG(DEBUG, EAL, + "Cannot set name for lcore thread\n"); + } + + /* + * Launch a dummy function on all slave lcores, so that master lcore + * knows they are all ready when this function returns. + */ + rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER); + rte_eal_mp_wait_lcore(); + + /* initialize services so vdevs register service during bus_probe. */ + ret = rte_service_init(); + if (ret) { + rte_eal_init_alert("rte_service_init() failed"); + rte_errno = ENOEXEC; + return -1; + } + + /* Probe all the buses and devices/drivers on them */ + if (rte_bus_probe()) { + rte_eal_init_alert("Cannot probe devices"); + rte_errno = ENOTSUP; + return -1; + } + +#ifdef VFIO_PRESENT + /* Register mp action after probe() so that we got enough info */ + if (rte_vfio_is_enabled("vfio") && vfio_mp_sync_setup() < 0) + return -1; +#endif + + /* initialize default service/lcore mappings and start running. Ignore + * -ENOTSUP, as it indicates no service coremask passed to EAL. + */ + ret = rte_service_start_with_defaults(); + if (ret < 0 && ret != -ENOTSUP) { + rte_errno = ENOEXEC; + return -1; + } + + /* + * Clean up unused files in runtime directory. We do this at the end of + * init and not at the beginning because we want to clean stuff up + * whether we are primary or secondary process, but we cannot remove + * primary process' files because secondary should be able to run even + * if primary process is dead. + * + * In no_shconf mode, no runtime directory is created in the first + * place, so no cleanup needed. + */ + if (!internal_config.no_shconf && eal_clean_runtime_dir() < 0) { + rte_eal_init_alert("Cannot clear runtime directory\n"); + return -1; + } + + eal_mcfg_complete(); + + /* Call each registered callback, if enabled */ + rte_option_init(); + + return fctret; +} + +static int +mark_freeable(const struct rte_memseg_list *msl, const struct rte_memseg *ms, + void *arg __rte_unused) +{ + /* ms is const, so find this memseg */ + struct rte_memseg *found; + + if (msl->external) + return 0; + + found = rte_mem_virt2memseg(ms->addr, msl); + + found->flags &= ~RTE_MEMSEG_FLAG_DO_NOT_FREE; + + return 0; +} + +int +rte_eal_cleanup(void) +{ + /* if we're in a primary process, we need to mark hugepages as freeable + * so that finalization can release them back to the system. + */ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) + rte_memseg_walk(mark_freeable, NULL); + rte_service_finalize(); + rte_mp_channel_cleanup(); + eal_cleanup_config(&internal_config); + return 0; +} + +enum rte_proc_type_t +rte_eal_process_type(void) +{ + return rte_config.process_type; +} + +int rte_eal_has_hugepages(void) +{ + return ! 
internal_config.no_hugetlbfs; +} + +int rte_eal_has_pci(void) +{ + return !internal_config.no_pci; +} + +int rte_eal_create_uio_dev(void) +{ + return internal_config.create_uio_dev; +} + +enum rte_intr_mode +rte_eal_vfio_intr_mode(void) +{ + return internal_config.vfio_intr_mode; +} + +int +rte_eal_check_module(const char *module_name) +{ + char sysfs_mod_name[PATH_MAX]; + struct stat st; + int n; + + if (NULL == module_name) + return -1; + + /* Check if there is sysfs mounted */ + if (stat("/sys/module", &st) != 0) { + RTE_LOG(DEBUG, EAL, "sysfs is not mounted! error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + + /* A module might be built-in, therefore try sysfs */ + n = snprintf(sysfs_mod_name, PATH_MAX, "/sys/module/%s", module_name); + if (n < 0 || n > PATH_MAX) { + RTE_LOG(DEBUG, EAL, "Could not format module path\n"); + return -1; + } + + if (stat(sysfs_mod_name, &st) != 0) { + RTE_LOG(DEBUG, EAL, "Module %s not found! error %i (%s)\n", + sysfs_mod_name, errno, strerror(errno)); + return 0; + } + + /* Module has been found */ + return 1; +} diff --git a/lib/librte_eal/linux/eal/Makefile b/lib/librte_eal/linux/eal/Makefile deleted file mode 100644 index 692fec2695..0000000000 --- a/lib/librte_eal/linux/eal/Makefile +++ /dev/null @@ -1,101 +0,0 @@ -# SPDX-License-Identifier: BSD-3-Clause -# Copyright(c) 2010-2019 Intel Corporation - -include $(RTE_SDK)/mk/rte.vars.mk - -LIB = librte_eal.a - -ARCH_DIR ?= $(RTE_ARCH) - -EXPORT_MAP := ../../rte_eal_version.map -VPATH += $(RTE_SDK)/lib/librte_eal/$(ARCH_DIR) - -VPATH += $(RTE_SDK)/lib/librte_eal/common - -CFLAGS += -DALLOW_EXPERIMENTAL_API -CFLAGS += -I$(SRCDIR)/include -CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common -CFLAGS += -I$(RTE_SDK)/lib/librte_eal/include -CFLAGS += $(WERROR_FLAGS) -O3 - -LDLIBS += -ldl -LDLIBS += -lpthread -LDLIBS += -lgcc_s -LDLIBS += -lrt -LDLIBS += -lrte_kvargs -ifeq ($(CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES),y) -LDLIBS += -lnuma -endif - -# specific to linux exec-env -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) := eal.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_cpuflags.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_hugepage_info.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_memory.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_thread.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_log.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_vfio.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_vfio_mp_sync.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_memalloc.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_debug.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_lcore.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_timer.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_interrupts.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_alarm.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_dev.c - -# from common dir -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_lcore.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_timer.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_memzone.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_log.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_launch.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_mcfg.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_memalloc.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_memory.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_tailqs.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_errno.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_cpuflags.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_hypervisor.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_string_fns.c 
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_hexdump.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_devargs.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_class.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_bus.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_dev.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_options.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_thread.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_proc.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_fbarray.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_uuid.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_malloc.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += hotplug_mp.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += malloc_elem.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += malloc_heap.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += malloc_mp.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_keepalive.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_option.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_service.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_random.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_reciprocal.c - -# from arch dir -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_cpuflags.c -SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_hypervisor.c -SRCS-$(CONFIG_RTE_ARCH_X86) += rte_spinlock.c -SRCS-y += rte_cycles.c - -CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST) - -# workaround for a gcc bug with noreturn attribute -# http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603 -ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y) -CFLAGS_eal_thread.o += -Wno-return-type -endif - -INC := rte_kni_common.h -INC += rte_os.h - -SYMLINK-$(CONFIG_RTE_EXEC_ENV_LINUX)-include := $(addprefix include/,$(INC)) - -include $(RTE_SDK)/mk/rte.lib.mk diff --git a/lib/librte_eal/linux/eal/eal.c b/lib/librte_eal/linux/eal/eal.c deleted file mode 100644 index 9530ee55f8..0000000000 --- a/lib/librte_eal/linux/eal/eal.c +++ /dev/null @@ -1,1393 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2018 Intel Corporation. - * Copyright(c) 2012-2014 6WIND S.A. 
- */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if defined(RTE_ARCH_X86) -#include -#endif -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "eal_private.h" -#include "eal_thread.h" -#include "eal_internal_cfg.h" -#include "eal_filesystem.h" -#include "eal_hugepages.h" -#include "eal_memcfg.h" -#include "eal_options.h" -#include "eal_vfio.h" -#include "hotplug_mp.h" - -#define MEMSIZE_IF_NO_HUGE_PAGE (64ULL * 1024ULL * 1024ULL) - -#define SOCKET_MEM_STRLEN (RTE_MAX_NUMA_NODES * 10) - -#define KERNEL_IOMMU_GROUPS_PATH "/sys/kernel/iommu_groups" - -/* Allow the application to print its usage message too if set */ -static rte_usage_hook_t rte_application_usage_hook = NULL; - -/* early configuration structure, when memory config is not mmapped */ -static struct rte_mem_config early_mem_config; - -/* define fd variable here, because file needs to be kept open for the - * duration of the program, as we hold a write lock on it in the primary proc */ -static int mem_cfg_fd = -1; - -static struct flock wr_lock = { - .l_type = F_WRLCK, - .l_whence = SEEK_SET, - .l_start = offsetof(struct rte_mem_config, memsegs), - .l_len = sizeof(early_mem_config.memsegs), -}; - -/* Address of global and public configuration */ -static struct rte_config rte_config = { - .mem_config = &early_mem_config, -}; - -/* internal configuration (per-core) */ -struct lcore_config lcore_config[RTE_MAX_LCORE]; - -/* internal configuration */ -struct internal_config internal_config; - -/* used by rte_rdtsc() */ -int rte_cycles_vmware_tsc_map; - -/* platform-specific runtime dir */ -static char runtime_dir[PATH_MAX]; - -static const char *default_runtime_dir = "/var/run"; - -int -eal_create_runtime_dir(void) -{ - const char *directory = default_runtime_dir; - const char *xdg_runtime_dir = getenv("XDG_RUNTIME_DIR"); - const char *fallback = "/tmp"; - char tmp[PATH_MAX]; - int ret; - - if (getuid() != 0) { - /* try XDG path first, fall back to /tmp */ - if (xdg_runtime_dir != NULL) - directory = xdg_runtime_dir; - else - directory = fallback; - } - /* create DPDK subdirectory under runtime dir */ - ret = snprintf(tmp, sizeof(tmp), "%s/dpdk", directory); - if (ret < 0 || ret == sizeof(tmp)) { - RTE_LOG(ERR, EAL, "Error creating DPDK runtime path name\n"); - return -1; - } - - /* create prefix-specific subdirectory under DPDK runtime dir */ - ret = snprintf(runtime_dir, sizeof(runtime_dir), "%s/%s", - tmp, eal_get_hugefile_prefix()); - if (ret < 0 || ret == sizeof(runtime_dir)) { - RTE_LOG(ERR, EAL, "Error creating prefix-specific runtime path name\n"); - return -1; - } - - /* create the path if it doesn't exist. no "mkdir -p" here, so do it - * step by step. 
- */ - ret = mkdir(tmp, 0700); - if (ret < 0 && errno != EEXIST) { - RTE_LOG(ERR, EAL, "Error creating '%s': %s\n", - tmp, strerror(errno)); - return -1; - } - - ret = mkdir(runtime_dir, 0700); - if (ret < 0 && errno != EEXIST) { - RTE_LOG(ERR, EAL, "Error creating '%s': %s\n", - runtime_dir, strerror(errno)); - return -1; - } - - return 0; -} - -int -eal_clean_runtime_dir(void) -{ - DIR *dir; - struct dirent *dirent; - int dir_fd, fd, lck_result; - static const char * const filters[] = { - "fbarray_*", - "mp_socket_*" - }; - - /* open directory */ - dir = opendir(runtime_dir); - if (!dir) { - RTE_LOG(ERR, EAL, "Unable to open runtime directory %s\n", - runtime_dir); - goto error; - } - dir_fd = dirfd(dir); - - /* lock the directory before doing anything, to avoid races */ - if (flock(dir_fd, LOCK_EX) < 0) { - RTE_LOG(ERR, EAL, "Unable to lock runtime directory %s\n", - runtime_dir); - goto error; - } - - dirent = readdir(dir); - if (!dirent) { - RTE_LOG(ERR, EAL, "Unable to read runtime directory %s\n", - runtime_dir); - goto error; - } - - while (dirent != NULL) { - unsigned int f_idx; - bool skip = true; - - /* skip files that don't match the patterns */ - for (f_idx = 0; f_idx < RTE_DIM(filters); f_idx++) { - const char *filter = filters[f_idx]; - - if (fnmatch(filter, dirent->d_name, 0) == 0) { - skip = false; - break; - } - } - if (skip) { - dirent = readdir(dir); - continue; - } - - /* try and lock the file */ - fd = openat(dir_fd, dirent->d_name, O_RDONLY); - - /* skip to next file */ - if (fd == -1) { - dirent = readdir(dir); - continue; - } - - /* non-blocking lock */ - lck_result = flock(fd, LOCK_EX | LOCK_NB); - - /* if lock succeeds, remove the file */ - if (lck_result != -1) - unlinkat(dir_fd, dirent->d_name, 0); - close(fd); - dirent = readdir(dir); - } - - /* closedir closes dir_fd and drops the lock */ - closedir(dir); - return 0; - -error: - if (dir) - closedir(dir); - - RTE_LOG(ERR, EAL, "Error while clearing runtime dir: %s\n", - strerror(errno)); - - return -1; -} - -const char * -rte_eal_get_runtime_dir(void) -{ - return runtime_dir; -} - -/* Return user provided mbuf pool ops name */ -const char * -rte_eal_mbuf_user_pool_ops(void) -{ - return internal_config.user_mbuf_pool_ops_name; -} - -/* Return a pointer to the configuration structure */ -struct rte_config * -rte_eal_get_configuration(void) -{ - return &rte_config; -} - -enum rte_iova_mode -rte_eal_iova_mode(void) -{ - return rte_eal_get_configuration()->iova_mode; -} - -/* parse a sysfs (or other) file containing one integer value */ -int -eal_parse_sysfs_value(const char *filename, unsigned long *val) -{ - FILE *f; - char buf[BUFSIZ]; - char *end = NULL; - - if ((f = fopen(filename, "r")) == NULL) { - RTE_LOG(ERR, EAL, "%s(): cannot open sysfs value %s\n", - __func__, filename); - return -1; - } - - if (fgets(buf, sizeof(buf), f) == NULL) { - RTE_LOG(ERR, EAL, "%s(): cannot read sysfs value %s\n", - __func__, filename); - fclose(f); - return -1; - } - *val = strtoul(buf, &end, 0); - if ((buf[0] == '\0') || (end == NULL) || (*end != '\n')) { - RTE_LOG(ERR, EAL, "%s(): cannot parse sysfs value %s\n", - __func__, filename); - fclose(f); - return -1; - } - fclose(f); - return 0; -} - - -/* create memory configuration in shared/mmap memory. Take out - * a write lock on the memsegs, so we can auto-detect primary/secondary. - * This means we never close the file while running (auto-close on exit). - * We also don't lock the whole file, so that in future we can use read-locks - * on other parts, e.g. 
memzones, to detect if there are running secondary - * processes. */ -static int -rte_eal_config_create(void) -{ - size_t page_sz = sysconf(_SC_PAGE_SIZE); - size_t cfg_len = sizeof(*rte_config.mem_config); - size_t cfg_len_aligned = RTE_ALIGN(cfg_len, page_sz); - void *rte_mem_cfg_addr, *mapped_mem_cfg_addr; - int retval; - - const char *pathname = eal_runtime_config_path(); - - if (internal_config.no_shconf) - return 0; - - /* map the config before hugepage address so that we don't waste a page */ - if (internal_config.base_virtaddr != 0) - rte_mem_cfg_addr = (void *) - RTE_ALIGN_FLOOR(internal_config.base_virtaddr - - sizeof(struct rte_mem_config), page_sz); - else - rte_mem_cfg_addr = NULL; - - if (mem_cfg_fd < 0){ - mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0600); - if (mem_cfg_fd < 0) { - RTE_LOG(ERR, EAL, "Cannot open '%s' for rte_mem_config\n", - pathname); - return -1; - } - } - - retval = ftruncate(mem_cfg_fd, cfg_len); - if (retval < 0){ - close(mem_cfg_fd); - mem_cfg_fd = -1; - RTE_LOG(ERR, EAL, "Cannot resize '%s' for rte_mem_config\n", - pathname); - return -1; - } - - retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock); - if (retval < 0){ - close(mem_cfg_fd); - mem_cfg_fd = -1; - RTE_LOG(ERR, EAL, "Cannot create lock on '%s'. Is another primary " - "process running?\n", pathname); - return -1; - } - - /* reserve space for config */ - rte_mem_cfg_addr = eal_get_virtual_area(rte_mem_cfg_addr, - &cfg_len_aligned, page_sz, 0, 0); - if (rte_mem_cfg_addr == NULL) { - RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config\n"); - close(mem_cfg_fd); - mem_cfg_fd = -1; - return -1; - } - - /* remap the actual file into the space we've just reserved */ - mapped_mem_cfg_addr = mmap(rte_mem_cfg_addr, - cfg_len_aligned, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, mem_cfg_fd, 0); - if (mapped_mem_cfg_addr == MAP_FAILED) { - munmap(rte_mem_cfg_addr, cfg_len); - close(mem_cfg_fd); - mem_cfg_fd = -1; - RTE_LOG(ERR, EAL, "Cannot remap memory for rte_config\n"); - return -1; - } - - memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config)); - rte_config.mem_config = rte_mem_cfg_addr; - - /* store address of the config in the config itself so that secondary - * processes could later map the config into this exact location */ - rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr; - - rte_config.mem_config->dma_maskbits = 0; - - return 0; -} - -/* attach to an existing shared memory config */ -static int -rte_eal_config_attach(void) -{ - struct rte_mem_config *mem_config; - - const char *pathname = eal_runtime_config_path(); - - if (internal_config.no_shconf) - return 0; - - if (mem_cfg_fd < 0){ - mem_cfg_fd = open(pathname, O_RDWR); - if (mem_cfg_fd < 0) { - RTE_LOG(ERR, EAL, "Cannot open '%s' for rte_mem_config\n", - pathname); - return -1; - } - } - - /* map it as read-only first */ - mem_config = (struct rte_mem_config *) mmap(NULL, sizeof(*mem_config), - PROT_READ, MAP_SHARED, mem_cfg_fd, 0); - if (mem_config == MAP_FAILED) { - close(mem_cfg_fd); - mem_cfg_fd = -1; - RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config! 
error %i (%s)\n", - errno, strerror(errno)); - return -1; - } - - rte_config.mem_config = mem_config; - - return 0; -} - -/* reattach the shared config at exact memory location primary process has it */ -static int -rte_eal_config_reattach(void) -{ - struct rte_mem_config *mem_config; - void *rte_mem_cfg_addr; - - if (internal_config.no_shconf) - return 0; - - /* save the address primary process has mapped shared config to */ - rte_mem_cfg_addr = (void *) (uintptr_t) rte_config.mem_config->mem_cfg_addr; - - /* unmap original config */ - munmap(rte_config.mem_config, sizeof(struct rte_mem_config)); - - /* remap the config at proper address */ - mem_config = (struct rte_mem_config *) mmap(rte_mem_cfg_addr, - sizeof(*mem_config), PROT_READ | PROT_WRITE, MAP_SHARED, - mem_cfg_fd, 0); - - close(mem_cfg_fd); - mem_cfg_fd = -1; - - if (mem_config == MAP_FAILED || mem_config != rte_mem_cfg_addr) { - if (mem_config != MAP_FAILED) { - /* errno is stale, don't use */ - RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config at [%p], got [%p]" - " - please use '--" OPT_BASE_VIRTADDR - "' option\n", rte_mem_cfg_addr, mem_config); - munmap(mem_config, sizeof(struct rte_mem_config)); - return -1; - } - RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config! error %i (%s)\n", - errno, strerror(errno)); - return -1; - } - - rte_config.mem_config = mem_config; - - return 0; -} - -/* Detect if we are a primary or a secondary process */ -enum rte_proc_type_t -eal_proc_type_detect(void) -{ - enum rte_proc_type_t ptype = RTE_PROC_PRIMARY; - const char *pathname = eal_runtime_config_path(); - - /* if there no shared config, there can be no secondary processes */ - if (!internal_config.no_shconf) { - /* if we can open the file but not get a write-lock we are a - * secondary process. NOTE: if we get a file handle back, we - * keep that open and don't close it to prevent a race condition - * between multiple opens. - */ - if (((mem_cfg_fd = open(pathname, O_RDWR)) >= 0) && - (fcntl(mem_cfg_fd, F_SETLK, &wr_lock) < 0)) - ptype = RTE_PROC_SECONDARY; - } - - RTE_LOG(INFO, EAL, "Auto-detected process type: %s\n", - ptype == RTE_PROC_PRIMARY ? 
"PRIMARY" : "SECONDARY"); - - return ptype; -} - -/* Sets up rte_config structure with the pointer to shared memory config.*/ -static int -rte_config_init(void) -{ - rte_config.process_type = internal_config.process_type; - - switch (rte_config.process_type){ - case RTE_PROC_PRIMARY: - if (rte_eal_config_create() < 0) - return -1; - eal_mcfg_update_from_internal(); - break; - case RTE_PROC_SECONDARY: - if (rte_eal_config_attach() < 0) - return -1; - eal_mcfg_wait_complete(); - if (eal_mcfg_check_version() < 0) { - RTE_LOG(ERR, EAL, "Primary and secondary process DPDK version mismatch\n"); - return -1; - } - if (rte_eal_config_reattach() < 0) - return -1; - eal_mcfg_update_internal(); - break; - case RTE_PROC_AUTO: - case RTE_PROC_INVALID: - RTE_LOG(ERR, EAL, "Invalid process type %d\n", - rte_config.process_type); - return -1; - } - - return 0; -} - -/* Unlocks hugepage directories that were locked by eal_hugepage_info_init */ -static void -eal_hugedirs_unlock(void) -{ - int i; - - for (i = 0; i < MAX_HUGEPAGE_SIZES; i++) - { - /* skip uninitialized */ - if (internal_config.hugepage_info[i].lock_descriptor < 0) - continue; - /* unlock hugepage file */ - flock(internal_config.hugepage_info[i].lock_descriptor, LOCK_UN); - close(internal_config.hugepage_info[i].lock_descriptor); - /* reset the field */ - internal_config.hugepage_info[i].lock_descriptor = -1; - } -} - -/* display usage */ -static void -eal_usage(const char *prgname) -{ - printf("\nUsage: %s ", prgname); - eal_common_usage(); - printf("EAL Linux options:\n" - " --"OPT_SOCKET_MEM" Memory to allocate on sockets (comma separated values)\n" - " --"OPT_SOCKET_LIMIT" Limit memory allocation on sockets (comma separated values)\n" - " --"OPT_HUGE_DIR" Directory where hugetlbfs is mounted\n" - " --"OPT_FILE_PREFIX" Prefix for hugepage filenames\n" - " --"OPT_CREATE_UIO_DEV" Create /dev/uioX (usually done by hotplug)\n" - " --"OPT_VFIO_INTR" Interrupt mode for VFIO (legacy|msi|msix)\n" - " --"OPT_LEGACY_MEM" Legacy memory mode (no dynamic allocation, contiguous segments)\n" - " --"OPT_SINGLE_FILE_SEGMENTS" Put all hugepage memory in single files\n" - " --"OPT_MATCH_ALLOCATIONS" Free hugepages exactly as allocated\n" - "\n"); - /* Allow the application to print its usage message too if hook is set */ - if ( rte_application_usage_hook ) { - printf("===== Application Usage =====\n\n"); - rte_application_usage_hook(prgname); - } -} - -/* Set a per-application usage message */ -rte_usage_hook_t -rte_set_application_usage_hook( rte_usage_hook_t usage_func ) -{ - rte_usage_hook_t old_func; - - /* Will be NULL on the first call to denote the last usage routine. 
*/ - old_func = rte_application_usage_hook; - rte_application_usage_hook = usage_func; - - return old_func; -} - -static int -eal_parse_socket_arg(char *strval, volatile uint64_t *socket_arg) -{ - char * arg[RTE_MAX_NUMA_NODES]; - char *end; - int arg_num, i, len; - uint64_t total_mem = 0; - - len = strnlen(strval, SOCKET_MEM_STRLEN); - if (len == SOCKET_MEM_STRLEN) { - RTE_LOG(ERR, EAL, "--socket-mem is too long\n"); - return -1; - } - - /* all other error cases will be caught later */ - if (!isdigit(strval[len-1])) - return -1; - - /* split the optarg into separate socket values */ - arg_num = rte_strsplit(strval, len, - arg, RTE_MAX_NUMA_NODES, ','); - - /* if split failed, or 0 arguments */ - if (arg_num <= 0) - return -1; - - /* parse each defined socket option */ - errno = 0; - for (i = 0; i < arg_num; i++) { - uint64_t val; - end = NULL; - val = strtoull(arg[i], &end, 10); - - /* check for invalid input */ - if ((errno != 0) || - (arg[i][0] == '\0') || (end == NULL) || (*end != '\0')) - return -1; - val <<= 20; - total_mem += val; - socket_arg[i] = val; - } - - return 0; -} - -static int -eal_parse_vfio_intr(const char *mode) -{ - unsigned i; - static struct { - const char *name; - enum rte_intr_mode value; - } map[] = { - { "legacy", RTE_INTR_MODE_LEGACY }, - { "msi", RTE_INTR_MODE_MSI }, - { "msix", RTE_INTR_MODE_MSIX }, - }; - - for (i = 0; i < RTE_DIM(map); i++) { - if (!strcmp(mode, map[i].name)) { - internal_config.vfio_intr_mode = map[i].value; - return 0; - } - } - return -1; -} - -/* Parse the arguments for --log-level only */ -static void -eal_log_level_parse(int argc, char **argv) -{ - int opt; - char **argvopt; - int option_index; - const int old_optind = optind; - const int old_optopt = optopt; - char * const old_optarg = optarg; - - argvopt = argv; - optind = 1; - - while ((opt = getopt_long(argc, argvopt, eal_short_options, - eal_long_options, &option_index)) != EOF) { - - int ret; - - /* getopt is not happy, stop right now */ - if (opt == '?') - break; - - ret = (opt == OPT_LOG_LEVEL_NUM) ? 
- eal_parse_common_option(opt, optarg, &internal_config) : 0; - - /* common parser is not happy */ - if (ret < 0) - break; - } - - /* restore getopt lib */ - optind = old_optind; - optopt = old_optopt; - optarg = old_optarg; -} - -/* Parse the argument given in the command line of the application */ -static int -eal_parse_args(int argc, char **argv) -{ - int opt, ret; - char **argvopt; - int option_index; - char *prgname = argv[0]; - const int old_optind = optind; - const int old_optopt = optopt; - char * const old_optarg = optarg; - - argvopt = argv; - optind = 1; - opterr = 0; - - while ((opt = getopt_long(argc, argvopt, eal_short_options, - eal_long_options, &option_index)) != EOF) { - - /* - * getopt didn't recognise the option, lets parse the - * registered options to see if the flag is valid - */ - if (opt == '?') { - ret = rte_option_parse(argv[optind-1]); - if (ret == 0) - continue; - - eal_usage(prgname); - ret = -1; - goto out; - } - - ret = eal_parse_common_option(opt, optarg, &internal_config); - /* common parser is not happy */ - if (ret < 0) { - eal_usage(prgname); - ret = -1; - goto out; - } - /* common parser handled this option */ - if (ret == 0) - continue; - - switch (opt) { - case 'h': - eal_usage(prgname); - exit(EXIT_SUCCESS); - - case OPT_HUGE_DIR_NUM: - { - char *hdir = strdup(optarg); - if (hdir == NULL) - RTE_LOG(ERR, EAL, "Could not store hugepage directory\n"); - else { - /* free old hugepage dir */ - if (internal_config.hugepage_dir != NULL) - free(internal_config.hugepage_dir); - internal_config.hugepage_dir = hdir; - } - break; - } - case OPT_FILE_PREFIX_NUM: - { - char *prefix = strdup(optarg); - if (prefix == NULL) - RTE_LOG(ERR, EAL, "Could not store file prefix\n"); - else { - /* free old prefix */ - if (internal_config.hugefile_prefix != NULL) - free(internal_config.hugefile_prefix); - internal_config.hugefile_prefix = prefix; - } - break; - } - case OPT_SOCKET_MEM_NUM: - if (eal_parse_socket_arg(optarg, - internal_config.socket_mem) < 0) { - RTE_LOG(ERR, EAL, "invalid parameters for --" - OPT_SOCKET_MEM "\n"); - eal_usage(prgname); - ret = -1; - goto out; - } - internal_config.force_sockets = 1; - break; - - case OPT_SOCKET_LIMIT_NUM: - if (eal_parse_socket_arg(optarg, - internal_config.socket_limit) < 0) { - RTE_LOG(ERR, EAL, "invalid parameters for --" - OPT_SOCKET_LIMIT "\n"); - eal_usage(prgname); - ret = -1; - goto out; - } - internal_config.force_socket_limits = 1; - break; - - case OPT_VFIO_INTR_NUM: - if (eal_parse_vfio_intr(optarg) < 0) { - RTE_LOG(ERR, EAL, "invalid parameters for --" - OPT_VFIO_INTR "\n"); - eal_usage(prgname); - ret = -1; - goto out; - } - break; - - case OPT_CREATE_UIO_DEV_NUM: - internal_config.create_uio_dev = 1; - break; - - case OPT_MBUF_POOL_OPS_NAME_NUM: - { - char *ops_name = strdup(optarg); - if (ops_name == NULL) - RTE_LOG(ERR, EAL, "Could not store mbuf pool ops name\n"); - else { - /* free old ops name */ - if (internal_config.user_mbuf_pool_ops_name != - NULL) - free(internal_config.user_mbuf_pool_ops_name); - - internal_config.user_mbuf_pool_ops_name = - ops_name; - } - break; - } - case OPT_MATCH_ALLOCATIONS_NUM: - internal_config.match_allocations = 1; - break; - - default: - if (opt < OPT_LONG_MIN_NUM && isprint(opt)) { - RTE_LOG(ERR, EAL, "Option %c is not supported " - "on Linux\n", opt); - } else if (opt >= OPT_LONG_MIN_NUM && - opt < OPT_LONG_MAX_NUM) { - RTE_LOG(ERR, EAL, "Option %s is not supported " - "on Linux\n", - eal_long_options[option_index].name); - } else { - RTE_LOG(ERR, EAL, "Option %d is 
not supported " - "on Linux\n", opt); - } - eal_usage(prgname); - ret = -1; - goto out; - } - } - - /* create runtime data directory */ - if (internal_config.no_shconf == 0 && - eal_create_runtime_dir() < 0) { - RTE_LOG(ERR, EAL, "Cannot create runtime directory\n"); - ret = -1; - goto out; - } - - if (eal_adjust_config(&internal_config) != 0) { - ret = -1; - goto out; - } - - /* sanity checks */ - if (eal_check_common_options(&internal_config) != 0) { - eal_usage(prgname); - ret = -1; - goto out; - } - - if (optind >= 0) - argv[optind-1] = prgname; - ret = optind-1; - -out: - /* restore getopt lib */ - optind = old_optind; - optopt = old_optopt; - optarg = old_optarg; - - return ret; -} - -static int -check_socket(const struct rte_memseg_list *msl, void *arg) -{ - int *socket_id = arg; - - if (msl->external) - return 0; - - return *socket_id == msl->socket_id; -} - -static void -eal_check_mem_on_local_socket(void) -{ - int socket_id; - - socket_id = rte_lcore_to_socket_id(rte_config.master_lcore); - - if (rte_memseg_list_walk(check_socket, &socket_id) == 0) - RTE_LOG(WARNING, EAL, "WARNING: Master core has no memory on local socket!\n"); -} - -static int -sync_func(__attribute__((unused)) void *arg) -{ - return 0; -} - -/* - * Request iopl privilege for all RPL, returns 0 on success - * iopl() call is mostly for the i386 architecture. For other architectures, - * return -1 to indicate IO privilege can't be changed in this way. - */ -int -rte_eal_iopl_init(void) -{ -#if defined(RTE_ARCH_X86) - if (iopl(3) != 0) - return -1; -#endif - return 0; -} - -#ifdef VFIO_PRESENT -static int rte_eal_vfio_setup(void) -{ - if (rte_vfio_enable("vfio")) - return -1; - - return 0; -} -#endif - -static void rte_eal_init_alert(const char *msg) -{ - fprintf(stderr, "EAL: FATAL: %s\n", msg); - RTE_LOG(ERR, EAL, "%s\n", msg); -} - -/* - * On Linux 3.6+, even if VFIO is not loaded, whenever IOMMU is enabled in the - * BIOS and in the kernel, /sys/kernel/iommu_groups path will contain kernel - * IOMMU groups. If IOMMU is not enabled, that path would be empty. - * Therefore, checking if the path is empty will tell us if IOMMU is enabled. - */ -static bool -is_iommu_enabled(void) -{ - DIR *dir = opendir(KERNEL_IOMMU_GROUPS_PATH); - struct dirent *d; - int n = 0; - - /* if directory doesn't exist, assume IOMMU is not enabled */ - if (dir == NULL) - return false; - - while ((d = readdir(dir)) != NULL) { - /* skip dot and dot-dot */ - if (++n > 2) - break; - } - closedir(dir); - - return n > 2; -} - -/* Launch threads, called at application init(). */ -int -rte_eal_init(int argc, char **argv) -{ - int i, fctret, ret; - pthread_t thread_id; - static rte_atomic32_t run_once = RTE_ATOMIC32_INIT(0); - const char *p; - static char logid[PATH_MAX]; - char cpuset[RTE_CPU_AFFINITY_STR_LEN]; - char thread_name[RTE_MAX_THREAD_NAME_LEN]; - bool phys_addrs; - - /* checks if the machine is adequate */ - if (!rte_cpu_is_supported()) { - rte_eal_init_alert("unsupported cpu type."); - rte_errno = ENOTSUP; - return -1; - } - - if (!rte_atomic32_test_and_set(&run_once)) { - rte_eal_init_alert("already called initialization."); - rte_errno = EALREADY; - return -1; - } - - p = strrchr(argv[0], '/'); - strlcpy(logid, p ? 
p + 1 : argv[0], sizeof(logid)); - thread_id = pthread_self(); - - eal_reset_internal_config(&internal_config); - - /* set log level as early as possible */ - eal_log_level_parse(argc, argv); - - if (rte_eal_cpu_init() < 0) { - rte_eal_init_alert("Cannot detect lcores."); - rte_errno = ENOTSUP; - return -1; - } - - fctret = eal_parse_args(argc, argv); - if (fctret < 0) { - rte_eal_init_alert("Invalid 'command line' arguments."); - rte_errno = EINVAL; - rte_atomic32_clear(&run_once); - return -1; - } - - if (eal_plugins_init() < 0) { - rte_eal_init_alert("Cannot init plugins"); - rte_errno = EINVAL; - rte_atomic32_clear(&run_once); - return -1; - } - - if (eal_option_device_parse()) { - rte_errno = ENODEV; - rte_atomic32_clear(&run_once); - return -1; - } - - if (rte_config_init() < 0) { - rte_eal_init_alert("Cannot init config"); - return -1; - } - - if (rte_eal_intr_init() < 0) { - rte_eal_init_alert("Cannot init interrupt-handling thread"); - return -1; - } - - if (rte_eal_alarm_init() < 0) { - rte_eal_init_alert("Cannot init alarm"); - /* rte_eal_alarm_init sets rte_errno on failure. */ - return -1; - } - - /* Put mp channel init before bus scan so that we can init the vdev - * bus through mp channel in the secondary process before the bus scan. - */ - if (rte_mp_channel_init() < 0 && rte_errno != ENOTSUP) { - rte_eal_init_alert("failed to init mp channel"); - if (rte_eal_process_type() == RTE_PROC_PRIMARY) { - rte_errno = EFAULT; - return -1; - } - } - - /* register multi-process action callbacks for hotplug */ - if (eal_mp_dev_hotplug_init() < 0) { - rte_eal_init_alert("failed to register mp callback for hotplug"); - return -1; - } - - if (rte_bus_scan()) { - rte_eal_init_alert("Cannot scan the buses for devices"); - rte_errno = ENODEV; - rte_atomic32_clear(&run_once); - return -1; - } - - phys_addrs = rte_eal_using_phys_addrs() != 0; - - /* if no EAL option "--iova-mode=", use bus IOVA scheme */ - if (internal_config.iova_mode == RTE_IOVA_DC) { - /* autodetect the IOVA mapping mode */ - enum rte_iova_mode iova_mode = rte_bus_get_iommu_class(); - - if (iova_mode == RTE_IOVA_DC) { - RTE_LOG(DEBUG, EAL, "Buses did not request a specific IOVA mode.\n"); - - if (!phys_addrs) { - /* if we have no access to physical addresses, - * pick IOVA as VA mode. - */ - iova_mode = RTE_IOVA_VA; - RTE_LOG(DEBUG, EAL, "Physical addresses are unavailable, selecting IOVA as VA mode.\n"); -#if defined(RTE_LIBRTE_KNI) && LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) - } else if (rte_eal_check_module("rte_kni") == 1) { - iova_mode = RTE_IOVA_PA; - RTE_LOG(DEBUG, EAL, "KNI is loaded, selecting IOVA as PA mode for better KNI perfomance.\n"); -#endif - } else if (is_iommu_enabled()) { - /* we have an IOMMU, pick IOVA as VA mode */ - iova_mode = RTE_IOVA_VA; - RTE_LOG(DEBUG, EAL, "IOMMU is available, selecting IOVA as VA mode.\n"); - } else { - /* physical addresses available, and no IOMMU - * found, so pick IOVA as PA. 
- */ - iova_mode = RTE_IOVA_PA; - RTE_LOG(DEBUG, EAL, "IOMMU is not available, selecting IOVA as PA mode.\n"); - } - } -#if defined(RTE_LIBRTE_KNI) && LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0) - /* Workaround for KNI which requires physical address to work - * in kernels < 4.10 - */ - if (iova_mode == RTE_IOVA_VA && - rte_eal_check_module("rte_kni") == 1) { - if (phys_addrs) { - iova_mode = RTE_IOVA_PA; - RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because KNI module is loaded\n"); - } else { - RTE_LOG(DEBUG, EAL, "KNI can not work since physical addresses are unavailable\n"); - } - } -#endif - rte_eal_get_configuration()->iova_mode = iova_mode; - } else { - rte_eal_get_configuration()->iova_mode = - internal_config.iova_mode; - } - - if (rte_eal_iova_mode() == RTE_IOVA_PA && !phys_addrs) { - rte_eal_init_alert("Cannot use IOVA as 'PA' since physical addresses are not available"); - rte_errno = EINVAL; - return -1; - } - - RTE_LOG(INFO, EAL, "Selected IOVA mode '%s'\n", - rte_eal_iova_mode() == RTE_IOVA_PA ? "PA" : "VA"); - - if (internal_config.no_hugetlbfs == 0) { - /* rte_config isn't initialized yet */ - ret = internal_config.process_type == RTE_PROC_PRIMARY ? - eal_hugepage_info_init() : - eal_hugepage_info_read(); - if (ret < 0) { - rte_eal_init_alert("Cannot get hugepage information."); - rte_errno = EACCES; - rte_atomic32_clear(&run_once); - return -1; - } - } - - if (internal_config.memory == 0 && internal_config.force_sockets == 0) { - if (internal_config.no_hugetlbfs) - internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE; - } - - if (internal_config.vmware_tsc_map == 1) { -#ifdef RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT - rte_cycles_vmware_tsc_map = 1; - RTE_LOG (DEBUG, EAL, "Using VMWARE TSC MAP, " - "you must have monitor_control.pseudo_perfctr = TRUE\n"); -#else - RTE_LOG (WARNING, EAL, "Ignoring --vmware-tsc-map because " - "RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT is not set\n"); -#endif - } - - if (rte_eal_log_init(logid, internal_config.syslog_facility) < 0) { - rte_eal_init_alert("Cannot init logging."); - rte_errno = ENOMEM; - rte_atomic32_clear(&run_once); - return -1; - } - -#ifdef VFIO_PRESENT - if (rte_eal_vfio_setup() < 0) { - rte_eal_init_alert("Cannot init VFIO"); - rte_errno = EAGAIN; - rte_atomic32_clear(&run_once); - return -1; - } -#endif - /* in secondary processes, memory init may allocate additional fbarrays - * not present in primary processes, so to avoid any potential issues, - * initialize memzones first. - */ - if (rte_eal_memzone_init() < 0) { - rte_eal_init_alert("Cannot init memzone"); - rte_errno = ENODEV; - return -1; - } - - if (rte_eal_memory_init() < 0) { - rte_eal_init_alert("Cannot init memory"); - rte_errno = ENOMEM; - return -1; - } - - /* the directories are locked during eal_hugepage_info_init */ - eal_hugedirs_unlock(); - - if (rte_eal_malloc_heap_init() < 0) { - rte_eal_init_alert("Cannot init malloc heap"); - rte_errno = ENODEV; - return -1; - } - - if (rte_eal_tailqs_init() < 0) { - rte_eal_init_alert("Cannot init tail queues for objects"); - rte_errno = EFAULT; - return -1; - } - - if (rte_eal_timer_init() < 0) { - rte_eal_init_alert("Cannot init HPET or TSC timers"); - rte_errno = ENOTSUP; - return -1; - } - - eal_check_mem_on_local_socket(); - - eal_thread_init_master(rte_config.master_lcore); - - ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset)); - - RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%zx;cpuset=[%s%s])\n", - rte_config.master_lcore, (uintptr_t)thread_id, cpuset, - ret == 0 ? 
"" : "..."); - - RTE_LCORE_FOREACH_SLAVE(i) { - - /* - * create communication pipes between master thread - * and children - */ - if (pipe(lcore_config[i].pipe_master2slave) < 0) - rte_panic("Cannot create pipe\n"); - if (pipe(lcore_config[i].pipe_slave2master) < 0) - rte_panic("Cannot create pipe\n"); - - lcore_config[i].state = WAIT; - - /* create a thread for each lcore */ - ret = pthread_create(&lcore_config[i].thread_id, NULL, - eal_thread_loop, NULL); - if (ret != 0) - rte_panic("Cannot create thread\n"); - - /* Set thread_name for aid in debugging. */ - snprintf(thread_name, sizeof(thread_name), - "lcore-slave-%d", i); - ret = rte_thread_setname(lcore_config[i].thread_id, - thread_name); - if (ret != 0) - RTE_LOG(DEBUG, EAL, - "Cannot set name for lcore thread\n"); - } - - /* - * Launch a dummy function on all slave lcores, so that master lcore - * knows they are all ready when this function returns. - */ - rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER); - rte_eal_mp_wait_lcore(); - - /* initialize services so vdevs register service during bus_probe. */ - ret = rte_service_init(); - if (ret) { - rte_eal_init_alert("rte_service_init() failed"); - rte_errno = ENOEXEC; - return -1; - } - - /* Probe all the buses and devices/drivers on them */ - if (rte_bus_probe()) { - rte_eal_init_alert("Cannot probe devices"); - rte_errno = ENOTSUP; - return -1; - } - -#ifdef VFIO_PRESENT - /* Register mp action after probe() so that we got enough info */ - if (rte_vfio_is_enabled("vfio") && vfio_mp_sync_setup() < 0) - return -1; -#endif - - /* initialize default service/lcore mappings and start running. Ignore - * -ENOTSUP, as it indicates no service coremask passed to EAL. - */ - ret = rte_service_start_with_defaults(); - if (ret < 0 && ret != -ENOTSUP) { - rte_errno = ENOEXEC; - return -1; - } - - /* - * Clean up unused files in runtime directory. We do this at the end of - * init and not at the beginning because we want to clean stuff up - * whether we are primary or secondary process, but we cannot remove - * primary process' files because secondary should be able to run even - * if primary process is dead. - * - * In no_shconf mode, no runtime directory is created in the first - * place, so no cleanup needed. - */ - if (!internal_config.no_shconf && eal_clean_runtime_dir() < 0) { - rte_eal_init_alert("Cannot clear runtime directory\n"); - return -1; - } - - eal_mcfg_complete(); - - /* Call each registered callback, if enabled */ - rte_option_init(); - - return fctret; -} - -static int -mark_freeable(const struct rte_memseg_list *msl, const struct rte_memseg *ms, - void *arg __rte_unused) -{ - /* ms is const, so find this memseg */ - struct rte_memseg *found; - - if (msl->external) - return 0; - - found = rte_mem_virt2memseg(ms->addr, msl); - - found->flags &= ~RTE_MEMSEG_FLAG_DO_NOT_FREE; - - return 0; -} - -int -rte_eal_cleanup(void) -{ - /* if we're in a primary process, we need to mark hugepages as freeable - * so that finalization can release them back to the system. - */ - if (rte_eal_process_type() == RTE_PROC_PRIMARY) - rte_memseg_walk(mark_freeable, NULL); - rte_service_finalize(); - rte_mp_channel_cleanup(); - eal_cleanup_config(&internal_config); - return 0; -} - -enum rte_proc_type_t -rte_eal_process_type(void) -{ - return rte_config.process_type; -} - -int rte_eal_has_hugepages(void) -{ - return ! 
internal_config.no_hugetlbfs; -} - -int rte_eal_has_pci(void) -{ - return !internal_config.no_pci; -} - -int rte_eal_create_uio_dev(void) -{ - return internal_config.create_uio_dev; -} - -enum rte_intr_mode -rte_eal_vfio_intr_mode(void) -{ - return internal_config.vfio_intr_mode; -} - -int -rte_eal_check_module(const char *module_name) -{ - char sysfs_mod_name[PATH_MAX]; - struct stat st; - int n; - - if (NULL == module_name) - return -1; - - /* Check if there is sysfs mounted */ - if (stat("/sys/module", &st) != 0) { - RTE_LOG(DEBUG, EAL, "sysfs is not mounted! error %i (%s)\n", - errno, strerror(errno)); - return -1; - } - - /* A module might be built-in, therefore try sysfs */ - n = snprintf(sysfs_mod_name, PATH_MAX, "/sys/module/%s", module_name); - if (n < 0 || n > PATH_MAX) { - RTE_LOG(DEBUG, EAL, "Could not format module path\n"); - return -1; - } - - if (stat(sysfs_mod_name, &st) != 0) { - RTE_LOG(DEBUG, EAL, "Module %s not found! error %i (%s)\n", - sysfs_mod_name, errno, strerror(errno)); - return 0; - } - - /* Module has been found */ - return 1; -} diff --git a/lib/librte_eal/linux/eal/eal_alarm.c b/lib/librte_eal/linux/eal/eal_alarm.c deleted file mode 100644 index 0924c9205c..0000000000 --- a/lib/librte_eal/linux/eal/eal_alarm.c +++ /dev/null @@ -1,244 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2014 Intel Corporation - */ -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifndef TFD_NONBLOCK -#include -#define TFD_NONBLOCK O_NONBLOCK -#endif - -#define NS_PER_US 1000 -#define US_PER_MS 1000 -#define MS_PER_S 1000 -#ifndef US_PER_S -#define US_PER_S (US_PER_MS * MS_PER_S) -#endif - -#ifdef CLOCK_MONOTONIC_RAW /* Defined in glibc bits/time.h */ -#define CLOCK_TYPE_ID CLOCK_MONOTONIC_RAW -#else -#define CLOCK_TYPE_ID CLOCK_MONOTONIC -#endif - -struct alarm_entry { - LIST_ENTRY(alarm_entry) next; - struct timeval time; - rte_eal_alarm_callback cb_fn; - void *cb_arg; - volatile uint8_t executing; - volatile pthread_t executing_id; -}; - -static LIST_HEAD(alarm_list, alarm_entry) alarm_list = LIST_HEAD_INITIALIZER(); -static rte_spinlock_t alarm_list_lk = RTE_SPINLOCK_INITIALIZER; - -static struct rte_intr_handle intr_handle = {.fd = -1 }; -static int handler_registered = 0; -static void eal_alarm_callback(void *arg); - -int -rte_eal_alarm_init(void) -{ - intr_handle.type = RTE_INTR_HANDLE_ALARM; - /* create a timerfd file descriptor */ - intr_handle.fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK); - if (intr_handle.fd == -1) - goto error; - - return 0; - -error: - rte_errno = errno; - return -1; -} - -static void -eal_alarm_callback(void *arg __rte_unused) -{ - struct timespec now; - struct alarm_entry *ap; - - rte_spinlock_lock(&alarm_list_lk); - while ((ap = LIST_FIRST(&alarm_list)) !=NULL && - clock_gettime(CLOCK_TYPE_ID, &now) == 0 && - (ap->time.tv_sec < now.tv_sec || (ap->time.tv_sec == now.tv_sec && - (ap->time.tv_usec * NS_PER_US) <= now.tv_nsec))) { - ap->executing = 1; - ap->executing_id = pthread_self(); - rte_spinlock_unlock(&alarm_list_lk); - - ap->cb_fn(ap->cb_arg); - - rte_spinlock_lock(&alarm_list_lk); - - LIST_REMOVE(ap, next); - free(ap); - } - - if (!LIST_EMPTY(&alarm_list)) { - struct itimerspec atime = { .it_interval = { 0, 0 } }; - - ap = LIST_FIRST(&alarm_list); - atime.it_value.tv_sec = ap->time.tv_sec; - atime.it_value.tv_nsec = ap->time.tv_usec * NS_PER_US; - /* perform borrow for 
subtraction if necessary */ - if (now.tv_nsec > (ap->time.tv_usec * NS_PER_US)) - atime.it_value.tv_sec--, atime.it_value.tv_nsec += US_PER_S * NS_PER_US; - - atime.it_value.tv_sec -= now.tv_sec; - atime.it_value.tv_nsec -= now.tv_nsec; - timerfd_settime(intr_handle.fd, 0, &atime, NULL); - } - rte_spinlock_unlock(&alarm_list_lk); -} - -int -rte_eal_alarm_set(uint64_t us, rte_eal_alarm_callback cb_fn, void *cb_arg) -{ - struct timespec now; - int ret = 0; - struct alarm_entry *ap, *new_alarm; - - /* Check parameters, including that us won't cause a uint64_t overflow */ - if (us < 1 || us > (UINT64_MAX - US_PER_S) || cb_fn == NULL) - return -EINVAL; - - new_alarm = calloc(1, sizeof(*new_alarm)); - if (new_alarm == NULL) - return -ENOMEM; - - /* use current time to calculate absolute time of alarm */ - clock_gettime(CLOCK_TYPE_ID, &now); - - new_alarm->cb_fn = cb_fn; - new_alarm->cb_arg = cb_arg; - new_alarm->time.tv_usec = ((now.tv_nsec / NS_PER_US) + us) % US_PER_S; - new_alarm->time.tv_sec = now.tv_sec + (((now.tv_nsec / NS_PER_US) + us) / US_PER_S); - - rte_spinlock_lock(&alarm_list_lk); - if (!handler_registered) { - /* registration can fail, callback can be registered later */ - if (rte_intr_callback_register(&intr_handle, - eal_alarm_callback, NULL) == 0) - handler_registered = 1; - } - - if (LIST_EMPTY(&alarm_list)) - LIST_INSERT_HEAD(&alarm_list, new_alarm, next); - else { - LIST_FOREACH(ap, &alarm_list, next) { - if (ap->time.tv_sec > new_alarm->time.tv_sec || - (ap->time.tv_sec == new_alarm->time.tv_sec && - ap->time.tv_usec > new_alarm->time.tv_usec)){ - LIST_INSERT_BEFORE(ap, new_alarm, next); - break; - } - if (LIST_NEXT(ap, next) == NULL) { - LIST_INSERT_AFTER(ap, new_alarm, next); - break; - } - } - } - - if (LIST_FIRST(&alarm_list) == new_alarm) { - struct itimerspec alarm_time = { - .it_interval = {0, 0}, - .it_value = { - .tv_sec = us / US_PER_S, - .tv_nsec = (us % US_PER_S) * NS_PER_US, - }, - }; - ret |= timerfd_settime(intr_handle.fd, 0, &alarm_time, NULL); - } - rte_spinlock_unlock(&alarm_list_lk); - - return ret; -} - -int -rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn, void *cb_arg) -{ - struct alarm_entry *ap, *ap_prev; - int count = 0; - int err = 0; - int executing; - - if (!cb_fn) { - rte_errno = EINVAL; - return -1; - } - - do { - executing = 0; - rte_spinlock_lock(&alarm_list_lk); - /* remove any matches at the start of the list */ - while ((ap = LIST_FIRST(&alarm_list)) != NULL && - cb_fn == ap->cb_fn && - (cb_arg == (void *)-1 || cb_arg == ap->cb_arg)) { - - if (ap->executing == 0) { - LIST_REMOVE(ap, next); - free(ap); - count++; - } else { - /* If calling from other context, mark that alarm is executing - * so loop can spin till it finish. 
Otherwise we are trying to - * cancel our self - mark it by EINPROGRESS */ - if (pthread_equal(ap->executing_id, pthread_self()) == 0) - executing++; - else - err = EINPROGRESS; - - break; - } - } - ap_prev = ap; - - /* now go through list, removing entries not at start */ - LIST_FOREACH(ap, &alarm_list, next) { - /* this won't be true first time through */ - if (cb_fn == ap->cb_fn && - (cb_arg == (void *)-1 || cb_arg == ap->cb_arg)) { - - if (ap->executing == 0) { - LIST_REMOVE(ap, next); - free(ap); - count++; - ap = ap_prev; - } else if (pthread_equal(ap->executing_id, pthread_self()) == 0) - executing++; - else - err = EINPROGRESS; - } - ap_prev = ap; - } - rte_spinlock_unlock(&alarm_list_lk); - } while (executing != 0); - - if (count == 0 && err == 0) - rte_errno = ENOENT; - else if (err) - rte_errno = err; - - return count; -} diff --git a/lib/librte_eal/linux/eal/eal_cpuflags.c b/lib/librte_eal/linux/eal/eal_cpuflags.c deleted file mode 100644 index d38296e1e5..0000000000 --- a/lib/librte_eal/linux/eal/eal_cpuflags.c +++ /dev/null @@ -1,84 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright 2018 Red Hat, Inc. - */ - -#include -#include -#include -#include -#include -#include - -#if defined(__GLIBC__) && defined(__GLIBC_PREREQ) -#if __GLIBC_PREREQ(2, 16) -#include -#define HAS_AUXV 1 -#endif -#endif - -#include - -#ifndef HAS_AUXV -static unsigned long -getauxval(unsigned long type __rte_unused) -{ - errno = ENOTSUP; - return 0; -} -#endif - -#ifdef RTE_ARCH_64 -typedef Elf64_auxv_t Internal_Elfx_auxv_t; -#else -typedef Elf32_auxv_t Internal_Elfx_auxv_t; -#endif - -/** - * Provides a method for retrieving values from the auxiliary vector and - * possibly running a string comparison. - * - * @return Always returns a result. When the result is 0, check errno - * to see if an error occurred during processing. 
- */ -static unsigned long -_rte_cpu_getauxval(unsigned long type, const char *str) -{ - unsigned long val; - - errno = 0; - val = getauxval(type); - - if (!val && (errno == ENOTSUP || errno == ENOENT)) { - int auxv_fd = open("/proc/self/auxv", O_RDONLY); - Internal_Elfx_auxv_t auxv; - - if (auxv_fd == -1) - return 0; - - errno = ENOENT; - while (read(auxv_fd, &auxv, sizeof(auxv)) == sizeof(auxv)) { - if (auxv.a_type == type) { - errno = 0; - val = auxv.a_un.a_val; - if (str) - val = strcmp((const char *)val, str); - break; - } - } - close(auxv_fd); - } - - return val; -} - -unsigned long -rte_cpu_getauxval(unsigned long type) -{ - return _rte_cpu_getauxval(type, NULL); -} - -int -rte_cpu_strcmp_auxval(unsigned long type, const char *str) -{ - return _rte_cpu_getauxval(type, str); -} diff --git a/lib/librte_eal/linux/eal/eal_debug.c b/lib/librte_eal/linux/eal/eal_debug.c deleted file mode 100644 index 5d92500bf5..0000000000 --- a/lib/librte_eal/linux/eal/eal_debug.c +++ /dev/null @@ -1,92 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2014 Intel Corporation - */ - -#ifdef RTE_BACKTRACE -#include -#endif -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#define BACKTRACE_SIZE 256 - -/* dump the stack of the calling core */ -void rte_dump_stack(void) -{ -#ifdef RTE_BACKTRACE - void *func[BACKTRACE_SIZE]; - char **symb = NULL; - int size; - - size = backtrace(func, BACKTRACE_SIZE); - symb = backtrace_symbols(func, size); - - if (symb == NULL) - return; - - while (size > 0) { - rte_log(RTE_LOG_ERR, RTE_LOGTYPE_EAL, - "%d: [%s]\n", size, symb[size - 1]); - size --; - } - - free(symb); -#endif /* RTE_BACKTRACE */ -} - -/* not implemented in this environment */ -void rte_dump_registers(void) -{ - return; -} - -/* call abort(), it will generate a coredump if enabled */ -void __rte_panic(const char *funcname, const char *format, ...) -{ - va_list ap; - - rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, "PANIC in %s():\n", funcname); - va_start(ap, format); - rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap); - va_end(ap); - rte_dump_stack(); - rte_dump_registers(); - abort(); -} - -/* - * Like rte_panic this terminates the application. However, no traceback is - * provided and no core-dump is generated. - */ -void -rte_exit(int exit_code, const char *format, ...) -{ - va_list ap; - - if (exit_code != 0) - RTE_LOG(CRIT, EAL, "Error - exiting with code: %d\n" - " Cause: ", exit_code); - - va_start(ap, format); - rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap); - va_end(ap); - -#ifndef RTE_EAL_ALWAYS_PANIC_ON_ERROR - if (rte_eal_cleanup() != 0) - RTE_LOG(CRIT, EAL, - "EAL could not release all resources\n"); - exit(exit_code); -#else - rte_dump_stack(); - rte_dump_registers(); - abort(); -#endif -} diff --git a/lib/librte_eal/linux/eal/eal_dev.c b/lib/librte_eal/linux/eal/eal_dev.c deleted file mode 100644 index 83c9cd6607..0000000000 --- a/lib/librte_eal/linux/eal/eal_dev.c +++ /dev/null @@ -1,396 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2018 Intel Corporation - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "eal_private.h" - -static struct rte_intr_handle intr_handle = {.fd = -1 }; -static bool monitor_started; -static bool hotplug_handle; - -#define EAL_UEV_MSG_LEN 4096 -#define EAL_UEV_MSG_ELEM_LEN 128 - -/* - * spinlock for device hot-unplug failure handling. 
If it try to access bus or - * device, such as handle sigbus on bus or handle memory failure for device - * just need to use this lock. It could protect the bus and the device to avoid - * race condition. - */ -static rte_spinlock_t failure_handle_lock = RTE_SPINLOCK_INITIALIZER; - -static struct sigaction sigbus_action_old; - -static int sigbus_need_recover; - -static void dev_uev_handler(__rte_unused void *param); - -/* identify the system layer which reports this event. */ -enum eal_dev_event_subsystem { - EAL_DEV_EVENT_SUBSYSTEM_PCI, /* PCI bus device event */ - EAL_DEV_EVENT_SUBSYSTEM_UIO, /* UIO driver device event */ - EAL_DEV_EVENT_SUBSYSTEM_VFIO, /* VFIO driver device event */ - EAL_DEV_EVENT_SUBSYSTEM_MAX -}; - -static void -sigbus_action_recover(void) -{ - if (sigbus_need_recover) { - sigaction(SIGBUS, &sigbus_action_old, NULL); - sigbus_need_recover = 0; - } -} - -static void sigbus_handler(int signum, siginfo_t *info, - void *ctx __rte_unused) -{ - int ret; - - RTE_LOG(DEBUG, EAL, "Thread catch SIGBUS, fault address:%p\n", - info->si_addr); - - rte_spinlock_lock(&failure_handle_lock); - ret = rte_bus_sigbus_handler(info->si_addr); - rte_spinlock_unlock(&failure_handle_lock); - if (ret == -1) { - rte_exit(EXIT_FAILURE, - "Failed to handle SIGBUS for hot-unplug, " - "(rte_errno: %s)!", strerror(rte_errno)); - } else if (ret == 1) { - if (sigbus_action_old.sa_flags == SA_SIGINFO - && sigbus_action_old.sa_sigaction) { - (*(sigbus_action_old.sa_sigaction))(signum, - info, ctx); - } else if (sigbus_action_old.sa_flags != SA_SIGINFO - && sigbus_action_old.sa_handler) { - (*(sigbus_action_old.sa_handler))(signum); - } else { - rte_exit(EXIT_FAILURE, - "Failed to handle generic SIGBUS!"); - } - } - - RTE_LOG(DEBUG, EAL, "Success to handle SIGBUS for hot-unplug!\n"); -} - -static int cmp_dev_name(const struct rte_device *dev, - const void *_name) -{ - const char *name = _name; - - return strcmp(dev->name, name); -} - -static int -dev_uev_socket_fd_create(void) -{ - struct sockaddr_nl addr; - int ret; - - intr_handle.fd = socket(PF_NETLINK, SOCK_RAW | SOCK_CLOEXEC | - SOCK_NONBLOCK, - NETLINK_KOBJECT_UEVENT); - if (intr_handle.fd < 0) { - RTE_LOG(ERR, EAL, "create uevent fd failed.\n"); - return -1; - } - - memset(&addr, 0, sizeof(addr)); - addr.nl_family = AF_NETLINK; - addr.nl_pid = 0; - addr.nl_groups = 0xffffffff; - - ret = bind(intr_handle.fd, (struct sockaddr *) &addr, sizeof(addr)); - if (ret < 0) { - RTE_LOG(ERR, EAL, "Failed to bind uevent socket.\n"); - goto err; - } - - return 0; -err: - close(intr_handle.fd); - intr_handle.fd = -1; - return ret; -} - -static int -dev_uev_parse(const char *buf, struct rte_dev_event *event, int length) -{ - char action[EAL_UEV_MSG_ELEM_LEN]; - char subsystem[EAL_UEV_MSG_ELEM_LEN]; - char pci_slot_name[EAL_UEV_MSG_ELEM_LEN]; - int i = 0; - - memset(action, 0, EAL_UEV_MSG_ELEM_LEN); - memset(subsystem, 0, EAL_UEV_MSG_ELEM_LEN); - memset(pci_slot_name, 0, EAL_UEV_MSG_ELEM_LEN); - - while (i < length) { - for (; i < length; i++) { - if (*buf) - break; - buf++; - } - /** - * check device uevent from kernel side, no need to check - * uevent from udev. 
- */ - if (!strncmp(buf, "libudev", 7)) { - buf += 7; - i += 7; - return -1; - } - if (!strncmp(buf, "ACTION=", 7)) { - buf += 7; - i += 7; - strlcpy(action, buf, sizeof(action)); - } else if (!strncmp(buf, "SUBSYSTEM=", 10)) { - buf += 10; - i += 10; - strlcpy(subsystem, buf, sizeof(subsystem)); - } else if (!strncmp(buf, "PCI_SLOT_NAME=", 14)) { - buf += 14; - i += 14; - strlcpy(pci_slot_name, buf, sizeof(subsystem)); - event->devname = strdup(pci_slot_name); - } - for (; i < length; i++) { - if (*buf == '\0') - break; - buf++; - } - } - - /* parse the subsystem layer */ - if (!strncmp(subsystem, "uio", 3)) - event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_UIO; - else if (!strncmp(subsystem, "pci", 3)) - event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_PCI; - else if (!strncmp(subsystem, "vfio", 4)) - event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_VFIO; - else - return -1; - - /* parse the action type */ - if (!strncmp(action, "add", 3)) - event->type = RTE_DEV_EVENT_ADD; - else if (!strncmp(action, "remove", 6)) - event->type = RTE_DEV_EVENT_REMOVE; - else - return -1; - return 0; -} - -static void -dev_delayed_unregister(void *param) -{ - rte_intr_callback_unregister(&intr_handle, dev_uev_handler, param); - close(intr_handle.fd); - intr_handle.fd = -1; -} - -static void -dev_uev_handler(__rte_unused void *param) -{ - struct rte_dev_event uevent; - int ret; - char buf[EAL_UEV_MSG_LEN]; - struct rte_bus *bus; - struct rte_device *dev; - const char *busname = ""; - - memset(&uevent, 0, sizeof(struct rte_dev_event)); - memset(buf, 0, EAL_UEV_MSG_LEN); - - ret = recv(intr_handle.fd, buf, EAL_UEV_MSG_LEN, MSG_DONTWAIT); - if (ret < 0 && errno == EAGAIN) - return; - else if (ret <= 0) { - /* connection is closed or broken, can not up again. */ - RTE_LOG(ERR, EAL, "uevent socket connection is broken.\n"); - rte_eal_alarm_set(1, dev_delayed_unregister, NULL); - return; - } - - ret = dev_uev_parse(buf, &uevent, EAL_UEV_MSG_LEN); - if (ret < 0) { - RTE_LOG(DEBUG, EAL, "It is not an valid event " - "that need to be handle.\n"); - return; - } - - RTE_LOG(DEBUG, EAL, "receive uevent(name:%s, type:%d, subsystem:%d)\n", - uevent.devname, uevent.type, uevent.subsystem); - - switch (uevent.subsystem) { - case EAL_DEV_EVENT_SUBSYSTEM_PCI: - case EAL_DEV_EVENT_SUBSYSTEM_UIO: - busname = "pci"; - break; - default: - break; - } - - if (uevent.devname) { - if (uevent.type == RTE_DEV_EVENT_REMOVE && hotplug_handle) { - rte_spinlock_lock(&failure_handle_lock); - bus = rte_bus_find_by_name(busname); - if (bus == NULL) { - RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", - busname); - goto failure_handle_err; - } - - dev = bus->find_device(NULL, cmp_dev_name, - uevent.devname); - if (dev == NULL) { - RTE_LOG(ERR, EAL, "Cannot find device (%s) on " - "bus (%s)\n", uevent.devname, busname); - goto failure_handle_err; - } - - ret = bus->hot_unplug_handler(dev); - if (ret) { - RTE_LOG(ERR, EAL, "Can not handle hot-unplug " - "for device (%s)\n", dev->name); - } - rte_spinlock_unlock(&failure_handle_lock); - } - rte_dev_event_callback_process(uevent.devname, uevent.type); - } - - return; - -failure_handle_err: - rte_spinlock_unlock(&failure_handle_lock); -} - -int -rte_dev_event_monitor_start(void) -{ - int ret; - - if (monitor_started) - return 0; - - ret = dev_uev_socket_fd_create(); - if (ret) { - RTE_LOG(ERR, EAL, "error create device event fd.\n"); - return -1; - } - - intr_handle.type = RTE_INTR_HANDLE_DEV_EVENT; - ret = rte_intr_callback_register(&intr_handle, dev_uev_handler, NULL); - - if (ret) { - RTE_LOG(ERR, EAL, "fail to 
register uevent callback.\n"); - return -1; - } - - monitor_started = true; - - return 0; -} - -int -rte_dev_event_monitor_stop(void) -{ - int ret; - - if (!monitor_started) - return 0; - - ret = rte_intr_callback_unregister(&intr_handle, dev_uev_handler, - (void *)-1); - if (ret < 0) { - RTE_LOG(ERR, EAL, "fail to unregister uevent callback.\n"); - return ret; - } - - close(intr_handle.fd); - intr_handle.fd = -1; - monitor_started = false; - - return 0; -} - -int -dev_sigbus_handler_register(void) -{ - sigset_t mask; - struct sigaction action; - - rte_errno = 0; - - if (sigbus_need_recover) - return 0; - - sigemptyset(&mask); - sigaddset(&mask, SIGBUS); - action.sa_flags = SA_SIGINFO; - action.sa_mask = mask; - action.sa_sigaction = sigbus_handler; - sigbus_need_recover = !sigaction(SIGBUS, &action, &sigbus_action_old); - - return rte_errno; -} - -int -dev_sigbus_handler_unregister(void) -{ - rte_errno = 0; - - sigbus_action_recover(); - - return rte_errno; -} - -int -rte_dev_hotplug_handle_enable(void) -{ - int ret = 0; - - ret = dev_sigbus_handler_register(); - if (ret < 0) - RTE_LOG(ERR, EAL, - "fail to register sigbus handler for devices.\n"); - - hotplug_handle = true; - - return ret; -} - -int -rte_dev_hotplug_handle_disable(void) -{ - int ret = 0; - - ret = dev_sigbus_handler_unregister(); - if (ret < 0) - RTE_LOG(ERR, EAL, - "fail to unregister sigbus handler for devices.\n"); - - hotplug_handle = false; - - return ret; -} diff --git a/lib/librte_eal/linux/eal/eal_hugepage_info.c b/lib/librte_eal/linux/eal/eal_hugepage_info.c deleted file mode 100644 index 91a4fede76..0000000000 --- a/lib/librte_eal/linux/eal/eal_hugepage_info.c +++ /dev/null @@ -1,547 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2014 Intel Corporation - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include /* for hugetlb-related flags */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "rte_string_fns.h" -#include "eal_internal_cfg.h" -#include "eal_hugepages.h" -#include "eal_filesystem.h" - -static const char sys_dir_path[] = "/sys/kernel/mm/hugepages"; -static const char sys_pages_numa_dir_path[] = "/sys/devices/system/node"; - -/* - * Uses mmap to create a shared memory area for storage of data - * Used in this file to store the hugepage file map on disk - */ -static void * -map_shared_memory(const char *filename, const size_t mem_size, int flags) -{ - void *retval; - int fd = open(filename, flags, 0600); - if (fd < 0) - return NULL; - if (ftruncate(fd, mem_size) < 0) { - close(fd); - return NULL; - } - retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, - MAP_SHARED, fd, 0); - close(fd); - return retval; -} - -static void * -open_shared_memory(const char *filename, const size_t mem_size) -{ - return map_shared_memory(filename, mem_size, O_RDWR); -} - -static void * -create_shared_memory(const char *filename, const size_t mem_size) -{ - return map_shared_memory(filename, mem_size, O_RDWR | O_CREAT); -} - -static int get_hp_sysfs_value(const char *subdir, const char *file, unsigned long *val) -{ - char path[PATH_MAX]; - - snprintf(path, sizeof(path), "%s/%s/%s", - sys_dir_path, subdir, file); - return eal_parse_sysfs_value(path, val); -} - -/* this function is only called from eal_hugepage_info_init which itself - * is only called from a primary process */ -static uint32_t -get_num_hugepages(const char *subdir) -{ - 
unsigned long resv_pages, num_pages, over_pages, surplus_pages; - const char *nr_hp_file = "free_hugepages"; - const char *nr_rsvd_file = "resv_hugepages"; - const char *nr_over_file = "nr_overcommit_hugepages"; - const char *nr_splus_file = "surplus_hugepages"; - - /* first, check how many reserved pages kernel reports */ - if (get_hp_sysfs_value(subdir, nr_rsvd_file, &resv_pages) < 0) - return 0; - - if (get_hp_sysfs_value(subdir, nr_hp_file, &num_pages) < 0) - return 0; - - if (get_hp_sysfs_value(subdir, nr_over_file, &over_pages) < 0) - over_pages = 0; - - if (get_hp_sysfs_value(subdir, nr_splus_file, &surplus_pages) < 0) - surplus_pages = 0; - - /* adjust num_pages */ - if (num_pages >= resv_pages) - num_pages -= resv_pages; - else if (resv_pages) - num_pages = 0; - - if (over_pages >= surplus_pages) - over_pages -= surplus_pages; - else - over_pages = 0; - - if (num_pages == 0 && over_pages == 0) - RTE_LOG(WARNING, EAL, "No available hugepages reported in %s\n", - subdir); - - num_pages += over_pages; - if (num_pages < over_pages) /* overflow */ - num_pages = UINT32_MAX; - - /* we want to return a uint32_t and more than this looks suspicious - * anyway ... */ - if (num_pages > UINT32_MAX) - num_pages = UINT32_MAX; - - return num_pages; -} - -static uint32_t -get_num_hugepages_on_node(const char *subdir, unsigned int socket) -{ - char path[PATH_MAX], socketpath[PATH_MAX]; - DIR *socketdir; - unsigned long num_pages = 0; - const char *nr_hp_file = "free_hugepages"; - - snprintf(socketpath, sizeof(socketpath), "%s/node%u/hugepages", - sys_pages_numa_dir_path, socket); - - socketdir = opendir(socketpath); - if (socketdir) { - /* Keep calm and carry on */ - closedir(socketdir); - } else { - /* Can't find socket dir, so ignore it */ - return 0; - } - - snprintf(path, sizeof(path), "%s/%s/%s", - socketpath, subdir, nr_hp_file); - if (eal_parse_sysfs_value(path, &num_pages) < 0) - return 0; - - if (num_pages == 0) - RTE_LOG(WARNING, EAL, "No free hugepages reported in %s\n", - subdir); - - /* - * we want to return a uint32_t and more than this looks suspicious - * anyway ... 
- */ - if (num_pages > UINT32_MAX) - num_pages = UINT32_MAX; - - return num_pages; -} - -static uint64_t -get_default_hp_size(void) -{ - const char proc_meminfo[] = "/proc/meminfo"; - const char str_hugepagesz[] = "Hugepagesize:"; - unsigned hugepagesz_len = sizeof(str_hugepagesz) - 1; - char buffer[256]; - unsigned long long size = 0; - - FILE *fd = fopen(proc_meminfo, "r"); - if (fd == NULL) - rte_panic("Cannot open %s\n", proc_meminfo); - while(fgets(buffer, sizeof(buffer), fd)){ - if (strncmp(buffer, str_hugepagesz, hugepagesz_len) == 0){ - size = rte_str_to_size(&buffer[hugepagesz_len]); - break; - } - } - fclose(fd); - if (size == 0) - rte_panic("Cannot get default hugepage size from %s\n", proc_meminfo); - return size; -} - -static int -get_hugepage_dir(uint64_t hugepage_sz, char *hugedir, int len) -{ - enum proc_mount_fieldnames { - DEVICE = 0, - MOUNTPT, - FSTYPE, - OPTIONS, - _FIELDNAME_MAX - }; - static uint64_t default_size = 0; - const char proc_mounts[] = "/proc/mounts"; - const char hugetlbfs_str[] = "hugetlbfs"; - const size_t htlbfs_str_len = sizeof(hugetlbfs_str) - 1; - const char pagesize_opt[] = "pagesize="; - const size_t pagesize_opt_len = sizeof(pagesize_opt) - 1; - const char split_tok = ' '; - char *splitstr[_FIELDNAME_MAX]; - char buf[BUFSIZ]; - int retval = -1; - - FILE *fd = fopen(proc_mounts, "r"); - if (fd == NULL) - rte_panic("Cannot open %s\n", proc_mounts); - - if (default_size == 0) - default_size = get_default_hp_size(); - - while (fgets(buf, sizeof(buf), fd)){ - if (rte_strsplit(buf, sizeof(buf), splitstr, _FIELDNAME_MAX, - split_tok) != _FIELDNAME_MAX) { - RTE_LOG(ERR, EAL, "Error parsing %s\n", proc_mounts); - break; /* return NULL */ - } - - /* we have a specified --huge-dir option, only examine that dir */ - if (internal_config.hugepage_dir != NULL && - strcmp(splitstr[MOUNTPT], internal_config.hugepage_dir) != 0) - continue; - - if (strncmp(splitstr[FSTYPE], hugetlbfs_str, htlbfs_str_len) == 0){ - const char *pagesz_str = strstr(splitstr[OPTIONS], pagesize_opt); - - /* if no explicit page size, the default page size is compared */ - if (pagesz_str == NULL){ - if (hugepage_sz == default_size){ - strlcpy(hugedir, splitstr[MOUNTPT], len); - retval = 0; - break; - } - } - /* there is an explicit page size, so check it */ - else { - uint64_t pagesz = rte_str_to_size(&pagesz_str[pagesize_opt_len]); - if (pagesz == hugepage_sz) { - strlcpy(hugedir, splitstr[MOUNTPT], len); - retval = 0; - break; - } - } - } /* end if strncmp hugetlbfs */ - } /* end while fgets */ - - fclose(fd); - return retval; -} - -/* - * Clear the hugepage directory of whatever hugepage files - * there are. Checks if the file is locked (i.e. - * if it's in use by another DPDK process). 
- */ -static int -clear_hugedir(const char * hugedir) -{ - DIR *dir; - struct dirent *dirent; - int dir_fd, fd, lck_result; - const char filter[] = "*map_*"; /* matches hugepage files */ - - /* open directory */ - dir = opendir(hugedir); - if (!dir) { - RTE_LOG(ERR, EAL, "Unable to open hugepage directory %s\n", - hugedir); - goto error; - } - dir_fd = dirfd(dir); - - dirent = readdir(dir); - if (!dirent) { - RTE_LOG(ERR, EAL, "Unable to read hugepage directory %s\n", - hugedir); - goto error; - } - - while(dirent != NULL){ - /* skip files that don't match the hugepage pattern */ - if (fnmatch(filter, dirent->d_name, 0) > 0) { - dirent = readdir(dir); - continue; - } - - /* try and lock the file */ - fd = openat(dir_fd, dirent->d_name, O_RDONLY); - - /* skip to next file */ - if (fd == -1) { - dirent = readdir(dir); - continue; - } - - /* non-blocking lock */ - lck_result = flock(fd, LOCK_EX | LOCK_NB); - - /* if lock succeeds, remove the file */ - if (lck_result != -1) - unlinkat(dir_fd, dirent->d_name, 0); - close (fd); - dirent = readdir(dir); - } - - closedir(dir); - return 0; - -error: - if (dir) - closedir(dir); - - RTE_LOG(ERR, EAL, "Error while clearing hugepage dir: %s\n", - strerror(errno)); - - return -1; -} - -static int -compare_hpi(const void *a, const void *b) -{ - const struct hugepage_info *hpi_a = a; - const struct hugepage_info *hpi_b = b; - - return hpi_b->hugepage_sz - hpi_a->hugepage_sz; -} - -static void -calc_num_pages(struct hugepage_info *hpi, struct dirent *dirent) -{ - uint64_t total_pages = 0; - unsigned int i; - - /* - * first, try to put all hugepages into relevant sockets, but - * if first attempts fails, fall back to collecting all pages - * in one socket and sorting them later - */ - total_pages = 0; - /* we also don't want to do this for legacy init */ - if (!internal_config.legacy_mem) - for (i = 0; i < rte_socket_count(); i++) { - int socket = rte_socket_id_by_idx(i); - unsigned int num_pages = - get_num_hugepages_on_node( - dirent->d_name, socket); - hpi->num_pages[socket] = num_pages; - total_pages += num_pages; - } - /* - * we failed to sort memory from the get go, so fall - * back to old way - */ - if (total_pages == 0) { - hpi->num_pages[0] = get_num_hugepages(dirent->d_name); - -#ifndef RTE_ARCH_64 - /* for 32-bit systems, limit number of hugepages to - * 1GB per page size */ - hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0], - RTE_PGSIZE_1G / hpi->hugepage_sz); -#endif - } -} - -static int -hugepage_info_init(void) -{ const char dirent_start_text[] = "hugepages-"; - const size_t dirent_start_len = sizeof(dirent_start_text) - 1; - unsigned int i, num_sizes = 0; - DIR *dir; - struct dirent *dirent; - - dir = opendir(sys_dir_path); - if (dir == NULL) { - RTE_LOG(ERR, EAL, - "Cannot open directory %s to read system hugepage info\n", - sys_dir_path); - return -1; - } - - for (dirent = readdir(dir); dirent != NULL; dirent = readdir(dir)) { - struct hugepage_info *hpi; - - if (strncmp(dirent->d_name, dirent_start_text, - dirent_start_len) != 0) - continue; - - if (num_sizes >= MAX_HUGEPAGE_SIZES) - break; - - hpi = &internal_config.hugepage_info[num_sizes]; - hpi->hugepage_sz = - rte_str_to_size(&dirent->d_name[dirent_start_len]); - - /* first, check if we have a mountpoint */ - if (get_hugepage_dir(hpi->hugepage_sz, - hpi->hugedir, sizeof(hpi->hugedir)) < 0) { - uint32_t num_pages; - - num_pages = get_num_hugepages(dirent->d_name); - if (num_pages > 0) - RTE_LOG(NOTICE, EAL, - "%" PRIu32 " hugepages of size " - "%" PRIu64 " reserved, but no mounted " - 
"hugetlbfs found for that size\n", - num_pages, hpi->hugepage_sz); - /* if we have kernel support for reserving hugepages - * through mmap, and we're in in-memory mode, treat this - * page size as valid. we cannot be in legacy mode at - * this point because we've checked this earlier in the - * init process. - */ -#ifdef MAP_HUGE_SHIFT - if (internal_config.in_memory) { - RTE_LOG(DEBUG, EAL, "In-memory mode enabled, " - "hugepages of size %" PRIu64 " bytes " - "will be allocated anonymously\n", - hpi->hugepage_sz); - calc_num_pages(hpi, dirent); - num_sizes++; - } -#endif - continue; - } - - /* try to obtain a writelock */ - hpi->lock_descriptor = open(hpi->hugedir, O_RDONLY); - - /* if blocking lock failed */ - if (flock(hpi->lock_descriptor, LOCK_EX) == -1) { - RTE_LOG(CRIT, EAL, - "Failed to lock hugepage directory!\n"); - break; - } - /* clear out the hugepages dir from unused pages */ - if (clear_hugedir(hpi->hugedir) == -1) - break; - - calc_num_pages(hpi, dirent); - - num_sizes++; - } - closedir(dir); - - /* something went wrong, and we broke from the for loop above */ - if (dirent != NULL) - return -1; - - internal_config.num_hugepage_sizes = num_sizes; - - /* sort the page directory entries by size, largest to smallest */ - qsort(&internal_config.hugepage_info[0], num_sizes, - sizeof(internal_config.hugepage_info[0]), compare_hpi); - - /* now we have all info, check we have at least one valid size */ - for (i = 0; i < num_sizes; i++) { - /* pages may no longer all be on socket 0, so check all */ - unsigned int j, num_pages = 0; - struct hugepage_info *hpi = &internal_config.hugepage_info[i]; - - for (j = 0; j < RTE_MAX_NUMA_NODES; j++) - num_pages += hpi->num_pages[j]; - if (num_pages > 0) - return 0; - } - - /* no valid hugepage mounts available, return error */ - return -1; -} - -/* - * when we initialize the hugepage info, everything goes - * to socket 0 by default. it will later get sorted by memory - * initialization procedure. 
- */ -int -eal_hugepage_info_init(void) -{ - struct hugepage_info *hpi, *tmp_hpi; - unsigned int i; - - if (hugepage_info_init() < 0) - return -1; - - /* for no shared files mode, we're done */ - if (internal_config.no_shconf) - return 0; - - hpi = &internal_config.hugepage_info[0]; - - tmp_hpi = create_shared_memory(eal_hugepage_info_path(), - sizeof(internal_config.hugepage_info)); - if (tmp_hpi == NULL) { - RTE_LOG(ERR, EAL, "Failed to create shared memory!\n"); - return -1; - } - - memcpy(tmp_hpi, hpi, sizeof(internal_config.hugepage_info)); - - /* we've copied file descriptors along with everything else, but they - * will be invalid in secondary process, so overwrite them - */ - for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) { - struct hugepage_info *tmp = &tmp_hpi[i]; - tmp->lock_descriptor = -1; - } - - if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) { - RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n"); - return -1; - } - return 0; -} - -int eal_hugepage_info_read(void) -{ - struct hugepage_info *hpi = &internal_config.hugepage_info[0]; - struct hugepage_info *tmp_hpi; - - tmp_hpi = open_shared_memory(eal_hugepage_info_path(), - sizeof(internal_config.hugepage_info)); - if (tmp_hpi == NULL) { - RTE_LOG(ERR, EAL, "Failed to open shared memory!\n"); - return -1; - } - - memcpy(hpi, tmp_hpi, sizeof(internal_config.hugepage_info)); - - if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) { - RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n"); - return -1; - } - return 0; -} diff --git a/lib/librte_eal/linux/eal/eal_interrupts.c b/lib/librte_eal/linux/eal/eal_interrupts.c deleted file mode 100644 index cb8e107098..0000000000 --- a/lib/librte_eal/linux/eal/eal_interrupts.c +++ /dev/null @@ -1,1495 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2014 Intel Corporation - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "eal_private.h" -#include "eal_vfio.h" -#include "eal_thread.h" - -#define EAL_INTR_EPOLL_WAIT_FOREVER (-1) -#define NB_OTHER_INTR 1 - -static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */ - -/** - * union for pipe fds. 
- */ -union intr_pipefds{ - struct { - int pipefd[2]; - }; - struct { - int readfd; - int writefd; - }; -}; - -/** - * union buffer for reading on different devices - */ -union rte_intr_read_buffer { - int uio_intr_count; /* for uio device */ -#ifdef VFIO_PRESENT - uint64_t vfio_intr_count; /* for vfio device */ -#endif - uint64_t timerfd_num; /* for timerfd */ - char charbuf[16]; /* for others */ -}; - -TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback); -TAILQ_HEAD(rte_intr_source_list, rte_intr_source); - -struct rte_intr_callback { - TAILQ_ENTRY(rte_intr_callback) next; - rte_intr_callback_fn cb_fn; /**< callback address */ - void *cb_arg; /**< parameter for callback */ - uint8_t pending_delete; /**< delete after callback is called */ - rte_intr_unregister_callback_fn ucb_fn; /**< fn to call before cb is deleted */ -}; - -struct rte_intr_source { - TAILQ_ENTRY(rte_intr_source) next; - struct rte_intr_handle intr_handle; /**< interrupt handle */ - struct rte_intr_cb_list callbacks; /**< user callbacks */ - uint32_t active; -}; - -/* global spinlock for interrupt data operation */ -static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER; - -/* union buffer for pipe read/write */ -static union intr_pipefds intr_pipe; - -/* interrupt sources list */ -static struct rte_intr_source_list intr_sources; - -/* interrupt handling thread */ -static pthread_t intr_thread; - -/* VFIO interrupts */ -#ifdef VFIO_PRESENT - -#define IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + sizeof(int)) -/* irq set buffer length for queue interrupts and LSC interrupt */ -#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \ - sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1)) - -/* enable legacy (INTx) interrupts */ -static int -vfio_enable_intx(const struct rte_intr_handle *intr_handle) { - struct vfio_irq_set *irq_set; - char irq_set_buf[IRQ_SET_BUF_LEN]; - int len, ret; - int *fd_ptr; - - len = sizeof(irq_set_buf); - - /* enable INTx */ - irq_set = (struct vfio_irq_set *) irq_set_buf; - irq_set->argsz = len; - irq_set->count = 1; - irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; - irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; - irq_set->start = 0; - fd_ptr = (int *) &irq_set->data; - *fd_ptr = intr_handle->fd; - - ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); - - if (ret) { - RTE_LOG(ERR, EAL, "Error enabling INTx interrupts for fd %d\n", - intr_handle->fd); - return -1; - } - - /* unmask INTx after enabling */ - memset(irq_set, 0, len); - len = sizeof(struct vfio_irq_set); - irq_set->argsz = len; - irq_set->count = 1; - irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK; - irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; - irq_set->start = 0; - - ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); - - if (ret) { - RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n", - intr_handle->fd); - return -1; - } - return 0; -} - -/* disable legacy (INTx) interrupts */ -static int -vfio_disable_intx(const struct rte_intr_handle *intr_handle) { - struct vfio_irq_set *irq_set; - char irq_set_buf[IRQ_SET_BUF_LEN]; - int len, ret; - - len = sizeof(struct vfio_irq_set); - - /* mask interrupts before disabling */ - irq_set = (struct vfio_irq_set *) irq_set_buf; - irq_set->argsz = len; - irq_set->count = 1; - irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK; - irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; - irq_set->start = 0; - - ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); - - if (ret) { - 
RTE_LOG(ERR, EAL, "Error masking INTx interrupts for fd %d\n", - intr_handle->fd); - return -1; - } - - /* disable INTx*/ - memset(irq_set, 0, len); - irq_set->argsz = len; - irq_set->count = 0; - irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; - irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; - irq_set->start = 0; - - ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); - - if (ret) { - RTE_LOG(ERR, EAL, - "Error disabling INTx interrupts for fd %d\n", intr_handle->fd); - return -1; - } - return 0; -} - -/* unmask/ack legacy (INTx) interrupts */ -static int -vfio_ack_intx(const struct rte_intr_handle *intr_handle) -{ - struct vfio_irq_set irq_set; - - /* unmask INTx */ - memset(&irq_set, 0, sizeof(irq_set)); - irq_set.argsz = sizeof(irq_set); - irq_set.count = 1; - irq_set.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK; - irq_set.index = VFIO_PCI_INTX_IRQ_INDEX; - irq_set.start = 0; - - if (ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, &irq_set)) { - RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n", - intr_handle->fd); - return -1; - } - return 0; -} - -/* enable MSI interrupts */ -static int -vfio_enable_msi(const struct rte_intr_handle *intr_handle) { - int len, ret; - char irq_set_buf[IRQ_SET_BUF_LEN]; - struct vfio_irq_set *irq_set; - int *fd_ptr; - - len = sizeof(irq_set_buf); - - irq_set = (struct vfio_irq_set *) irq_set_buf; - irq_set->argsz = len; - irq_set->count = 1; - irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; - irq_set->index = VFIO_PCI_MSI_IRQ_INDEX; - irq_set->start = 0; - fd_ptr = (int *) &irq_set->data; - *fd_ptr = intr_handle->fd; - - ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); - - if (ret) { - RTE_LOG(ERR, EAL, "Error enabling MSI interrupts for fd %d\n", - intr_handle->fd); - return -1; - } - return 0; -} - -/* disable MSI interrupts */ -static int -vfio_disable_msi(const struct rte_intr_handle *intr_handle) { - struct vfio_irq_set *irq_set; - char irq_set_buf[IRQ_SET_BUF_LEN]; - int len, ret; - - len = sizeof(struct vfio_irq_set); - - irq_set = (struct vfio_irq_set *) irq_set_buf; - irq_set->argsz = len; - irq_set->count = 0; - irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; - irq_set->index = VFIO_PCI_MSI_IRQ_INDEX; - irq_set->start = 0; - - ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); - - if (ret) - RTE_LOG(ERR, EAL, - "Error disabling MSI interrupts for fd %d\n", intr_handle->fd); - - return ret; -} - -/* enable MSI-X interrupts */ -static int -vfio_enable_msix(const struct rte_intr_handle *intr_handle) { - int len, ret; - char irq_set_buf[MSIX_IRQ_SET_BUF_LEN]; - struct vfio_irq_set *irq_set; - int *fd_ptr; - - len = sizeof(irq_set_buf); - - irq_set = (struct vfio_irq_set *) irq_set_buf; - irq_set->argsz = len; - /* 0 < irq_set->count < RTE_MAX_RXTX_INTR_VEC_ID + 1 */ - irq_set->count = intr_handle->max_intr ? - (intr_handle->max_intr > RTE_MAX_RXTX_INTR_VEC_ID + 1 ? 
- RTE_MAX_RXTX_INTR_VEC_ID + 1 : intr_handle->max_intr) : 1; - irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; - irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; - irq_set->start = 0; - fd_ptr = (int *) &irq_set->data; - /* INTR vector offset 0 reserve for non-efds mapping */ - fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = intr_handle->fd; - memcpy(&fd_ptr[RTE_INTR_VEC_RXTX_OFFSET], intr_handle->efds, - sizeof(*intr_handle->efds) * intr_handle->nb_efd); - - ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); - - if (ret) { - RTE_LOG(ERR, EAL, "Error enabling MSI-X interrupts for fd %d\n", - intr_handle->fd); - return -1; - } - - return 0; -} - -/* disable MSI-X interrupts */ -static int -vfio_disable_msix(const struct rte_intr_handle *intr_handle) { - struct vfio_irq_set *irq_set; - char irq_set_buf[MSIX_IRQ_SET_BUF_LEN]; - int len, ret; - - len = sizeof(struct vfio_irq_set); - - irq_set = (struct vfio_irq_set *) irq_set_buf; - irq_set->argsz = len; - irq_set->count = 0; - irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; - irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; - irq_set->start = 0; - - ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); - - if (ret) - RTE_LOG(ERR, EAL, - "Error disabling MSI-X interrupts for fd %d\n", intr_handle->fd); - - return ret; -} - -#ifdef HAVE_VFIO_DEV_REQ_INTERFACE -/* enable req notifier */ -static int -vfio_enable_req(const struct rte_intr_handle *intr_handle) -{ - int len, ret; - char irq_set_buf[IRQ_SET_BUF_LEN]; - struct vfio_irq_set *irq_set; - int *fd_ptr; - - len = sizeof(irq_set_buf); - - irq_set = (struct vfio_irq_set *) irq_set_buf; - irq_set->argsz = len; - irq_set->count = 1; - irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | - VFIO_IRQ_SET_ACTION_TRIGGER; - irq_set->index = VFIO_PCI_REQ_IRQ_INDEX; - irq_set->start = 0; - fd_ptr = (int *) &irq_set->data; - *fd_ptr = intr_handle->fd; - - ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); - - if (ret) { - RTE_LOG(ERR, EAL, "Error enabling req interrupts for fd %d\n", - intr_handle->fd); - return -1; - } - - return 0; -} - -/* disable req notifier */ -static int -vfio_disable_req(const struct rte_intr_handle *intr_handle) -{ - struct vfio_irq_set *irq_set; - char irq_set_buf[IRQ_SET_BUF_LEN]; - int len, ret; - - len = sizeof(struct vfio_irq_set); - - irq_set = (struct vfio_irq_set *) irq_set_buf; - irq_set->argsz = len; - irq_set->count = 0; - irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; - irq_set->index = VFIO_PCI_REQ_IRQ_INDEX; - irq_set->start = 0; - - ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); - - if (ret) - RTE_LOG(ERR, EAL, "Error disabling req interrupts for fd %d\n", - intr_handle->fd); - - return ret; -} -#endif -#endif - -static int -uio_intx_intr_disable(const struct rte_intr_handle *intr_handle) -{ - unsigned char command_high; - - /* use UIO config file descriptor for uio_pci_generic */ - if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) { - RTE_LOG(ERR, EAL, - "Error reading interrupts status for fd %d\n", - intr_handle->uio_cfg_fd); - return -1; - } - /* disable interrupts */ - command_high |= 0x4; - if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) { - RTE_LOG(ERR, EAL, - "Error disabling interrupts for fd %d\n", - intr_handle->uio_cfg_fd); - return -1; - } - - return 0; -} - -static int -uio_intx_intr_enable(const struct rte_intr_handle *intr_handle) -{ - unsigned char command_high; - - /* use UIO config file descriptor for 
uio_pci_generic */ - if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) { - RTE_LOG(ERR, EAL, - "Error reading interrupts status for fd %d\n", - intr_handle->uio_cfg_fd); - return -1; - } - /* enable interrupts */ - command_high &= ~0x4; - if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) { - RTE_LOG(ERR, EAL, - "Error enabling interrupts for fd %d\n", - intr_handle->uio_cfg_fd); - return -1; - } - - return 0; -} - -static int -uio_intr_disable(const struct rte_intr_handle *intr_handle) -{ - const int value = 0; - - if (write(intr_handle->fd, &value, sizeof(value)) < 0) { - RTE_LOG(ERR, EAL, - "Error disabling interrupts for fd %d (%s)\n", - intr_handle->fd, strerror(errno)); - return -1; - } - return 0; -} - -static int -uio_intr_enable(const struct rte_intr_handle *intr_handle) -{ - const int value = 1; - - if (write(intr_handle->fd, &value, sizeof(value)) < 0) { - RTE_LOG(ERR, EAL, - "Error enabling interrupts for fd %d (%s)\n", - intr_handle->fd, strerror(errno)); - return -1; - } - return 0; -} - -int -rte_intr_callback_register(const struct rte_intr_handle *intr_handle, - rte_intr_callback_fn cb, void *cb_arg) -{ - int ret, wake_thread; - struct rte_intr_source *src; - struct rte_intr_callback *callback; - - wake_thread = 0; - - /* first do parameter checking */ - if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) { - RTE_LOG(ERR, EAL, - "Registering with invalid input parameter\n"); - return -EINVAL; - } - - /* allocate a new interrupt callback entity */ - callback = calloc(1, sizeof(*callback)); - if (callback == NULL) { - RTE_LOG(ERR, EAL, "Can not allocate memory\n"); - return -ENOMEM; - } - callback->cb_fn = cb; - callback->cb_arg = cb_arg; - callback->pending_delete = 0; - callback->ucb_fn = NULL; - - rte_spinlock_lock(&intr_lock); - - /* check if there is at least one callback registered for the fd */ - TAILQ_FOREACH(src, &intr_sources, next) { - if (src->intr_handle.fd == intr_handle->fd) { - /* we had no interrupts for this */ - if (TAILQ_EMPTY(&src->callbacks)) - wake_thread = 1; - - TAILQ_INSERT_TAIL(&(src->callbacks), callback, next); - ret = 0; - break; - } - } - - /* no existing callbacks for this - add new source */ - if (src == NULL) { - src = calloc(1, sizeof(*src)); - if (src == NULL) { - RTE_LOG(ERR, EAL, "Can not allocate memory\n"); - free(callback); - ret = -ENOMEM; - } else { - src->intr_handle = *intr_handle; - TAILQ_INIT(&src->callbacks); - TAILQ_INSERT_TAIL(&(src->callbacks), callback, next); - TAILQ_INSERT_TAIL(&intr_sources, src, next); - wake_thread = 1; - ret = 0; - } - } - - rte_spinlock_unlock(&intr_lock); - - /** - * check if need to notify the pipe fd waited by epoll_wait to - * rebuild the wait list. 
- */ - if (wake_thread) - if (write(intr_pipe.writefd, "1", 1) < 0) - return -EPIPE; - - return ret; -} - -int -rte_intr_callback_unregister_pending(const struct rte_intr_handle *intr_handle, - rte_intr_callback_fn cb_fn, void *cb_arg, - rte_intr_unregister_callback_fn ucb_fn) -{ - int ret; - struct rte_intr_source *src; - struct rte_intr_callback *cb, *next; - - /* do parameter checking first */ - if (intr_handle == NULL || intr_handle->fd < 0) { - RTE_LOG(ERR, EAL, - "Unregistering with invalid input parameter\n"); - return -EINVAL; - } - - rte_spinlock_lock(&intr_lock); - - /* check if the insterrupt source for the fd is existent */ - TAILQ_FOREACH(src, &intr_sources, next) - if (src->intr_handle.fd == intr_handle->fd) - break; - - /* No interrupt source registered for the fd */ - if (src == NULL) { - ret = -ENOENT; - - /* only usable if the source is active */ - } else if (src->active == 0) { - ret = -EAGAIN; - - } else { - ret = 0; - - /* walk through the callbacks and mark all that match. */ - for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) { - next = TAILQ_NEXT(cb, next); - if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 || - cb->cb_arg == cb_arg)) { - cb->pending_delete = 1; - cb->ucb_fn = ucb_fn; - ret++; - } - } - } - - rte_spinlock_unlock(&intr_lock); - - return ret; -} - -int -rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle, - rte_intr_callback_fn cb_fn, void *cb_arg) -{ - int ret; - struct rte_intr_source *src; - struct rte_intr_callback *cb, *next; - - /* do parameter checking first */ - if (intr_handle == NULL || intr_handle->fd < 0) { - RTE_LOG(ERR, EAL, - "Unregistering with invalid input parameter\n"); - return -EINVAL; - } - - rte_spinlock_lock(&intr_lock); - - /* check if the insterrupt source for the fd is existent */ - TAILQ_FOREACH(src, &intr_sources, next) - if (src->intr_handle.fd == intr_handle->fd) - break; - - /* No interrupt source registered for the fd */ - if (src == NULL) { - ret = -ENOENT; - - /* interrupt source has some active callbacks right now. */ - } else if (src->active != 0) { - ret = -EAGAIN; - - /* ok to remove. */ - } else { - ret = 0; - - /*walk through the callbacks and remove all that match. */ - for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) { - - next = TAILQ_NEXT(cb, next); - - if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 || - cb->cb_arg == cb_arg)) { - TAILQ_REMOVE(&src->callbacks, cb, next); - free(cb); - ret++; - } - } - - /* all callbacks for that source are removed. 
*/ - if (TAILQ_EMPTY(&src->callbacks)) { - TAILQ_REMOVE(&intr_sources, src, next); - free(src); - } - } - - rte_spinlock_unlock(&intr_lock); - - /* notify the pipe fd waited by epoll_wait to rebuild the wait list */ - if (ret >= 0 && write(intr_pipe.writefd, "1", 1) < 0) { - ret = -EPIPE; - } - - return ret; -} - -int -rte_intr_enable(const struct rte_intr_handle *intr_handle) -{ - if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV) - return 0; - - if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) - return -1; - - switch (intr_handle->type){ - /* write to the uio fd to enable the interrupt */ - case RTE_INTR_HANDLE_UIO: - if (uio_intr_enable(intr_handle)) - return -1; - break; - case RTE_INTR_HANDLE_UIO_INTX: - if (uio_intx_intr_enable(intr_handle)) - return -1; - break; - /* not used at this moment */ - case RTE_INTR_HANDLE_ALARM: - return -1; -#ifdef VFIO_PRESENT - case RTE_INTR_HANDLE_VFIO_MSIX: - if (vfio_enable_msix(intr_handle)) - return -1; - break; - case RTE_INTR_HANDLE_VFIO_MSI: - if (vfio_enable_msi(intr_handle)) - return -1; - break; - case RTE_INTR_HANDLE_VFIO_LEGACY: - if (vfio_enable_intx(intr_handle)) - return -1; - break; -#ifdef HAVE_VFIO_DEV_REQ_INTERFACE - case RTE_INTR_HANDLE_VFIO_REQ: - if (vfio_enable_req(intr_handle)) - return -1; - break; -#endif -#endif - /* not used at this moment */ - case RTE_INTR_HANDLE_DEV_EVENT: - return -1; - /* unknown handle type */ - default: - RTE_LOG(ERR, EAL, - "Unknown handle type of fd %d\n", - intr_handle->fd); - return -1; - } - - return 0; -} - -/** - * PMD generally calls this function at the end of its IRQ callback. - * Internally, it unmasks the interrupt if possible. - * - * For INTx, unmasking is required as the interrupt is auto-masked prior to - * invoking callback. - * - * For MSI/MSI-X, unmasking is typically not needed as the interrupt is not - * auto-masked. In fact, for interrupt handle types VFIO_MSIX and VFIO_MSI, - * this function is no-op. 
- */ -int -rte_intr_ack(const struct rte_intr_handle *intr_handle) -{ - if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV) - return 0; - - if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) - return -1; - - switch (intr_handle->type) { - /* Both acking and enabling are same for UIO */ - case RTE_INTR_HANDLE_UIO: - if (uio_intr_enable(intr_handle)) - return -1; - break; - case RTE_INTR_HANDLE_UIO_INTX: - if (uio_intx_intr_enable(intr_handle)) - return -1; - break; - /* not used at this moment */ - case RTE_INTR_HANDLE_ALARM: - return -1; -#ifdef VFIO_PRESENT - /* VFIO MSI* is implicitly acked unlike INTx, nothing to do */ - case RTE_INTR_HANDLE_VFIO_MSIX: - case RTE_INTR_HANDLE_VFIO_MSI: - return 0; - case RTE_INTR_HANDLE_VFIO_LEGACY: - if (vfio_ack_intx(intr_handle)) - return -1; - break; -#ifdef HAVE_VFIO_DEV_REQ_INTERFACE - case RTE_INTR_HANDLE_VFIO_REQ: - return -1; -#endif -#endif - /* not used at this moment */ - case RTE_INTR_HANDLE_DEV_EVENT: - return -1; - /* unknown handle type */ - default: - RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n", - intr_handle->fd); - return -1; - } - - return 0; -} - -int -rte_intr_disable(const struct rte_intr_handle *intr_handle) -{ - if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV) - return 0; - - if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) - return -1; - - switch (intr_handle->type){ - /* write to the uio fd to disable the interrupt */ - case RTE_INTR_HANDLE_UIO: - if (uio_intr_disable(intr_handle)) - return -1; - break; - case RTE_INTR_HANDLE_UIO_INTX: - if (uio_intx_intr_disable(intr_handle)) - return -1; - break; - /* not used at this moment */ - case RTE_INTR_HANDLE_ALARM: - return -1; -#ifdef VFIO_PRESENT - case RTE_INTR_HANDLE_VFIO_MSIX: - if (vfio_disable_msix(intr_handle)) - return -1; - break; - case RTE_INTR_HANDLE_VFIO_MSI: - if (vfio_disable_msi(intr_handle)) - return -1; - break; - case RTE_INTR_HANDLE_VFIO_LEGACY: - if (vfio_disable_intx(intr_handle)) - return -1; - break; -#ifdef HAVE_VFIO_DEV_REQ_INTERFACE - case RTE_INTR_HANDLE_VFIO_REQ: - if (vfio_disable_req(intr_handle)) - return -1; - break; -#endif -#endif - /* not used at this moment */ - case RTE_INTR_HANDLE_DEV_EVENT: - return -1; - /* unknown handle type */ - default: - RTE_LOG(ERR, EAL, - "Unknown handle type of fd %d\n", - intr_handle->fd); - return -1; - } - - return 0; -} - -static int -eal_intr_process_interrupts(struct epoll_event *events, int nfds) -{ - bool call = false; - int n, bytes_read, rv; - struct rte_intr_source *src; - struct rte_intr_callback *cb, *next; - union rte_intr_read_buffer buf; - struct rte_intr_callback active_cb; - - for (n = 0; n < nfds; n++) { - - /** - * if the pipe fd is ready to read, return out to - * rebuild the wait list. - */ - if (events[n].data.fd == intr_pipe.readfd){ - int r = read(intr_pipe.readfd, buf.charbuf, - sizeof(buf.charbuf)); - RTE_SET_USED(r); - return -1; - } - rte_spinlock_lock(&intr_lock); - TAILQ_FOREACH(src, &intr_sources, next) - if (src->intr_handle.fd == - events[n].data.fd) - break; - if (src == NULL){ - rte_spinlock_unlock(&intr_lock); - continue; - } - - /* mark this interrupt source as active and release the lock. 
*/ - src->active = 1; - rte_spinlock_unlock(&intr_lock); - - /* set the length to be read dor different handle type */ - switch (src->intr_handle.type) { - case RTE_INTR_HANDLE_UIO: - case RTE_INTR_HANDLE_UIO_INTX: - bytes_read = sizeof(buf.uio_intr_count); - break; - case RTE_INTR_HANDLE_ALARM: - bytes_read = sizeof(buf.timerfd_num); - break; -#ifdef VFIO_PRESENT - case RTE_INTR_HANDLE_VFIO_MSIX: - case RTE_INTR_HANDLE_VFIO_MSI: - case RTE_INTR_HANDLE_VFIO_LEGACY: - bytes_read = sizeof(buf.vfio_intr_count); - break; -#ifdef HAVE_VFIO_DEV_REQ_INTERFACE - case RTE_INTR_HANDLE_VFIO_REQ: - bytes_read = 0; - call = true; - break; -#endif -#endif - case RTE_INTR_HANDLE_VDEV: - case RTE_INTR_HANDLE_EXT: - bytes_read = 0; - call = true; - break; - case RTE_INTR_HANDLE_DEV_EVENT: - bytes_read = 0; - call = true; - break; - default: - bytes_read = 1; - break; - } - - if (bytes_read > 0) { - /** - * read out to clear the ready-to-be-read flag - * for epoll_wait. - */ - bytes_read = read(events[n].data.fd, &buf, bytes_read); - if (bytes_read < 0) { - if (errno == EINTR || errno == EWOULDBLOCK) - continue; - - RTE_LOG(ERR, EAL, "Error reading from file " - "descriptor %d: %s\n", - events[n].data.fd, - strerror(errno)); - /* - * The device is unplugged or buggy, remove - * it as an interrupt source and return to - * force the wait list to be rebuilt. - */ - rte_spinlock_lock(&intr_lock); - TAILQ_REMOVE(&intr_sources, src, next); - rte_spinlock_unlock(&intr_lock); - - for (cb = TAILQ_FIRST(&src->callbacks); cb; - cb = next) { - next = TAILQ_NEXT(cb, next); - TAILQ_REMOVE(&src->callbacks, cb, next); - free(cb); - } - free(src); - return -1; - } else if (bytes_read == 0) - RTE_LOG(ERR, EAL, "Read nothing from file " - "descriptor %d\n", events[n].data.fd); - else - call = true; - } - - /* grab a lock, again to call callbacks and update status. */ - rte_spinlock_lock(&intr_lock); - - if (call) { - - /* Finally, call all callbacks. */ - TAILQ_FOREACH(cb, &src->callbacks, next) { - - /* make a copy and unlock. */ - active_cb = *cb; - rte_spinlock_unlock(&intr_lock); - - /* call the actual callback */ - active_cb.cb_fn(active_cb.cb_arg); - - /*get the lock back. */ - rte_spinlock_lock(&intr_lock); - } - } - /* we done with that interrupt source, release it. */ - src->active = 0; - - rv = 0; - - /* check if any callback are supposed to be removed */ - for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) { - next = TAILQ_NEXT(cb, next); - if (cb->pending_delete) { - TAILQ_REMOVE(&src->callbacks, cb, next); - if (cb->ucb_fn) - cb->ucb_fn(&src->intr_handle, cb->cb_arg); - free(cb); - rv++; - } - } - - /* all callbacks for that source are removed. */ - if (TAILQ_EMPTY(&src->callbacks)) { - TAILQ_REMOVE(&intr_sources, src, next); - free(src); - } - - /* notify the pipe fd waited by epoll_wait to rebuild the wait list */ - if (rv >= 0 && write(intr_pipe.writefd, "1", 1) < 0) { - rte_spinlock_unlock(&intr_lock); - return -EPIPE; - } - - rte_spinlock_unlock(&intr_lock); - } - - return 0; -} - -/** - * It handles all the interrupts. - * - * @param pfd - * epoll file descriptor. - * @param totalfds - * The number of file descriptors added in epoll. 
- * - * @return - * void - */ -static void -eal_intr_handle_interrupts(int pfd, unsigned totalfds) -{ - struct epoll_event events[totalfds]; - int nfds = 0; - - for(;;) { - nfds = epoll_wait(pfd, events, totalfds, - EAL_INTR_EPOLL_WAIT_FOREVER); - /* epoll_wait fail */ - if (nfds < 0) { - if (errno == EINTR) - continue; - RTE_LOG(ERR, EAL, - "epoll_wait returns with fail\n"); - return; - } - /* epoll_wait timeout, will never happens here */ - else if (nfds == 0) - continue; - /* epoll_wait has at least one fd ready to read */ - if (eal_intr_process_interrupts(events, nfds) < 0) - return; - } -} - -/** - * It builds/rebuilds up the epoll file descriptor with all the - * file descriptors being waited on. Then handles the interrupts. - * - * @param arg - * pointer. (unused) - * - * @return - * never return; - */ -static __attribute__((noreturn)) void * -eal_intr_thread_main(__rte_unused void *arg) -{ - /* host thread, never break out */ - for (;;) { - /* build up the epoll fd with all descriptors we are to - * wait on then pass it to the handle_interrupts function - */ - static struct epoll_event pipe_event = { - .events = EPOLLIN | EPOLLPRI, - }; - struct rte_intr_source *src; - unsigned numfds = 0; - - /* create epoll fd */ - int pfd = epoll_create(1); - if (pfd < 0) - rte_panic("Cannot create epoll instance\n"); - - pipe_event.data.fd = intr_pipe.readfd; - /** - * add pipe fd into wait list, this pipe is used to - * rebuild the wait list. - */ - if (epoll_ctl(pfd, EPOLL_CTL_ADD, intr_pipe.readfd, - &pipe_event) < 0) { - rte_panic("Error adding fd to %d epoll_ctl, %s\n", - intr_pipe.readfd, strerror(errno)); - } - numfds++; - - rte_spinlock_lock(&intr_lock); - - TAILQ_FOREACH(src, &intr_sources, next) { - struct epoll_event ev; - - if (src->callbacks.tqh_first == NULL) - continue; /* skip those with no callbacks */ - memset(&ev, 0, sizeof(ev)); - ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP; - ev.data.fd = src->intr_handle.fd; - - /** - * add all the uio device file descriptor - * into wait list. - */ - if (epoll_ctl(pfd, EPOLL_CTL_ADD, - src->intr_handle.fd, &ev) < 0){ - rte_panic("Error adding fd %d epoll_ctl, %s\n", - src->intr_handle.fd, strerror(errno)); - } - else - numfds++; - } - rte_spinlock_unlock(&intr_lock); - /* serve the interrupt */ - eal_intr_handle_interrupts(pfd, numfds); - - /** - * when we return, we need to rebuild the - * list of fds to monitor. - */ - close(pfd); - } -} - -int -rte_eal_intr_init(void) -{ - int ret = 0; - - /* init the global interrupt source head */ - TAILQ_INIT(&intr_sources); - - /** - * create a pipe which will be waited by epoll and notified to - * rebuild the wait list of epoll. 
- */ - if (pipe(intr_pipe.pipefd) < 0) { - rte_errno = errno; - return -1; - } - - /* create the host thread to wait/handle the interrupt */ - ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL, - eal_intr_thread_main, NULL); - if (ret != 0) { - rte_errno = -ret; - RTE_LOG(ERR, EAL, - "Failed to create thread for interrupt handling\n"); - } - - return ret; -} - -static void -eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle) -{ - union rte_intr_read_buffer buf; - int bytes_read = 0; - int nbytes; - - switch (intr_handle->type) { - case RTE_INTR_HANDLE_UIO: - case RTE_INTR_HANDLE_UIO_INTX: - bytes_read = sizeof(buf.uio_intr_count); - break; -#ifdef VFIO_PRESENT - case RTE_INTR_HANDLE_VFIO_MSIX: - case RTE_INTR_HANDLE_VFIO_MSI: - case RTE_INTR_HANDLE_VFIO_LEGACY: - bytes_read = sizeof(buf.vfio_intr_count); - break; -#endif - case RTE_INTR_HANDLE_VDEV: - bytes_read = intr_handle->efd_counter_size; - /* For vdev, number of bytes to read is set by driver */ - break; - case RTE_INTR_HANDLE_EXT: - return; - default: - bytes_read = 1; - RTE_LOG(INFO, EAL, "unexpected intr type\n"); - break; - } - - /** - * read out to clear the ready-to-be-read flag - * for epoll_wait. - */ - if (bytes_read == 0) - return; - do { - nbytes = read(fd, &buf, bytes_read); - if (nbytes < 0) { - if (errno == EINTR || errno == EWOULDBLOCK || - errno == EAGAIN) - continue; - RTE_LOG(ERR, EAL, - "Error reading from fd %d: %s\n", - fd, strerror(errno)); - } else if (nbytes == 0) - RTE_LOG(ERR, EAL, "Read nothing from fd %d\n", fd); - return; - } while (1); -} - -static int -eal_epoll_process_event(struct epoll_event *evs, unsigned int n, - struct rte_epoll_event *events) -{ - unsigned int i, count = 0; - struct rte_epoll_event *rev; - - for (i = 0; i < n; i++) { - rev = evs[i].data.ptr; - if (!rev || !rte_atomic32_cmpset(&rev->status, RTE_EPOLL_VALID, - RTE_EPOLL_EXEC)) - continue; - - events[count].status = RTE_EPOLL_VALID; - events[count].fd = rev->fd; - events[count].epfd = rev->epfd; - events[count].epdata.event = rev->epdata.event; - events[count].epdata.data = rev->epdata.data; - if (rev->epdata.cb_fun) - rev->epdata.cb_fun(rev->fd, - rev->epdata.cb_arg); - - rte_compiler_barrier(); - rev->status = RTE_EPOLL_VALID; - count++; - } - return count; -} - -static inline int -eal_init_tls_epfd(void) -{ - int pfd = epoll_create(255); - - if (pfd < 0) { - RTE_LOG(ERR, EAL, - "Cannot create epoll instance\n"); - return -1; - } - return pfd; -} - -int -rte_intr_tls_epfd(void) -{ - if (RTE_PER_LCORE(_epfd) == -1) - RTE_PER_LCORE(_epfd) = eal_init_tls_epfd(); - - return RTE_PER_LCORE(_epfd); -} - -int -rte_epoll_wait(int epfd, struct rte_epoll_event *events, - int maxevents, int timeout) -{ - struct epoll_event evs[maxevents]; - int rc; - - if (!events) { - RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n"); - return -1; - } - - /* using per thread epoll fd */ - if (epfd == RTE_EPOLL_PER_THREAD) - epfd = rte_intr_tls_epfd(); - - while (1) { - rc = epoll_wait(epfd, evs, maxevents, timeout); - if (likely(rc > 0)) { - /* epoll_wait has at least one fd ready to read */ - rc = eal_epoll_process_event(evs, rc, events); - break; - } else if (rc < 0) { - if (errno == EINTR) - continue; - /* epoll_wait fail */ - RTE_LOG(ERR, EAL, "epoll_wait returns with fail %s\n", - strerror(errno)); - rc = -1; - break; - } else { - /* rc == 0, epoll_wait timed out */ - break; - } - } - - return rc; -} - -static inline void -eal_epoll_data_safe_free(struct rte_epoll_event *ev) -{ - while 
(!rte_atomic32_cmpset(&ev->status, RTE_EPOLL_VALID, - RTE_EPOLL_INVALID)) - while (ev->status != RTE_EPOLL_VALID) - rte_pause(); - memset(&ev->epdata, 0, sizeof(ev->epdata)); - ev->fd = -1; - ev->epfd = -1; -} - -int -rte_epoll_ctl(int epfd, int op, int fd, - struct rte_epoll_event *event) -{ - struct epoll_event ev; - - if (!event) { - RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n"); - return -1; - } - - /* using per thread epoll fd */ - if (epfd == RTE_EPOLL_PER_THREAD) - epfd = rte_intr_tls_epfd(); - - if (op == EPOLL_CTL_ADD) { - event->status = RTE_EPOLL_VALID; - event->fd = fd; /* ignore fd in event */ - event->epfd = epfd; - ev.data.ptr = (void *)event; - } - - ev.events = event->epdata.event; - if (epoll_ctl(epfd, op, fd, &ev) < 0) { - RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n", - op, fd, strerror(errno)); - if (op == EPOLL_CTL_ADD) - /* rollback status when CTL_ADD fail */ - event->status = RTE_EPOLL_INVALID; - return -1; - } - - if (op == EPOLL_CTL_DEL && event->status != RTE_EPOLL_INVALID) - eal_epoll_data_safe_free(event); - - return 0; -} - -int -rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd, - int op, unsigned int vec, void *data) -{ - struct rte_epoll_event *rev; - struct rte_epoll_data *epdata; - int epfd_op; - unsigned int efd_idx; - int rc = 0; - - efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ? - (vec - RTE_INTR_VEC_RXTX_OFFSET) : vec; - - if (!intr_handle || intr_handle->nb_efd == 0 || - efd_idx >= intr_handle->nb_efd) { - RTE_LOG(ERR, EAL, "Wrong intr vector number.\n"); - return -EPERM; - } - - switch (op) { - case RTE_INTR_EVENT_ADD: - epfd_op = EPOLL_CTL_ADD; - rev = &intr_handle->elist[efd_idx]; - if (rev->status != RTE_EPOLL_INVALID) { - RTE_LOG(INFO, EAL, "Event already been added.\n"); - return -EEXIST; - } - - /* attach to intr vector fd */ - epdata = &rev->epdata; - epdata->event = EPOLLIN | EPOLLPRI | EPOLLET; - epdata->data = data; - epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr; - epdata->cb_arg = (void *)intr_handle; - rc = rte_epoll_ctl(epfd, epfd_op, - intr_handle->efds[efd_idx], rev); - if (!rc) - RTE_LOG(DEBUG, EAL, - "efd %d associated with vec %d added on epfd %d" - "\n", rev->fd, vec, epfd); - else - rc = -EPERM; - break; - case RTE_INTR_EVENT_DEL: - epfd_op = EPOLL_CTL_DEL; - rev = &intr_handle->elist[efd_idx]; - if (rev->status == RTE_EPOLL_INVALID) { - RTE_LOG(INFO, EAL, "Event does not exist.\n"); - return -EPERM; - } - - rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev); - if (rc) - rc = -EPERM; - break; - default: - RTE_LOG(ERR, EAL, "event op type mismatch\n"); - rc = -EPERM; - } - - return rc; -} - -void -rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle) -{ - uint32_t i; - struct rte_epoll_event *rev; - - for (i = 0; i < intr_handle->nb_efd; i++) { - rev = &intr_handle->elist[i]; - if (rev->status == RTE_EPOLL_INVALID) - continue; - if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) { - /* force free if the entry valid */ - eal_epoll_data_safe_free(rev); - rev->status = RTE_EPOLL_INVALID; - } - } -} - -int -rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd) -{ - uint32_t i; - int fd; - uint32_t n = RTE_MIN(nb_efd, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID); - - assert(nb_efd != 0); - - if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX) { - for (i = 0; i < n; i++) { - fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); - if (fd < 0) { - RTE_LOG(ERR, EAL, - "can't setup eventfd, error %i (%s)\n", - errno, strerror(errno)); - return -errno; - } - intr_handle->efds[i] = 
fd; - } - intr_handle->nb_efd = n; - intr_handle->max_intr = NB_OTHER_INTR + n; - } else if (intr_handle->type == RTE_INTR_HANDLE_VDEV) { - /* only check, initialization would be done in vdev driver.*/ - if (intr_handle->efd_counter_size > - sizeof(union rte_intr_read_buffer)) { - RTE_LOG(ERR, EAL, "the efd_counter_size is oversized"); - return -EINVAL; - } - } else { - intr_handle->efds[0] = intr_handle->fd; - intr_handle->nb_efd = RTE_MIN(nb_efd, 1U); - intr_handle->max_intr = NB_OTHER_INTR; - } - - return 0; -} - -void -rte_intr_efd_disable(struct rte_intr_handle *intr_handle) -{ - uint32_t i; - - rte_intr_free_epoll_fd(intr_handle); - if (intr_handle->max_intr > intr_handle->nb_efd) { - for (i = 0; i < intr_handle->nb_efd; i++) - close(intr_handle->efds[i]); - } - intr_handle->nb_efd = 0; - intr_handle->max_intr = 0; -} - -int -rte_intr_dp_is_en(struct rte_intr_handle *intr_handle) -{ - return !(!intr_handle->nb_efd); -} - -int -rte_intr_allow_others(struct rte_intr_handle *intr_handle) -{ - if (!rte_intr_dp_is_en(intr_handle)) - return 1; - else - return !!(intr_handle->max_intr - intr_handle->nb_efd); -} - -int -rte_intr_cap_multiple(struct rte_intr_handle *intr_handle) -{ - if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX) - return 1; - - if (intr_handle->type == RTE_INTR_HANDLE_VDEV) - return 1; - - return 0; -} - -int rte_thread_is_intr(void) -{ - return pthread_equal(intr_thread, pthread_self()); -} diff --git a/lib/librte_eal/linux/eal/eal_lcore.c b/lib/librte_eal/linux/eal/eal_lcore.c deleted file mode 100644 index bc8965844c..0000000000 --- a/lib/librte_eal/linux/eal/eal_lcore.c +++ /dev/null @@ -1,81 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2014 Intel Corporation - */ - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "eal_private.h" -#include "eal_filesystem.h" -#include "eal_thread.h" - -#define SYS_CPU_DIR "/sys/devices/system/cpu/cpu%u" -#define CORE_ID_FILE "topology/core_id" -#define NUMA_NODE_PATH "/sys/devices/system/node" - -/* Check if a cpu is present by the presence of the cpu information for it */ -int -eal_cpu_detected(unsigned lcore_id) -{ - char path[PATH_MAX]; - int len = snprintf(path, sizeof(path), SYS_CPU_DIR - "/"CORE_ID_FILE, lcore_id); - if (len <= 0 || (unsigned)len >= sizeof(path)) - return 0; - if (access(path, F_OK) != 0) - return 0; - - return 1; -} - -/* - * Get CPU socket id (NUMA node) for a logical core. - * - * This searches each nodeX directories in /sys for the symlink for the given - * lcore_id and returns the numa node where the lcore is found. If lcore is not - * found on any numa node, returns zero. 
- */ -unsigned -eal_cpu_socket_id(unsigned lcore_id) -{ - unsigned socket; - - for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) { - char path[PATH_MAX]; - - snprintf(path, sizeof(path), "%s/node%u/cpu%u", NUMA_NODE_PATH, - socket, lcore_id); - if (access(path, F_OK) == 0) - return socket; - } - return 0; -} - -/* Get the cpu core id value from the /sys/.../cpuX core_id value */ -unsigned -eal_cpu_core_id(unsigned lcore_id) -{ - char path[PATH_MAX]; - unsigned long id; - - int len = snprintf(path, sizeof(path), SYS_CPU_DIR "/%s", lcore_id, CORE_ID_FILE); - if (len <= 0 || (unsigned)len >= sizeof(path)) - goto err; - if (eal_parse_sysfs_value(path, &id) != 0) - goto err; - return (unsigned)id; - -err: - RTE_LOG(ERR, EAL, "Error reading core id value from %s " - "for lcore %u - assuming core 0\n", SYS_CPU_DIR, lcore_id); - return 0; -} diff --git a/lib/librte_eal/linux/eal/eal_log.c b/lib/librte_eal/linux/eal/eal_log.c deleted file mode 100644 index 9d02dddbed..0000000000 --- a/lib/librte_eal/linux/eal/eal_log.c +++ /dev/null @@ -1,62 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2014 Intel Corporation - */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "eal_private.h" - -/* - * default log function - */ -static ssize_t -console_log_write(__attribute__((unused)) void *c, const char *buf, size_t size) -{ - ssize_t ret; - - /* write on stdout */ - ret = fwrite(buf, 1, size, stdout); - fflush(stdout); - - /* Syslog error levels are from 0 to 7, so subtract 1 to convert */ - syslog(rte_log_cur_msg_loglevel() - 1, "%.*s", (int)size, buf); - - return ret; -} - -static cookie_io_functions_t console_log_func = { - .write = console_log_write, -}; - -/* - * set the log to default function, called during eal init process, - * once memzones are available. - */ -int -rte_eal_log_init(const char *id, int facility) -{ - FILE *log_stream; - - log_stream = fopencookie(NULL, "w+", console_log_func); - if (log_stream == NULL) - return -1; - - openlog(id, LOG_NDELAY | LOG_PID, facility); - - eal_log_set_default(log_stream); - - return 0; -} diff --git a/lib/librte_eal/linux/eal/eal_memalloc.c b/lib/librte_eal/linux/eal/eal_memalloc.c deleted file mode 100644 index af6d0d023a..0000000000 --- a/lib/librte_eal/linux/eal/eal_memalloc.c +++ /dev/null @@ -1,1604 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2017-2018 Intel Corporation - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */ -#include -#define MEMFD_SUPPORTED -#endif -#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES -#include -#include -#endif -#include -#include /* for hugetlb-related mmap flags */ - -#include -#include -#include -#include -#include -#include - -#include "eal_filesystem.h" -#include "eal_internal_cfg.h" -#include "eal_memalloc.h" -#include "eal_memcfg.h" -#include "eal_private.h" - -const int anonymous_hugepages_supported = -#ifdef MAP_HUGE_SHIFT - 1; -#define RTE_MAP_HUGE_SHIFT MAP_HUGE_SHIFT -#else - 0; -#define RTE_MAP_HUGE_SHIFT 26 -#endif - -/* - * we've already checked memfd support at compile-time, but we also need to - * check if we can create hugepage files with memfd. 
- * - * also, this is not a constant, because while we may be *compiled* with memfd - * hugetlbfs support, we might not be *running* on a system that supports memfd - * and/or memfd with hugetlbfs, so we need to be able to adjust this flag at - * runtime, and fall back to anonymous memory. - */ -static int memfd_create_supported = -#ifdef MFD_HUGETLB - 1; -#define RTE_MFD_HUGETLB MFD_HUGETLB -#else - 0; -#define RTE_MFD_HUGETLB 4U -#endif - -/* - * not all kernel version support fallocate on hugetlbfs, so fall back to - * ftruncate and disallow deallocation if fallocate is not supported. - */ -static int fallocate_supported = -1; /* unknown */ - -/* - * we have two modes - single file segments, and file-per-page mode. - * - * for single-file segments, we use memseg_list_fd to store the segment fd, - * while the fds[] will not be allocated, and len will be set to 0. - * - * for file-per-page mode, each page will have its own fd, so 'memseg_list_fd' - * will be invalid (set to -1), and we'll use 'fds' to keep track of page fd's. - * - * we cannot know how many pages a system will have in advance, but we do know - * that they come in lists, and we know lengths of these lists. so, simply store - * a malloc'd array of fd's indexed by list and segment index. - * - * they will be initialized at startup, and filled as we allocate/deallocate - * segments. - */ -static struct { - int *fds; /**< dynamically allocated array of segment lock fd's */ - int memseg_list_fd; /**< memseg list fd */ - int len; /**< total length of the array */ - int count; /**< entries used in an array */ -} fd_list[RTE_MAX_MEMSEG_LISTS]; - -/** local copy of a memory map, used to synchronize memory hotplug in MP */ -static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS]; - -static sigjmp_buf huge_jmpenv; - -static void __rte_unused huge_sigbus_handler(int signo __rte_unused) -{ - siglongjmp(huge_jmpenv, 1); -} - -/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile, - * non-static local variable in the stack frame calling sigsetjmp might be - * clobbered by a call to longjmp. - */ -static int __rte_unused huge_wrap_sigsetjmp(void) -{ - return sigsetjmp(huge_jmpenv, 1); -} - -static struct sigaction huge_action_old; -static int huge_need_recover; - -static void __rte_unused -huge_register_sigbus(void) -{ - sigset_t mask; - struct sigaction action; - - sigemptyset(&mask); - sigaddset(&mask, SIGBUS); - action.sa_flags = 0; - action.sa_mask = mask; - action.sa_handler = huge_sigbus_handler; - - huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old); -} - -static void __rte_unused -huge_recover_sigbus(void) -{ - if (huge_need_recover) { - sigaction(SIGBUS, &huge_action_old, NULL); - huge_need_recover = 0; - } -} - -#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES -static bool -check_numa(void) -{ - bool ret = true; - /* Check if kernel supports NUMA. */ - if (numa_available() != 0) { - RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n"); - ret = false; - } - return ret; -} - -static void -prepare_numa(int *oldpolicy, struct bitmask *oldmask, int socket_id) -{ - RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n"); - if (get_mempolicy(oldpolicy, oldmask->maskp, - oldmask->size + 1, 0, 0) < 0) { - RTE_LOG(ERR, EAL, - "Failed to get current mempolicy: %s. 
" - "Assuming MPOL_DEFAULT.\n", strerror(errno)); - *oldpolicy = MPOL_DEFAULT; - } - RTE_LOG(DEBUG, EAL, - "Setting policy MPOL_PREFERRED for socket %d\n", - socket_id); - numa_set_preferred(socket_id); -} - -static void -restore_numa(int *oldpolicy, struct bitmask *oldmask) -{ - RTE_LOG(DEBUG, EAL, - "Restoring previous memory policy: %d\n", *oldpolicy); - if (*oldpolicy == MPOL_DEFAULT) { - numa_set_localalloc(); - } else if (set_mempolicy(*oldpolicy, oldmask->maskp, - oldmask->size + 1) < 0) { - RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n", - strerror(errno)); - numa_set_localalloc(); - } - numa_free_cpumask(oldmask); -} -#endif - -/* - * uses fstat to report the size of a file on disk - */ -static off_t -get_file_size(int fd) -{ - struct stat st; - if (fstat(fd, &st) < 0) - return 0; - return st.st_size; -} - -static int -pagesz_flags(uint64_t page_sz) -{ - /* as per mmap() manpage, all page sizes are log2 of page size - * shifted by MAP_HUGE_SHIFT - */ - int log2 = rte_log2_u64(page_sz); - return log2 << RTE_MAP_HUGE_SHIFT; -} - -/* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */ -static int lock(int fd, int type) -{ - int ret; - - /* flock may be interrupted */ - do { - ret = flock(fd, type | LOCK_NB); - } while (ret && errno == EINTR); - - if (ret && errno == EWOULDBLOCK) { - /* couldn't lock */ - return 0; - } else if (ret) { - RTE_LOG(ERR, EAL, "%s(): error calling flock(): %s\n", - __func__, strerror(errno)); - return -1; - } - /* lock was successful */ - return 1; -} - -static int -get_seg_memfd(struct hugepage_info *hi __rte_unused, - unsigned int list_idx __rte_unused, - unsigned int seg_idx __rte_unused) -{ -#ifdef MEMFD_SUPPORTED - int fd; - char segname[250]; /* as per manpage, limit is 249 bytes plus null */ - - int flags = RTE_MFD_HUGETLB | pagesz_flags(hi->hugepage_sz); - - if (internal_config.single_file_segments) { - fd = fd_list[list_idx].memseg_list_fd; - - if (fd < 0) { - snprintf(segname, sizeof(segname), "seg_%i", list_idx); - fd = memfd_create(segname, flags); - if (fd < 0) { - RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n", - __func__, strerror(errno)); - return -1; - } - fd_list[list_idx].memseg_list_fd = fd; - } - } else { - fd = fd_list[list_idx].fds[seg_idx]; - - if (fd < 0) { - snprintf(segname, sizeof(segname), "seg_%i-%i", - list_idx, seg_idx); - fd = memfd_create(segname, flags); - if (fd < 0) { - RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n", - __func__, strerror(errno)); - return -1; - } - fd_list[list_idx].fds[seg_idx] = fd; - } - } - return fd; -#endif - return -1; -} - -static int -get_seg_fd(char *path, int buflen, struct hugepage_info *hi, - unsigned int list_idx, unsigned int seg_idx) -{ - int fd; - - /* for in-memory mode, we only make it here when we're sure we support - * memfd, and this is a special case. 
- */ - if (internal_config.in_memory) - return get_seg_memfd(hi, list_idx, seg_idx); - - if (internal_config.single_file_segments) { - /* create a hugepage file path */ - eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx); - - fd = fd_list[list_idx].memseg_list_fd; - - if (fd < 0) { - fd = open(path, O_CREAT | O_RDWR, 0600); - if (fd < 0) { - RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", - __func__, strerror(errno)); - return -1; - } - /* take out a read lock and keep it indefinitely */ - if (lock(fd, LOCK_SH) < 0) { - RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n", - __func__, strerror(errno)); - close(fd); - return -1; - } - fd_list[list_idx].memseg_list_fd = fd; - } - } else { - /* create a hugepage file path */ - eal_get_hugefile_path(path, buflen, hi->hugedir, - list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx); - - fd = fd_list[list_idx].fds[seg_idx]; - - if (fd < 0) { - fd = open(path, O_CREAT | O_RDWR, 0600); - if (fd < 0) { - RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", - __func__, strerror(errno)); - return -1; - } - /* take out a read lock */ - if (lock(fd, LOCK_SH) < 0) { - RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n", - __func__, strerror(errno)); - close(fd); - return -1; - } - fd_list[list_idx].fds[seg_idx] = fd; - } - } - return fd; -} - -static int -resize_hugefile_in_memory(int fd, uint64_t fa_offset, - uint64_t page_sz, bool grow) -{ - int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE | - FALLOC_FL_KEEP_SIZE; - int ret; - - /* grow or shrink the file */ - ret = fallocate(fd, flags, fa_offset, page_sz); - - if (ret < 0) { - RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n", - __func__, - strerror(errno)); - return -1; - } - return 0; -} - -static int -resize_hugefile_in_filesystem(int fd, uint64_t fa_offset, uint64_t page_sz, - bool grow) -{ - bool again = false; - - do { - if (fallocate_supported == 0) { - /* we cannot deallocate memory if fallocate() is not - * supported, and hugepage file is already locked at - * creation, so no further synchronization needed. - */ - - if (!grow) { - RTE_LOG(DEBUG, EAL, "%s(): fallocate not supported, not freeing page back to the system\n", - __func__); - return -1; - } - uint64_t new_size = fa_offset + page_sz; - uint64_t cur_size = get_file_size(fd); - - /* fallocate isn't supported, fall back to ftruncate */ - if (new_size > cur_size && - ftruncate(fd, new_size) < 0) { - RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n", - __func__, strerror(errno)); - return -1; - } - } else { - int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE | - FALLOC_FL_KEEP_SIZE; - int ret; - - /* - * technically, it is perfectly safe for both primary - * and secondary to grow and shrink the page files: - * growing the file repeatedly has no effect because - * a page can only be allocated once, while mmap ensures - * that secondaries hold on to the page even after the - * page itself is removed from the filesystem. - * - * however, leaving growing/shrinking to the primary - * tends to expose bugs in fdlist page count handling, - * so leave this here just in case. 
- */ - if (rte_eal_process_type() != RTE_PROC_PRIMARY) - return 0; - - /* grow or shrink the file */ - ret = fallocate(fd, flags, fa_offset, page_sz); - - if (ret < 0) { - if (fallocate_supported == -1 && - errno == ENOTSUP) { - RTE_LOG(ERR, EAL, "%s(): fallocate() not supported, hugepage deallocation will be disabled\n", - __func__); - again = true; - fallocate_supported = 0; - } else { - RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n", - __func__, - strerror(errno)); - return -1; - } - } else - fallocate_supported = 1; - } - } while (again); - - return 0; -} - -static void -close_hugefile(int fd, char *path, int list_idx) -{ - /* - * primary process must unlink the file, but only when not in in-memory - * mode (as in that case there is no file to unlink). - */ - if (!internal_config.in_memory && - rte_eal_process_type() == RTE_PROC_PRIMARY && - unlink(path)) - RTE_LOG(ERR, EAL, "%s(): unlinking '%s' failed: %s\n", - __func__, path, strerror(errno)); - - close(fd); - fd_list[list_idx].memseg_list_fd = -1; -} - -static int -resize_hugefile(int fd, uint64_t fa_offset, uint64_t page_sz, bool grow) -{ - /* in-memory mode is a special case, because we can be sure that - * fallocate() is supported. - */ - if (internal_config.in_memory) - return resize_hugefile_in_memory(fd, fa_offset, - page_sz, grow); - - return resize_hugefile_in_filesystem(fd, fa_offset, page_sz, - grow); -} - -static int -alloc_seg(struct rte_memseg *ms, void *addr, int socket_id, - struct hugepage_info *hi, unsigned int list_idx, - unsigned int seg_idx) -{ -#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES - int cur_socket_id = 0; -#endif - uint64_t map_offset; - rte_iova_t iova; - void *va; - char path[PATH_MAX]; - int ret = 0; - int fd; - size_t alloc_sz; - int flags; - void *new_addr; - - alloc_sz = hi->hugepage_sz; - - /* these are checked at init, but code analyzers don't know that */ - if (internal_config.in_memory && !anonymous_hugepages_supported) { - RTE_LOG(ERR, EAL, "Anonymous hugepages not supported, in-memory mode cannot allocate memory\n"); - return -1; - } - if (internal_config.in_memory && !memfd_create_supported && - internal_config.single_file_segments) { - RTE_LOG(ERR, EAL, "Single-file segments are not supported without memfd support\n"); - return -1; - } - - /* in-memory without memfd is a special case */ - int mmap_flags; - - if (internal_config.in_memory && !memfd_create_supported) { - const int in_memory_flags = MAP_HUGETLB | MAP_FIXED | - MAP_PRIVATE | MAP_ANONYMOUS; - int pagesz_flag; - - pagesz_flag = pagesz_flags(alloc_sz); - fd = -1; - mmap_flags = in_memory_flags | pagesz_flag; - - /* single-file segments codepath will never be active - * here because in-memory mode is incompatible with the - * fallback path, and it's stopped at EAL initialization - * stage. 
- */ - map_offset = 0; - } else { - /* takes out a read lock on segment or segment list */ - fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx); - if (fd < 0) { - RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n"); - return -1; - } - - if (internal_config.single_file_segments) { - map_offset = seg_idx * alloc_sz; - ret = resize_hugefile(fd, map_offset, alloc_sz, true); - if (ret < 0) - goto resized; - - fd_list[list_idx].count++; - } else { - map_offset = 0; - if (ftruncate(fd, alloc_sz) < 0) { - RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n", - __func__, strerror(errno)); - goto resized; - } - if (internal_config.hugepage_unlink && - !internal_config.in_memory) { - if (unlink(path)) { - RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n", - __func__, strerror(errno)); - goto resized; - } - } - } - mmap_flags = MAP_SHARED | MAP_POPULATE | MAP_FIXED; - } - - /* - * map the segment, and populate page tables, the kernel fills - * this segment with zeros if it's a new page. - */ - va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, mmap_flags, fd, - map_offset); - - if (va == MAP_FAILED) { - RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__, - strerror(errno)); - /* mmap failed, but the previous region might have been - * unmapped anyway. try to remap it - */ - goto unmapped; - } - if (va != addr) { - RTE_LOG(DEBUG, EAL, "%s(): wrong mmap() address\n", __func__); - munmap(va, alloc_sz); - goto resized; - } - - /* In linux, hugetlb limitations, like cgroup, are - * enforced at fault time instead of mmap(), even - * with the option of MAP_POPULATE. Kernel will send - * a SIGBUS signal. To avoid to be killed, save stack - * environment here, if SIGBUS happens, we can jump - * back here. - */ - if (huge_wrap_sigsetjmp()) { - RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more hugepages of size %uMB\n", - (unsigned int)(alloc_sz >> 20)); - goto mapped; - } - - /* we need to trigger a write to the page to enforce page fault and - * ensure that page is accessible to us, but we can't overwrite value - * that is already there, so read the old value, and write itback. - * kernel populates the page with zeroes initially. - */ - *(volatile int *)addr = *(volatile int *)addr; - - iova = rte_mem_virt2iova(addr); - if (iova == RTE_BAD_PHYS_ADDR) { - RTE_LOG(DEBUG, EAL, "%s(): can't get IOVA addr\n", - __func__); - goto mapped; - } - -#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES - ret = get_mempolicy(&cur_socket_id, NULL, 0, addr, - MPOL_F_NODE | MPOL_F_ADDR); - if (ret < 0) { - RTE_LOG(DEBUG, EAL, "%s(): get_mempolicy: %s\n", - __func__, strerror(errno)); - goto mapped; - } else if (cur_socket_id != socket_id) { - RTE_LOG(DEBUG, EAL, - "%s(): allocation happened on wrong socket (wanted %d, got %d)\n", - __func__, socket_id, cur_socket_id); - goto mapped; - } -#else - if (rte_socket_count() > 1) - RTE_LOG(DEBUG, EAL, "%s(): not checking hugepage NUMA node.\n", - __func__); -#endif - - ms->addr = addr; - ms->hugepage_sz = alloc_sz; - ms->len = alloc_sz; - ms->nchannel = rte_memory_get_nchannel(); - ms->nrank = rte_memory_get_nrank(); - ms->iova = iova; - ms->socket_id = socket_id; - - return 0; - -mapped: - munmap(addr, alloc_sz); -unmapped: - flags = MAP_FIXED; - new_addr = eal_get_virtual_area(addr, &alloc_sz, alloc_sz, 0, flags); - if (new_addr != addr) { - if (new_addr != NULL) - munmap(new_addr, alloc_sz); - /* we're leaving a hole in our virtual address space. if - * somebody else maps this hole now, we could accidentally - * override it in the future. 
- */ - RTE_LOG(CRIT, EAL, "Can't mmap holes in our virtual address space\n"); - } - /* roll back the ref count */ - if (internal_config.single_file_segments) - fd_list[list_idx].count--; -resized: - /* some codepaths will return negative fd, so exit early */ - if (fd < 0) - return -1; - - if (internal_config.single_file_segments) { - resize_hugefile(fd, map_offset, alloc_sz, false); - /* ignore failure, can't make it any worse */ - - /* if refcount is at zero, close the file */ - if (fd_list[list_idx].count == 0) - close_hugefile(fd, path, list_idx); - } else { - /* only remove file if we can take out a write lock */ - if (internal_config.hugepage_unlink == 0 && - internal_config.in_memory == 0 && - lock(fd, LOCK_EX) == 1) - unlink(path); - close(fd); - fd_list[list_idx].fds[seg_idx] = -1; - } - return -1; -} - -static int -free_seg(struct rte_memseg *ms, struct hugepage_info *hi, - unsigned int list_idx, unsigned int seg_idx) -{ - uint64_t map_offset; - char path[PATH_MAX]; - int fd, ret = 0; - bool exit_early; - - /* erase page data */ - memset(ms->addr, 0, ms->len); - - if (mmap(ms->addr, ms->len, PROT_READ, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) == - MAP_FAILED) { - RTE_LOG(DEBUG, EAL, "couldn't unmap page\n"); - return -1; - } - - exit_early = false; - - /* if we're using anonymous hugepages, nothing to be done */ - if (internal_config.in_memory && !memfd_create_supported) - exit_early = true; - - /* if we've already unlinked the page, nothing needs to be done */ - if (!internal_config.in_memory && internal_config.hugepage_unlink) - exit_early = true; - - if (exit_early) { - memset(ms, 0, sizeof(*ms)); - return 0; - } - - /* if we are not in single file segments mode, we're going to unmap the - * segment and thus drop the lock on original fd, but hugepage dir is - * now locked so we can take out another one without races. - */ - fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx); - if (fd < 0) - return -1; - - if (internal_config.single_file_segments) { - map_offset = seg_idx * ms->len; - if (resize_hugefile(fd, map_offset, ms->len, false)) - return -1; - - if (--(fd_list[list_idx].count) == 0) - close_hugefile(fd, path, list_idx); - - ret = 0; - } else { - /* if we're able to take out a write lock, we're the last one - * holding onto this page. - */ - if (!internal_config.in_memory) { - ret = lock(fd, LOCK_EX); - if (ret >= 0) { - /* no one else is using this page */ - if (ret == 1) - unlink(path); - } - } - /* closing fd will drop the lock */ - close(fd); - fd_list[list_idx].fds[seg_idx] = -1; - } - - memset(ms, 0, sizeof(*ms)); - - return ret < 0 ? 
-1 : 0; -} - -struct alloc_walk_param { - struct hugepage_info *hi; - struct rte_memseg **ms; - size_t page_sz; - unsigned int segs_allocated; - unsigned int n_segs; - int socket; - bool exact; -}; -static int -alloc_seg_walk(const struct rte_memseg_list *msl, void *arg) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - struct alloc_walk_param *wa = arg; - struct rte_memseg_list *cur_msl; - size_t page_sz; - int cur_idx, start_idx, j, dir_fd = -1; - unsigned int msl_idx, need, i; - - if (msl->page_sz != wa->page_sz) - return 0; - if (msl->socket_id != wa->socket) - return 0; - - page_sz = (size_t)msl->page_sz; - - msl_idx = msl - mcfg->memsegs; - cur_msl = &mcfg->memsegs[msl_idx]; - - need = wa->n_segs; - - /* try finding space in memseg list */ - if (wa->exact) { - /* if we require exact number of pages in a list, find them */ - cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0, - need); - if (cur_idx < 0) - return 0; - start_idx = cur_idx; - } else { - int cur_len; - - /* we don't require exact number of pages, so we're going to go - * for best-effort allocation. that means finding the biggest - * unused block, and going with that. - */ - cur_idx = rte_fbarray_find_biggest_free(&cur_msl->memseg_arr, - 0); - if (cur_idx < 0) - return 0; - start_idx = cur_idx; - /* adjust the size to possibly be smaller than original - * request, but do not allow it to be bigger. - */ - cur_len = rte_fbarray_find_contig_free(&cur_msl->memseg_arr, - cur_idx); - need = RTE_MIN(need, (unsigned int)cur_len); - } - - /* do not allow any page allocations during the time we're allocating, - * because file creation and locking operations are not atomic, - * and we might be the first or the last ones to use a particular page, - * so we need to ensure atomicity of every operation. - * - * during init, we already hold a write lock, so don't try to take out - * another one. - */ - if (wa->hi->lock_descriptor == -1 && !internal_config.in_memory) { - dir_fd = open(wa->hi->hugedir, O_RDONLY); - if (dir_fd < 0) { - RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", - __func__, wa->hi->hugedir, strerror(errno)); - return -1; - } - /* blocking writelock */ - if (flock(dir_fd, LOCK_EX)) { - RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", - __func__, wa->hi->hugedir, strerror(errno)); - close(dir_fd); - return -1; - } - } - - for (i = 0; i < need; i++, cur_idx++) { - struct rte_memseg *cur; - void *map_addr; - - cur = rte_fbarray_get(&cur_msl->memseg_arr, cur_idx); - map_addr = RTE_PTR_ADD(cur_msl->base_va, - cur_idx * page_sz); - - if (alloc_seg(cur, map_addr, wa->socket, wa->hi, - msl_idx, cur_idx)) { - RTE_LOG(DEBUG, EAL, "attempted to allocate %i segments, but only %i were allocated\n", - need, i); - - /* if exact number wasn't requested, stop */ - if (!wa->exact) - goto out; - - /* clean up */ - for (j = start_idx; j < cur_idx; j++) { - struct rte_memseg *tmp; - struct rte_fbarray *arr = - &cur_msl->memseg_arr; - - tmp = rte_fbarray_get(arr, j); - rte_fbarray_set_free(arr, j); - - /* free_seg may attempt to create a file, which - * may fail. 
- */ - if (free_seg(tmp, wa->hi, msl_idx, j)) - RTE_LOG(DEBUG, EAL, "Cannot free page\n"); - } - /* clear the list */ - if (wa->ms) - memset(wa->ms, 0, sizeof(*wa->ms) * wa->n_segs); - - if (dir_fd >= 0) - close(dir_fd); - return -1; - } - if (wa->ms) - wa->ms[i] = cur; - - rte_fbarray_set_used(&cur_msl->memseg_arr, cur_idx); - } -out: - wa->segs_allocated = i; - if (i > 0) - cur_msl->version++; - if (dir_fd >= 0) - close(dir_fd); - /* if we didn't allocate any segments, move on to the next list */ - return i > 0; -} - -struct free_walk_param { - struct hugepage_info *hi; - struct rte_memseg *ms; -}; -static int -free_seg_walk(const struct rte_memseg_list *msl, void *arg) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - struct rte_memseg_list *found_msl; - struct free_walk_param *wa = arg; - uintptr_t start_addr, end_addr; - int msl_idx, seg_idx, ret, dir_fd = -1; - - start_addr = (uintptr_t) msl->base_va; - end_addr = start_addr + msl->len; - - if ((uintptr_t)wa->ms->addr < start_addr || - (uintptr_t)wa->ms->addr >= end_addr) - return 0; - - msl_idx = msl - mcfg->memsegs; - seg_idx = RTE_PTR_DIFF(wa->ms->addr, start_addr) / msl->page_sz; - - /* msl is const */ - found_msl = &mcfg->memsegs[msl_idx]; - - /* do not allow any page allocations during the time we're freeing, - * because file creation and locking operations are not atomic, - * and we might be the first or the last ones to use a particular page, - * so we need to ensure atomicity of every operation. - * - * during init, we already hold a write lock, so don't try to take out - * another one. - */ - if (wa->hi->lock_descriptor == -1 && !internal_config.in_memory) { - dir_fd = open(wa->hi->hugedir, O_RDONLY); - if (dir_fd < 0) { - RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", - __func__, wa->hi->hugedir, strerror(errno)); - return -1; - } - /* blocking writelock */ - if (flock(dir_fd, LOCK_EX)) { - RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", - __func__, wa->hi->hugedir, strerror(errno)); - close(dir_fd); - return -1; - } - } - - found_msl->version++; - - rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx); - - ret = free_seg(wa->ms, wa->hi, msl_idx, seg_idx); - - if (dir_fd >= 0) - close(dir_fd); - - if (ret < 0) - return -1; - - return 1; -} - -int -eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz, - int socket, bool exact) -{ - int i, ret = -1; -#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES - bool have_numa = false; - int oldpolicy; - struct bitmask *oldmask; -#endif - struct alloc_walk_param wa; - struct hugepage_info *hi = NULL; - - memset(&wa, 0, sizeof(wa)); - - /* dynamic allocation not supported in legacy mode */ - if (internal_config.legacy_mem) - return -1; - - for (i = 0; i < (int) RTE_DIM(internal_config.hugepage_info); i++) { - if (page_sz == - internal_config.hugepage_info[i].hugepage_sz) { - hi = &internal_config.hugepage_info[i]; - break; - } - } - if (!hi) { - RTE_LOG(ERR, EAL, "%s(): can't find relevant hugepage_info entry\n", - __func__); - return -1; - } - -#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES - if (check_numa()) { - oldmask = numa_allocate_nodemask(); - prepare_numa(&oldpolicy, oldmask, socket); - have_numa = true; - } -#endif - - wa.exact = exact; - wa.hi = hi; - wa.ms = ms; - wa.n_segs = n_segs; - wa.page_sz = page_sz; - wa.socket = socket; - wa.segs_allocated = 0; - - /* memalloc is locked, so it's safe to use thread-unsafe version */ - ret = rte_memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa); - if (ret == 0) { - RTE_LOG(ERR, EAL, "%s(): 
couldn't find suitable memseg_list\n", - __func__); - ret = -1; - } else if (ret > 0) { - ret = (int)wa.segs_allocated; - } - -#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES - if (have_numa) - restore_numa(&oldpolicy, oldmask); -#endif - return ret; -} - -struct rte_memseg * -eal_memalloc_alloc_seg(size_t page_sz, int socket) -{ - struct rte_memseg *ms; - if (eal_memalloc_alloc_seg_bulk(&ms, 1, page_sz, socket, true) < 0) - return NULL; - /* return pointer to newly allocated memseg */ - return ms; -} - -int -eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs) -{ - int seg, ret = 0; - - /* dynamic free not supported in legacy mode */ - if (internal_config.legacy_mem) - return -1; - - for (seg = 0; seg < n_segs; seg++) { - struct rte_memseg *cur = ms[seg]; - struct hugepage_info *hi = NULL; - struct free_walk_param wa; - int i, walk_res; - - /* if this page is marked as unfreeable, fail */ - if (cur->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) { - RTE_LOG(DEBUG, EAL, "Page is not allowed to be freed\n"); - ret = -1; - continue; - } - - memset(&wa, 0, sizeof(wa)); - - for (i = 0; i < (int)RTE_DIM(internal_config.hugepage_info); - i++) { - hi = &internal_config.hugepage_info[i]; - if (cur->hugepage_sz == hi->hugepage_sz) - break; - } - if (i == (int)RTE_DIM(internal_config.hugepage_info)) { - RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n"); - ret = -1; - continue; - } - - wa.ms = cur; - wa.hi = hi; - - /* memalloc is locked, so it's safe to use thread-unsafe version - */ - walk_res = rte_memseg_list_walk_thread_unsafe(free_seg_walk, - &wa); - if (walk_res == 1) - continue; - if (walk_res == 0) - RTE_LOG(ERR, EAL, "Couldn't find memseg list\n"); - ret = -1; - } - return ret; -} - -int -eal_memalloc_free_seg(struct rte_memseg *ms) -{ - /* dynamic free not supported in legacy mode */ - if (internal_config.legacy_mem) - return -1; - - return eal_memalloc_free_seg_bulk(&ms, 1); -} - -static int -sync_chunk(struct rte_memseg_list *primary_msl, - struct rte_memseg_list *local_msl, struct hugepage_info *hi, - unsigned int msl_idx, bool used, int start, int end) -{ - struct rte_fbarray *l_arr, *p_arr; - int i, ret, chunk_len, diff_len; - - l_arr = &local_msl->memseg_arr; - p_arr = &primary_msl->memseg_arr; - - /* we need to aggregate allocations/deallocations into bigger chunks, - * as we don't want to spam the user with per-page callbacks. - * - * to avoid any potential issues, we also want to trigger - * deallocation callbacks *before* we actually deallocate - * memory, so that the user application could wrap up its use - * before it goes away. - */ - - chunk_len = end - start; - - /* find how many contiguous pages we can map/unmap for this chunk */ - diff_len = used ? 
- rte_fbarray_find_contig_free(l_arr, start) : - rte_fbarray_find_contig_used(l_arr, start); - - /* has to be at least one page */ - if (diff_len < 1) - return -1; - - diff_len = RTE_MIN(chunk_len, diff_len); - - /* if we are freeing memory, notify the application */ - if (!used) { - struct rte_memseg *ms; - void *start_va; - size_t len, page_sz; - - ms = rte_fbarray_get(l_arr, start); - start_va = ms->addr; - page_sz = (size_t)primary_msl->page_sz; - len = page_sz * diff_len; - - eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, - start_va, len); - } - - for (i = 0; i < diff_len; i++) { - struct rte_memseg *p_ms, *l_ms; - int seg_idx = start + i; - - l_ms = rte_fbarray_get(l_arr, seg_idx); - p_ms = rte_fbarray_get(p_arr, seg_idx); - - if (l_ms == NULL || p_ms == NULL) - return -1; - - if (used) { - ret = alloc_seg(l_ms, p_ms->addr, - p_ms->socket_id, hi, - msl_idx, seg_idx); - if (ret < 0) - return -1; - rte_fbarray_set_used(l_arr, seg_idx); - } else { - ret = free_seg(l_ms, hi, msl_idx, seg_idx); - rte_fbarray_set_free(l_arr, seg_idx); - if (ret < 0) - return -1; - } - } - - /* if we just allocated memory, notify the application */ - if (used) { - struct rte_memseg *ms; - void *start_va; - size_t len, page_sz; - - ms = rte_fbarray_get(l_arr, start); - start_va = ms->addr; - page_sz = (size_t)primary_msl->page_sz; - len = page_sz * diff_len; - - eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC, - start_va, len); - } - - /* calculate how much we can advance until next chunk */ - diff_len = used ? - rte_fbarray_find_contig_used(l_arr, start) : - rte_fbarray_find_contig_free(l_arr, start); - ret = RTE_MIN(chunk_len, diff_len); - - return ret; -} - -static int -sync_status(struct rte_memseg_list *primary_msl, - struct rte_memseg_list *local_msl, struct hugepage_info *hi, - unsigned int msl_idx, bool used) -{ - struct rte_fbarray *l_arr, *p_arr; - int p_idx, l_chunk_len, p_chunk_len, ret; - int start, end; - - /* this is a little bit tricky, but the basic idea is - walk both lists - * and spot any places where there are discrepancies. walking both lists - * and noting discrepancies in a single go is a hard problem, so we do - * it in two passes - first we spot any places where allocated segments - * mismatch (i.e. ensure that everything that's allocated in the primary - * is also allocated in the secondary), and then we do it by looking at - * free segments instead. - * - * we also need to aggregate changes into chunks, as we have to call - * callbacks per allocation, not per page. - */ - l_arr = &local_msl->memseg_arr; - p_arr = &primary_msl->memseg_arr; - - if (used) - p_idx = rte_fbarray_find_next_used(p_arr, 0); - else - p_idx = rte_fbarray_find_next_free(p_arr, 0); - - while (p_idx >= 0) { - int next_chunk_search_idx; - - if (used) { - p_chunk_len = rte_fbarray_find_contig_used(p_arr, - p_idx); - l_chunk_len = rte_fbarray_find_contig_used(l_arr, - p_idx); - } else { - p_chunk_len = rte_fbarray_find_contig_free(p_arr, - p_idx); - l_chunk_len = rte_fbarray_find_contig_free(l_arr, - p_idx); - } - /* best case scenario - no differences (or bigger, which will be - * fixed during next iteration), look for next chunk - */ - if (l_chunk_len >= p_chunk_len) { - next_chunk_search_idx = p_idx + p_chunk_len; - goto next_chunk; - } - - /* if both chunks start at the same point, skip parts we know - * are identical, and sync the rest. each call to sync_chunk - * will only sync contiguous segments, so we need to call this - * until we are sure there are no more differences in this - * chunk. 
- */ - start = p_idx + l_chunk_len; - end = p_idx + p_chunk_len; - do { - ret = sync_chunk(primary_msl, local_msl, hi, msl_idx, - used, start, end); - start += ret; - } while (start < end && ret >= 0); - /* if ret is negative, something went wrong */ - if (ret < 0) - return -1; - - next_chunk_search_idx = p_idx + p_chunk_len; -next_chunk: - /* skip to end of this chunk */ - if (used) { - p_idx = rte_fbarray_find_next_used(p_arr, - next_chunk_search_idx); - } else { - p_idx = rte_fbarray_find_next_free(p_arr, - next_chunk_search_idx); - } - } - return 0; -} - -static int -sync_existing(struct rte_memseg_list *primary_msl, - struct rte_memseg_list *local_msl, struct hugepage_info *hi, - unsigned int msl_idx) -{ - int ret, dir_fd; - - /* do not allow any page allocations during the time we're allocating, - * because file creation and locking operations are not atomic, - * and we might be the first or the last ones to use a particular page, - * so we need to ensure atomicity of every operation. - */ - dir_fd = open(hi->hugedir, O_RDONLY); - if (dir_fd < 0) { - RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", __func__, - hi->hugedir, strerror(errno)); - return -1; - } - /* blocking writelock */ - if (flock(dir_fd, LOCK_EX)) { - RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", __func__, - hi->hugedir, strerror(errno)); - close(dir_fd); - return -1; - } - - /* ensure all allocated space is the same in both lists */ - ret = sync_status(primary_msl, local_msl, hi, msl_idx, true); - if (ret < 0) - goto fail; - - /* ensure all unallocated space is the same in both lists */ - ret = sync_status(primary_msl, local_msl, hi, msl_idx, false); - if (ret < 0) - goto fail; - - /* update version number */ - local_msl->version = primary_msl->version; - - close(dir_fd); - - return 0; -fail: - close(dir_fd); - return -1; -} - -static int -sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - struct rte_memseg_list *primary_msl, *local_msl; - struct hugepage_info *hi = NULL; - unsigned int i; - int msl_idx; - - if (msl->external) - return 0; - - msl_idx = msl - mcfg->memsegs; - primary_msl = &mcfg->memsegs[msl_idx]; - local_msl = &local_memsegs[msl_idx]; - - for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) { - uint64_t cur_sz = - internal_config.hugepage_info[i].hugepage_sz; - uint64_t msl_sz = primary_msl->page_sz; - if (msl_sz == cur_sz) { - hi = &internal_config.hugepage_info[i]; - break; - } - } - if (!hi) { - RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n"); - return -1; - } - - /* if versions don't match, synchronize everything */ - if (local_msl->version != primary_msl->version && - sync_existing(primary_msl, local_msl, hi, msl_idx)) - return -1; - return 0; -} - - -int -eal_memalloc_sync_with_primary(void) -{ - /* nothing to be done in primary */ - if (rte_eal_process_type() == RTE_PROC_PRIMARY) - return 0; - - /* memalloc is locked, so it's safe to call thread-unsafe version */ - if (rte_memseg_list_walk_thread_unsafe(sync_walk, NULL)) - return -1; - return 0; -} - -static int -secondary_msl_create_walk(const struct rte_memseg_list *msl, - void *arg __rte_unused) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - struct rte_memseg_list *primary_msl, *local_msl; - char name[PATH_MAX]; - int msl_idx, ret; - - if (msl->external) - return 0; - - msl_idx = msl - mcfg->memsegs; - primary_msl = &mcfg->memsegs[msl_idx]; - local_msl = &local_memsegs[msl_idx]; - - /* 
create distinct fbarrays for each secondary */ - snprintf(name, RTE_FBARRAY_NAME_LEN, "%s_%i", - primary_msl->memseg_arr.name, getpid()); - - ret = rte_fbarray_init(&local_msl->memseg_arr, name, - primary_msl->memseg_arr.len, - primary_msl->memseg_arr.elt_sz); - if (ret < 0) { - RTE_LOG(ERR, EAL, "Cannot initialize local memory map\n"); - return -1; - } - local_msl->base_va = primary_msl->base_va; - local_msl->len = primary_msl->len; - - return 0; -} - -static int -alloc_list(int list_idx, int len) -{ - int *data; - int i; - - /* single-file segments mode does not need fd list */ - if (!internal_config.single_file_segments) { - /* ensure we have space to store fd per each possible segment */ - data = malloc(sizeof(int) * len); - if (data == NULL) { - RTE_LOG(ERR, EAL, "Unable to allocate space for file descriptors\n"); - return -1; - } - /* set all fd's as invalid */ - for (i = 0; i < len; i++) - data[i] = -1; - fd_list[list_idx].fds = data; - fd_list[list_idx].len = len; - } else { - fd_list[list_idx].fds = NULL; - fd_list[list_idx].len = 0; - } - - fd_list[list_idx].count = 0; - fd_list[list_idx].memseg_list_fd = -1; - - return 0; -} - -static int -fd_list_create_walk(const struct rte_memseg_list *msl, - void *arg __rte_unused) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - unsigned int len; - int msl_idx; - - if (msl->external) - return 0; - - msl_idx = msl - mcfg->memsegs; - len = msl->memseg_arr.len; - - return alloc_list(msl_idx, len); -} - -int -eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - - /* single file segments mode doesn't support individual segment fd's */ - if (internal_config.single_file_segments) - return -ENOTSUP; - - /* if list is not allocated, allocate it */ - if (fd_list[list_idx].len == 0) { - int len = mcfg->memsegs[list_idx].memseg_arr.len; - - if (alloc_list(list_idx, len) < 0) - return -ENOMEM; - } - fd_list[list_idx].fds[seg_idx] = fd; - - return 0; -} - -int -eal_memalloc_set_seg_list_fd(int list_idx, int fd) -{ - /* non-single file segment mode doesn't support segment list fd's */ - if (!internal_config.single_file_segments) - return -ENOTSUP; - - fd_list[list_idx].memseg_list_fd = fd; - - return 0; -} - -int -eal_memalloc_get_seg_fd(int list_idx, int seg_idx) -{ - int fd; - - if (internal_config.in_memory || internal_config.no_hugetlbfs) { -#ifndef MEMFD_SUPPORTED - /* in in-memory or no-huge mode, we rely on memfd support */ - return -ENOTSUP; -#endif - /* memfd supported, but hugetlbfs memfd may not be */ - if (!internal_config.no_hugetlbfs && !memfd_create_supported) - return -ENOTSUP; - } - - if (internal_config.single_file_segments) { - fd = fd_list[list_idx].memseg_list_fd; - } else if (fd_list[list_idx].len == 0) { - /* list not initialized */ - fd = -1; - } else { - fd = fd_list[list_idx].fds[seg_idx]; - } - if (fd < 0) - return -ENODEV; - return fd; -} - -static int -test_memfd_create(void) -{ -#ifdef MEMFD_SUPPORTED - unsigned int i; - for (i = 0; i < internal_config.num_hugepage_sizes; i++) { - uint64_t pagesz = internal_config.hugepage_info[i].hugepage_sz; - int pagesz_flag = pagesz_flags(pagesz); - int flags; - - flags = pagesz_flag | RTE_MFD_HUGETLB; - int fd = memfd_create("test", flags); - if (fd < 0) { - /* we failed - let memalloc know this isn't working */ - if (errno == EINVAL) { - memfd_create_supported = 0; - return 0; /* not supported */ - } - - /* we got other error - something's wrong */ - return -1; /* error */ - 
} - close(fd); - return 1; /* supported */ - } -#endif - return 0; /* not supported */ -} - -int -eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - - if (internal_config.in_memory || internal_config.no_hugetlbfs) { -#ifndef MEMFD_SUPPORTED - /* in in-memory or no-huge mode, we rely on memfd support */ - return -ENOTSUP; -#endif - /* memfd supported, but hugetlbfs memfd may not be */ - if (!internal_config.no_hugetlbfs && !memfd_create_supported) - return -ENOTSUP; - } - - if (internal_config.single_file_segments) { - size_t pgsz = mcfg->memsegs[list_idx].page_sz; - - /* segment not active? */ - if (fd_list[list_idx].memseg_list_fd < 0) - return -ENOENT; - *offset = pgsz * seg_idx; - } else { - /* fd_list not initialized? */ - if (fd_list[list_idx].len == 0) - return -ENODEV; - - /* segment not active? */ - if (fd_list[list_idx].fds[seg_idx] < 0) - return -ENOENT; - *offset = 0; - } - return 0; -} - -int -eal_memalloc_init(void) -{ - if (rte_eal_process_type() == RTE_PROC_SECONDARY) - if (rte_memseg_list_walk(secondary_msl_create_walk, NULL) < 0) - return -1; - if (rte_eal_process_type() == RTE_PROC_PRIMARY && - internal_config.in_memory) { - int mfd_res = test_memfd_create(); - - if (mfd_res < 0) { - RTE_LOG(ERR, EAL, "Unable to check if memfd is supported\n"); - return -1; - } - if (mfd_res == 1) - RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n"); - else - RTE_LOG(INFO, EAL, "Using memfd is not supported, falling back to anonymous hugepages\n"); - - /* we only support single-file segments mode with in-memory mode - * if we support hugetlbfs with memfd_create. this code will - * test if we do. - */ - if (internal_config.single_file_segments && - mfd_res != 1) { - RTE_LOG(ERR, EAL, "Single-file segments mode cannot be used without memfd support\n"); - return -1; - } - /* this cannot ever happen but better safe than sorry */ - if (!anonymous_hugepages_supported) { - RTE_LOG(ERR, EAL, "Using anonymous memory is not supported\n"); - return -1; - } - } - - /* initialize all of the fd lists */ - if (rte_memseg_list_walk(fd_list_create_walk, NULL)) - return -1; - return 0; -} diff --git a/lib/librte_eal/linux/eal/eal_memory.c b/lib/librte_eal/linux/eal/eal_memory.c deleted file mode 100644 index 7a9c97ff88..0000000000 --- a/lib/librte_eal/linux/eal/eal_memory.c +++ /dev/null @@ -1,2481 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2014 Intel Corporation. - * Copyright(c) 2013 6WIND S.A. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */ -#include -#define MEMFD_SUPPORTED -#endif -#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES -#include -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "eal_private.h" -#include "eal_memalloc.h" -#include "eal_memcfg.h" -#include "eal_internal_cfg.h" -#include "eal_filesystem.h" -#include "eal_hugepages.h" -#include "eal_options.h" - -#define PFN_MASK_SIZE 8 - -/** - * @file - * Huge page mapping under linux - * - * To reserve a big contiguous amount of memory, we use the hugepage - * feature of linux. For that, we need to have hugetlbfs mounted. 
This - * code will create many files in this directory (one per page) and - * map them in virtual memory. For each page, we will retrieve its - * physical address and remap it in order to have a virtual contiguous - * zone as well as a physical contiguous zone. - */ - -static int phys_addrs_available = -1; - -#define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space" - -uint64_t eal_get_baseaddr(void) -{ - /* - * Linux kernel uses a really high address as starting address for - * serving mmaps calls. If there exists addressing limitations and IOVA - * mode is VA, this starting address is likely too high for those - * devices. However, it is possible to use a lower address in the - * process virtual address space as with 64 bits there is a lot of - * available space. - * - * Current known limitations are 39 or 40 bits. Setting the starting - * address at 4GB implies there are 508GB or 1020GB for mapping the - * available hugepages. This is likely enough for most systems, although - * a device with addressing limitations should call - * rte_mem_check_dma_mask for ensuring all memory is within supported - * range. - */ - return 0x100000000ULL; -} - -/* - * Get physical address of any mapped virtual address in the current process. - */ -phys_addr_t -rte_mem_virt2phy(const void *virtaddr) -{ - int fd, retval; - uint64_t page, physaddr; - unsigned long virt_pfn; - int page_size; - off_t offset; - - if (phys_addrs_available == 0) - return RTE_BAD_IOVA; - - /* standard page size */ - page_size = getpagesize(); - - fd = open("/proc/self/pagemap", O_RDONLY); - if (fd < 0) { - RTE_LOG(INFO, EAL, "%s(): cannot open /proc/self/pagemap: %s\n", - __func__, strerror(errno)); - return RTE_BAD_IOVA; - } - - virt_pfn = (unsigned long)virtaddr / page_size; - offset = sizeof(uint64_t) * virt_pfn; - if (lseek(fd, offset, SEEK_SET) == (off_t) -1) { - RTE_LOG(INFO, EAL, "%s(): seek error in /proc/self/pagemap: %s\n", - __func__, strerror(errno)); - close(fd); - return RTE_BAD_IOVA; - } - - retval = read(fd, &page, PFN_MASK_SIZE); - close(fd); - if (retval < 0) { - RTE_LOG(INFO, EAL, "%s(): cannot read /proc/self/pagemap: %s\n", - __func__, strerror(errno)); - return RTE_BAD_IOVA; - } else if (retval != PFN_MASK_SIZE) { - RTE_LOG(INFO, EAL, "%s(): read %d bytes from /proc/self/pagemap " - "but expected %d:\n", - __func__, retval, PFN_MASK_SIZE); - return RTE_BAD_IOVA; - } - - /* - * the pfn (page frame number) are bits 0-54 (see - * pagemap.txt in linux Documentation) - */ - if ((page & 0x7fffffffffffffULL) == 0) - return RTE_BAD_IOVA; - - physaddr = ((page & 0x7fffffffffffffULL) * page_size) - + ((unsigned long)virtaddr % page_size); - - return physaddr; -} - -rte_iova_t -rte_mem_virt2iova(const void *virtaddr) -{ - if (rte_eal_iova_mode() == RTE_IOVA_VA) - return (uintptr_t)virtaddr; - return rte_mem_virt2phy(virtaddr); -} - -/* - * For each hugepage in hugepg_tbl, fill the physaddr value. We find - * it by browsing the /proc/self/pagemap special file. - */ -static int -find_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) -{ - unsigned int i; - phys_addr_t addr; - - for (i = 0; i < hpi->num_pages[0]; i++) { - addr = rte_mem_virt2phy(hugepg_tbl[i].orig_va); - if (addr == RTE_BAD_PHYS_ADDR) - return -1; - hugepg_tbl[i].physaddr = addr; - } - return 0; -} - -/* - * For each hugepage in hugepg_tbl, fill the physaddr value sequentially. 
- */ -static int -set_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) -{ - unsigned int i; - static phys_addr_t addr; - - for (i = 0; i < hpi->num_pages[0]; i++) { - hugepg_tbl[i].physaddr = addr; - addr += hugepg_tbl[i].size; - } - return 0; -} - -/* - * Check whether address-space layout randomization is enabled in - * the kernel. This is important for multi-process as it can prevent - * two processes mapping data to the same virtual address - * Returns: - * 0 - address space randomization disabled - * 1/2 - address space randomization enabled - * negative error code on error - */ -static int -aslr_enabled(void) -{ - char c; - int retval, fd = open(RANDOMIZE_VA_SPACE_FILE, O_RDONLY); - if (fd < 0) - return -errno; - retval = read(fd, &c, 1); - close(fd); - if (retval < 0) - return -errno; - if (retval == 0) - return -EIO; - switch (c) { - case '0' : return 0; - case '1' : return 1; - case '2' : return 2; - default: return -EINVAL; - } -} - -static sigjmp_buf huge_jmpenv; - -static void huge_sigbus_handler(int signo __rte_unused) -{ - siglongjmp(huge_jmpenv, 1); -} - -/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile, - * non-static local variable in the stack frame calling sigsetjmp might be - * clobbered by a call to longjmp. - */ -static int huge_wrap_sigsetjmp(void) -{ - return sigsetjmp(huge_jmpenv, 1); -} - -#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES -/* Callback for numa library. */ -void numa_error(char *where) -{ - RTE_LOG(ERR, EAL, "%s failed: %s\n", where, strerror(errno)); -} -#endif - -/* - * Mmap all hugepages of hugepage table: it first open a file in - * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the - * virtual address is stored in hugepg_tbl[i].orig_va, else it is stored - * in hugepg_tbl[i].final_va. The second mapping (when orig is 0) tries to - * map contiguous physical blocks in contiguous virtual blocks. - */ -static unsigned -map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi, - uint64_t *essential_memory __rte_unused) -{ - int fd; - unsigned i; - void *virtaddr; -#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES - int node_id = -1; - int essential_prev = 0; - int oldpolicy; - struct bitmask *oldmask = NULL; - bool have_numa = true; - unsigned long maxnode = 0; - - /* Check if kernel supports NUMA. */ - if (numa_available() != 0) { - RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n"); - have_numa = false; - } - - if (have_numa) { - RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n"); - oldmask = numa_allocate_nodemask(); - if (get_mempolicy(&oldpolicy, oldmask->maskp, - oldmask->size + 1, 0, 0) < 0) { - RTE_LOG(ERR, EAL, - "Failed to get current mempolicy: %s. 
" - "Assuming MPOL_DEFAULT.\n", strerror(errno)); - oldpolicy = MPOL_DEFAULT; - } - for (i = 0; i < RTE_MAX_NUMA_NODES; i++) - if (internal_config.socket_mem[i]) - maxnode = i + 1; - } -#endif - - for (i = 0; i < hpi->num_pages[0]; i++) { - struct hugepage_file *hf = &hugepg_tbl[i]; - uint64_t hugepage_sz = hpi->hugepage_sz; - -#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES - if (maxnode) { - unsigned int j; - - for (j = 0; j < maxnode; j++) - if (essential_memory[j]) - break; - - if (j == maxnode) { - node_id = (node_id + 1) % maxnode; - while (!internal_config.socket_mem[node_id]) { - node_id++; - node_id %= maxnode; - } - essential_prev = 0; - } else { - node_id = j; - essential_prev = essential_memory[j]; - - if (essential_memory[j] < hugepage_sz) - essential_memory[j] = 0; - else - essential_memory[j] -= hugepage_sz; - } - - RTE_LOG(DEBUG, EAL, - "Setting policy MPOL_PREFERRED for socket %d\n", - node_id); - numa_set_preferred(node_id); - } -#endif - - hf->file_id = i; - hf->size = hugepage_sz; - eal_get_hugefile_path(hf->filepath, sizeof(hf->filepath), - hpi->hugedir, hf->file_id); - hf->filepath[sizeof(hf->filepath) - 1] = '\0'; - - /* try to create hugepage file */ - fd = open(hf->filepath, O_CREAT | O_RDWR, 0600); - if (fd < 0) { - RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__, - strerror(errno)); - goto out; - } - - /* map the segment, and populate page tables, - * the kernel fills this segment with zeros. we don't care where - * this gets mapped - we already have contiguous memory areas - * ready for us to map into. - */ - virtaddr = mmap(NULL, hugepage_sz, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE, fd, 0); - if (virtaddr == MAP_FAILED) { - RTE_LOG(DEBUG, EAL, "%s(): mmap failed: %s\n", __func__, - strerror(errno)); - close(fd); - goto out; - } - - hf->orig_va = virtaddr; - - /* In linux, hugetlb limitations, like cgroup, are - * enforced at fault time instead of mmap(), even - * with the option of MAP_POPULATE. Kernel will send - * a SIGBUS signal. To avoid to be killed, save stack - * environment here, if SIGBUS happens, we can jump - * back here. - */ - if (huge_wrap_sigsetjmp()) { - RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more " - "hugepages of size %u MB\n", - (unsigned int)(hugepage_sz / 0x100000)); - munmap(virtaddr, hugepage_sz); - close(fd); - unlink(hugepg_tbl[i].filepath); -#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES - if (maxnode) - essential_memory[node_id] = - essential_prev; -#endif - goto out; - } - *(int *)virtaddr = 0; - - /* set shared lock on the file. */ - if (flock(fd, LOCK_SH) < 0) { - RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n", - __func__, strerror(errno)); - close(fd); - goto out; - } - - close(fd); - } - -out: -#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES - if (maxnode) { - RTE_LOG(DEBUG, EAL, - "Restoring previous memory policy: %d\n", oldpolicy); - if (oldpolicy == MPOL_DEFAULT) { - numa_set_localalloc(); - } else if (set_mempolicy(oldpolicy, oldmask->maskp, - oldmask->size + 1) < 0) { - RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n", - strerror(errno)); - numa_set_localalloc(); - } - } - if (oldmask != NULL) - numa_free_cpumask(oldmask); -#endif - return i; -} - -/* - * Parse /proc/self/numa_maps to get the NUMA socket ID for each huge - * page. 
- */ -static int -find_numasocket(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) -{ - int socket_id; - char *end, *nodestr; - unsigned i, hp_count = 0; - uint64_t virt_addr; - char buf[BUFSIZ]; - char hugedir_str[PATH_MAX]; - FILE *f; - - f = fopen("/proc/self/numa_maps", "r"); - if (f == NULL) { - RTE_LOG(NOTICE, EAL, "NUMA support not available" - " consider that all memory is in socket_id 0\n"); - return 0; - } - - snprintf(hugedir_str, sizeof(hugedir_str), - "%s/%s", hpi->hugedir, eal_get_hugefile_prefix()); - - /* parse numa map */ - while (fgets(buf, sizeof(buf), f) != NULL) { - - /* ignore non huge page */ - if (strstr(buf, " huge ") == NULL && - strstr(buf, hugedir_str) == NULL) - continue; - - /* get zone addr */ - virt_addr = strtoull(buf, &end, 16); - if (virt_addr == 0 || end == buf) { - RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__); - goto error; - } - - /* get node id (socket id) */ - nodestr = strstr(buf, " N"); - if (nodestr == NULL) { - RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__); - goto error; - } - nodestr += 2; - end = strstr(nodestr, "="); - if (end == NULL) { - RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__); - goto error; - } - end[0] = '\0'; - end = NULL; - - socket_id = strtoul(nodestr, &end, 0); - if ((nodestr[0] == '\0') || (end == NULL) || (*end != '\0')) { - RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__); - goto error; - } - - /* if we find this page in our mappings, set socket_id */ - for (i = 0; i < hpi->num_pages[0]; i++) { - void *va = (void *)(unsigned long)virt_addr; - if (hugepg_tbl[i].orig_va == va) { - hugepg_tbl[i].socket_id = socket_id; - hp_count++; -#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES - RTE_LOG(DEBUG, EAL, - "Hugepage %s is on socket %d\n", - hugepg_tbl[i].filepath, socket_id); -#endif - } - } - } - - if (hp_count < hpi->num_pages[0]) - goto error; - - fclose(f); - return 0; - -error: - fclose(f); - return -1; -} - -static int -cmp_physaddr(const void *a, const void *b) -{ -#ifndef RTE_ARCH_PPC_64 - const struct hugepage_file *p1 = a; - const struct hugepage_file *p2 = b; -#else - /* PowerPC needs memory sorted in reverse order from x86 */ - const struct hugepage_file *p1 = b; - const struct hugepage_file *p2 = a; -#endif - if (p1->physaddr < p2->physaddr) - return -1; - else if (p1->physaddr > p2->physaddr) - return 1; - else - return 0; -} - -/* - * Uses mmap to create a shared memory area for storage of data - * Used in this file to store the hugepage file map on disk - */ -static void * -create_shared_memory(const char *filename, const size_t mem_size) -{ - void *retval; - int fd; - - /* if no shared files mode is used, create anonymous memory instead */ - if (internal_config.no_shconf) { - retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (retval == MAP_FAILED) - return NULL; - return retval; - } - - fd = open(filename, O_CREAT | O_RDWR, 0600); - if (fd < 0) - return NULL; - if (ftruncate(fd, mem_size) < 0) { - close(fd); - return NULL; - } - retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - close(fd); - if (retval == MAP_FAILED) - return NULL; - return retval; -} - -/* - * this copies *active* hugepages from one hugepage table to another. - * destination is typically the shared memory. 
- */ -static int -copy_hugepages_to_shared_mem(struct hugepage_file * dst, int dest_size, - const struct hugepage_file * src, int src_size) -{ - int src_pos, dst_pos = 0; - - for (src_pos = 0; src_pos < src_size; src_pos++) { - if (src[src_pos].orig_va != NULL) { - /* error on overflow attempt */ - if (dst_pos == dest_size) - return -1; - memcpy(&dst[dst_pos], &src[src_pos], sizeof(struct hugepage_file)); - dst_pos++; - } - } - return 0; -} - -static int -unlink_hugepage_files(struct hugepage_file *hugepg_tbl, - unsigned num_hp_info) -{ - unsigned socket, size; - int page, nrpages = 0; - - /* get total number of hugepages */ - for (size = 0; size < num_hp_info; size++) - for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) - nrpages += - internal_config.hugepage_info[size].num_pages[socket]; - - for (page = 0; page < nrpages; page++) { - struct hugepage_file *hp = &hugepg_tbl[page]; - - if (hp->orig_va != NULL && unlink(hp->filepath)) { - RTE_LOG(WARNING, EAL, "%s(): Removing %s failed: %s\n", - __func__, hp->filepath, strerror(errno)); - } - } - return 0; -} - -/* - * unmaps hugepages that are not going to be used. since we originally allocate - * ALL hugepages (not just those we need), additional unmapping needs to be done. - */ -static int -unmap_unneeded_hugepages(struct hugepage_file *hugepg_tbl, - struct hugepage_info *hpi, - unsigned num_hp_info) -{ - unsigned socket, size; - int page, nrpages = 0; - - /* get total number of hugepages */ - for (size = 0; size < num_hp_info; size++) - for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) - nrpages += internal_config.hugepage_info[size].num_pages[socket]; - - for (size = 0; size < num_hp_info; size++) { - for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) { - unsigned pages_found = 0; - - /* traverse until we have unmapped all the unused pages */ - for (page = 0; page < nrpages; page++) { - struct hugepage_file *hp = &hugepg_tbl[page]; - - /* find a page that matches the criteria */ - if ((hp->size == hpi[size].hugepage_sz) && - (hp->socket_id == (int) socket)) { - - /* if we skipped enough pages, unmap the rest */ - if (pages_found == hpi[size].num_pages[socket]) { - uint64_t unmap_len; - - unmap_len = hp->size; - - /* get start addr and len of the remaining segment */ - munmap(hp->orig_va, - (size_t)unmap_len); - - hp->orig_va = NULL; - if (unlink(hp->filepath) == -1) { - RTE_LOG(ERR, EAL, "%s(): Removing %s failed: %s\n", - __func__, hp->filepath, strerror(errno)); - return -1; - } - } else { - /* lock the page and skip */ - pages_found++; - } - - } /* match page */ - } /* foreach page */ - } /* foreach socket */ - } /* foreach pagesize */ - - return 0; -} - -static int -remap_segment(struct hugepage_file *hugepages, int seg_start, int seg_end) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - struct rte_memseg_list *msl; - struct rte_fbarray *arr; - int cur_page, seg_len; - unsigned int msl_idx; - int ms_idx; - uint64_t page_sz; - size_t memseg_len; - int socket_id; - - page_sz = hugepages[seg_start].size; - socket_id = hugepages[seg_start].socket_id; - seg_len = seg_end - seg_start; - - RTE_LOG(DEBUG, EAL, "Attempting to map %" PRIu64 "M on socket %i\n", - (seg_len * page_sz) >> 20ULL, socket_id); - - /* find free space in memseg lists */ - for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) { - bool empty; - msl = &mcfg->memsegs[msl_idx]; - arr = &msl->memseg_arr; - - if (msl->page_sz != page_sz) - continue; - if (msl->socket_id != socket_id) - continue; - - /* leave space for a 
hole if array is not empty */ - empty = arr->count == 0; - ms_idx = rte_fbarray_find_next_n_free(arr, 0, - seg_len + (empty ? 0 : 1)); - - /* memseg list is full? */ - if (ms_idx < 0) - continue; - - /* leave some space between memsegs, they are not IOVA - * contiguous, so they shouldn't be VA contiguous either. - */ - if (!empty) - ms_idx++; - break; - } - if (msl_idx == RTE_MAX_MEMSEG_LISTS) { - RTE_LOG(ERR, EAL, "Could not find space for memseg. Please increase %s and/or %s in configuration.\n", - RTE_STR(CONFIG_RTE_MAX_MEMSEG_PER_TYPE), - RTE_STR(CONFIG_RTE_MAX_MEM_PER_TYPE)); - return -1; - } - -#ifdef RTE_ARCH_PPC_64 - /* for PPC64 we go through the list backwards */ - for (cur_page = seg_end - 1; cur_page >= seg_start; - cur_page--, ms_idx++) { -#else - for (cur_page = seg_start; cur_page < seg_end; cur_page++, ms_idx++) { -#endif - struct hugepage_file *hfile = &hugepages[cur_page]; - struct rte_memseg *ms = rte_fbarray_get(arr, ms_idx); - void *addr; - int fd; - - fd = open(hfile->filepath, O_RDWR); - if (fd < 0) { - RTE_LOG(ERR, EAL, "Could not open '%s': %s\n", - hfile->filepath, strerror(errno)); - return -1; - } - /* set shared lock on the file. */ - if (flock(fd, LOCK_SH) < 0) { - RTE_LOG(DEBUG, EAL, "Could not lock '%s': %s\n", - hfile->filepath, strerror(errno)); - close(fd); - return -1; - } - memseg_len = (size_t)page_sz; - addr = RTE_PTR_ADD(msl->base_va, ms_idx * memseg_len); - - /* we know this address is already mmapped by memseg list, so - * using MAP_FIXED here is safe - */ - addr = mmap(addr, page_sz, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, 0); - if (addr == MAP_FAILED) { - RTE_LOG(ERR, EAL, "Couldn't remap '%s': %s\n", - hfile->filepath, strerror(errno)); - close(fd); - return -1; - } - - /* we have a new address, so unmap previous one */ -#ifndef RTE_ARCH_64 - /* in 32-bit legacy mode, we have already unmapped the page */ - if (!internal_config.legacy_mem) - munmap(hfile->orig_va, page_sz); -#else - munmap(hfile->orig_va, page_sz); -#endif - - hfile->orig_va = NULL; - hfile->final_va = addr; - - /* rewrite physical addresses in IOVA as VA mode */ - if (rte_eal_iova_mode() == RTE_IOVA_VA) - hfile->physaddr = (uintptr_t)addr; - - /* set up memseg data */ - ms->addr = addr; - ms->hugepage_sz = page_sz; - ms->len = memseg_len; - ms->iova = hfile->physaddr; - ms->socket_id = hfile->socket_id; - ms->nchannel = rte_memory_get_nchannel(); - ms->nrank = rte_memory_get_nrank(); - - rte_fbarray_set_used(arr, ms_idx); - - /* store segment fd internally */ - if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0) - RTE_LOG(ERR, EAL, "Could not store segment fd: %s\n", - rte_strerror(rte_errno)); - } - RTE_LOG(DEBUG, EAL, "Allocated %" PRIu64 "M on socket %i\n", - (seg_len * page_sz) >> 20, socket_id); - return 0; -} - -static uint64_t -get_mem_amount(uint64_t page_sz, uint64_t max_mem) -{ - uint64_t area_sz, max_pages; - - /* limit to RTE_MAX_MEMSEG_PER_LIST pages or RTE_MAX_MEM_MB_PER_LIST */ - max_pages = RTE_MAX_MEMSEG_PER_LIST; - max_mem = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20, max_mem); - - area_sz = RTE_MIN(page_sz * max_pages, max_mem); - - /* make sure the list isn't smaller than the page size */ - area_sz = RTE_MAX(area_sz, page_sz); - - return RTE_ALIGN(area_sz, page_sz); -} - -static int -free_memseg_list(struct rte_memseg_list *msl) -{ - if (rte_fbarray_destroy(&msl->memseg_arr)) { - RTE_LOG(ERR, EAL, "Cannot destroy memseg list\n"); - return -1; - } - memset(msl, 0, sizeof(*msl)); - return 0; -} - -#define MEMSEG_LIST_FMT 
"memseg-%" PRIu64 "k-%i-%i" -static int -alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz, - int n_segs, int socket_id, int type_msl_idx) -{ - char name[RTE_FBARRAY_NAME_LEN]; - - snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id, - type_msl_idx); - if (rte_fbarray_init(&msl->memseg_arr, name, n_segs, - sizeof(struct rte_memseg))) { - RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n", - rte_strerror(rte_errno)); - return -1; - } - - msl->page_sz = page_sz; - msl->socket_id = socket_id; - msl->base_va = NULL; - msl->heap = 1; /* mark it as a heap segment */ - - RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n", - (size_t)page_sz >> 10, socket_id); - - return 0; -} - -static int -alloc_va_space(struct rte_memseg_list *msl) -{ - uint64_t page_sz; - size_t mem_sz; - void *addr; - int flags = 0; - - page_sz = msl->page_sz; - mem_sz = page_sz * msl->memseg_arr.len; - - addr = eal_get_virtual_area(msl->base_va, &mem_sz, page_sz, 0, flags); - if (addr == NULL) { - if (rte_errno == EADDRNOTAVAIL) - RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - " - "please use '--" OPT_BASE_VIRTADDR "' option\n", - (unsigned long long)mem_sz, msl->base_va); - else - RTE_LOG(ERR, EAL, "Cannot reserve memory\n"); - return -1; - } - msl->base_va = addr; - msl->len = mem_sz; - - return 0; -} - -/* - * Our VA space is not preallocated yet, so preallocate it here. We need to know - * how many segments there are in order to map all pages into one address space, - * and leave appropriate holes between segments so that rte_malloc does not - * concatenate them into one big segment. - * - * we also need to unmap original pages to free up address space. - */ -static int __rte_unused -prealloc_segments(struct hugepage_file *hugepages, int n_pages) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - int cur_page, seg_start_page, end_seg, new_memseg; - unsigned int hpi_idx, socket, i; - int n_contig_segs, n_segs; - int msl_idx; - - /* before we preallocate segments, we need to free up our VA space. - * we're not removing files, and we already have information about - * PA-contiguousness, so it is safe to unmap everything. - */ - for (cur_page = 0; cur_page < n_pages; cur_page++) { - struct hugepage_file *hpi = &hugepages[cur_page]; - munmap(hpi->orig_va, hpi->size); - hpi->orig_va = NULL; - } - - /* we cannot know how many page sizes and sockets we have discovered, so - * loop over all of them - */ - for (hpi_idx = 0; hpi_idx < internal_config.num_hugepage_sizes; - hpi_idx++) { - uint64_t page_sz = - internal_config.hugepage_info[hpi_idx].hugepage_sz; - - for (i = 0; i < rte_socket_count(); i++) { - struct rte_memseg_list *msl; - - socket = rte_socket_id_by_idx(i); - n_contig_segs = 0; - n_segs = 0; - seg_start_page = -1; - - for (cur_page = 0; cur_page < n_pages; cur_page++) { - struct hugepage_file *prev, *cur; - int prev_seg_start_page = -1; - - cur = &hugepages[cur_page]; - prev = cur_page == 0 ? NULL : - &hugepages[cur_page - 1]; - - new_memseg = 0; - end_seg = 0; - - if (cur->size == 0) - end_seg = 1; - else if (cur->socket_id != (int) socket) - end_seg = 1; - else if (cur->size != page_sz) - end_seg = 1; - else if (cur_page == 0) - new_memseg = 1; -#ifdef RTE_ARCH_PPC_64 - /* On PPC64 architecture, the mmap always start - * from higher address to lower address. Here, - * physical addresses are in descending order. 
- */ - else if ((prev->physaddr - cur->physaddr) != - cur->size) - new_memseg = 1; -#else - else if ((cur->physaddr - prev->physaddr) != - cur->size) - new_memseg = 1; -#endif - if (new_memseg) { - /* if we're already inside a segment, - * new segment means end of current one - */ - if (seg_start_page != -1) { - end_seg = 1; - prev_seg_start_page = - seg_start_page; - } - seg_start_page = cur_page; - } - - if (end_seg) { - if (prev_seg_start_page != -1) { - /* we've found a new segment */ - n_contig_segs++; - n_segs += cur_page - - prev_seg_start_page; - } else if (seg_start_page != -1) { - /* we didn't find new segment, - * but did end current one - */ - n_contig_segs++; - n_segs += cur_page - - seg_start_page; - seg_start_page = -1; - continue; - } else { - /* we're skipping this page */ - continue; - } - } - /* segment continues */ - } - /* check if we missed last segment */ - if (seg_start_page != -1) { - n_contig_segs++; - n_segs += cur_page - seg_start_page; - } - - /* if no segments were found, do not preallocate */ - if (n_segs == 0) - continue; - - /* we now have total number of pages that we will - * allocate for this segment list. add separator pages - * to the total count, and preallocate VA space. - */ - n_segs += n_contig_segs - 1; - - /* now, preallocate VA space for these segments */ - - /* first, find suitable memseg list for this */ - for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; - msl_idx++) { - msl = &mcfg->memsegs[msl_idx]; - - if (msl->base_va != NULL) - continue; - break; - } - if (msl_idx == RTE_MAX_MEMSEG_LISTS) { - RTE_LOG(ERR, EAL, "Not enough space in memseg lists, please increase %s\n", - RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); - return -1; - } - - /* now, allocate fbarray itself */ - if (alloc_memseg_list(msl, page_sz, n_segs, socket, - msl_idx) < 0) - return -1; - - /* finally, allocate VA space */ - if (alloc_va_space(msl) < 0) - return -1; - } - } - return 0; -} - -/* - * We cannot reallocate memseg lists on the fly because PPC64 stores pages - * backwards, therefore we have to process the entire memseg first before - * remapping it into memseg list VA space. - */ -static int -remap_needed_hugepages(struct hugepage_file *hugepages, int n_pages) -{ - int cur_page, seg_start_page, new_memseg, ret; - - seg_start_page = 0; - for (cur_page = 0; cur_page < n_pages; cur_page++) { - struct hugepage_file *prev, *cur; - - new_memseg = 0; - - cur = &hugepages[cur_page]; - prev = cur_page == 0 ? NULL : &hugepages[cur_page - 1]; - - /* if size is zero, no more pages left */ - if (cur->size == 0) - break; - - if (cur_page == 0) - new_memseg = 1; - else if (cur->socket_id != prev->socket_id) - new_memseg = 1; - else if (cur->size != prev->size) - new_memseg = 1; -#ifdef RTE_ARCH_PPC_64 - /* On PPC64 architecture, the mmap always start from higher - * address to lower address. Here, physical addresses are in - * descending order. 
- */ - else if ((prev->physaddr - cur->physaddr) != cur->size) - new_memseg = 1; -#else - else if ((cur->physaddr - prev->physaddr) != cur->size) - new_memseg = 1; -#endif - - if (new_memseg) { - /* if this isn't the first time, remap segment */ - if (cur_page != 0) { - ret = remap_segment(hugepages, seg_start_page, - cur_page); - if (ret != 0) - return -1; - } - /* remember where we started */ - seg_start_page = cur_page; - } - /* continuation of previous memseg */ - } - /* we were stopped, but we didn't remap the last segment, do it now */ - if (cur_page != 0) { - ret = remap_segment(hugepages, seg_start_page, - cur_page); - if (ret != 0) - return -1; - } - return 0; -} - -__rte_unused /* function is unused on 32-bit builds */ -static inline uint64_t -get_socket_mem_size(int socket) -{ - uint64_t size = 0; - unsigned i; - - for (i = 0; i < internal_config.num_hugepage_sizes; i++){ - struct hugepage_info *hpi = &internal_config.hugepage_info[i]; - size += hpi->hugepage_sz * hpi->num_pages[socket]; - } - - return size; -} - -/* - * This function is a NUMA-aware equivalent of calc_num_pages. - * It takes in the list of hugepage sizes and the - * number of pages thereof, and calculates the best number of - * pages of each size to fulfill the request for ram - */ -static int -calc_num_pages_per_socket(uint64_t * memory, - struct hugepage_info *hp_info, - struct hugepage_info *hp_used, - unsigned num_hp_info) -{ - unsigned socket, j, i = 0; - unsigned requested, available; - int total_num_pages = 0; - uint64_t remaining_mem, cur_mem; - uint64_t total_mem = internal_config.memory; - - if (num_hp_info == 0) - return -1; - - /* if specific memory amounts per socket weren't requested */ - if (internal_config.force_sockets == 0) { - size_t total_size; -#ifdef RTE_ARCH_64 - int cpu_per_socket[RTE_MAX_NUMA_NODES]; - size_t default_size; - unsigned lcore_id; - - /* Compute number of cores per socket */ - memset(cpu_per_socket, 0, sizeof(cpu_per_socket)); - RTE_LCORE_FOREACH(lcore_id) { - cpu_per_socket[rte_lcore_to_socket_id(lcore_id)]++; - } - - /* - * Automatically spread requested memory amongst detected sockets according - * to number of cores from cpu mask present on each socket - */ - total_size = internal_config.memory; - for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) { - - /* Set memory amount per socket */ - default_size = (internal_config.memory * cpu_per_socket[socket]) - / rte_lcore_count(); - - /* Limit to maximum available memory on socket */ - default_size = RTE_MIN(default_size, get_socket_mem_size(socket)); - - /* Update sizes */ - memory[socket] = default_size; - total_size -= default_size; - } - - /* - * If some memory is remaining, try to allocate it by getting all - * available memory from sockets, one after the other - */ - for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) { - /* take whatever is available */ - default_size = RTE_MIN(get_socket_mem_size(socket) - memory[socket], - total_size); - - /* Update sizes */ - memory[socket] += default_size; - total_size -= default_size; - } -#else - /* in 32-bit mode, allocate all of the memory only on master - * lcore socket - */ - total_size = internal_config.memory; - for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; - socket++) { - struct rte_config *cfg = rte_eal_get_configuration(); - unsigned int master_lcore_socket; - - master_lcore_socket = - rte_lcore_to_socket_id(cfg->master_lcore); - - if (master_lcore_socket != socket) - continue; - - /* Update sizes */ - 
memory[socket] = total_size; - break; - } -#endif - } - - for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) { - /* skips if the memory on specific socket wasn't requested */ - for (i = 0; i < num_hp_info && memory[socket] != 0; i++){ - strlcpy(hp_used[i].hugedir, hp_info[i].hugedir, - sizeof(hp_used[i].hugedir)); - hp_used[i].num_pages[socket] = RTE_MIN( - memory[socket] / hp_info[i].hugepage_sz, - hp_info[i].num_pages[socket]); - - cur_mem = hp_used[i].num_pages[socket] * - hp_used[i].hugepage_sz; - - memory[socket] -= cur_mem; - total_mem -= cur_mem; - - total_num_pages += hp_used[i].num_pages[socket]; - - /* check if we have met all memory requests */ - if (memory[socket] == 0) - break; - - /* check if we have any more pages left at this size, if so - * move on to next size */ - if (hp_used[i].num_pages[socket] == hp_info[i].num_pages[socket]) - continue; - /* At this point we know that there are more pages available that are - * bigger than the memory we want, so lets see if we can get enough - * from other page sizes. - */ - remaining_mem = 0; - for (j = i+1; j < num_hp_info; j++) - remaining_mem += hp_info[j].hugepage_sz * - hp_info[j].num_pages[socket]; - - /* is there enough other memory, if not allocate another page and quit */ - if (remaining_mem < memory[socket]){ - cur_mem = RTE_MIN(memory[socket], - hp_info[i].hugepage_sz); - memory[socket] -= cur_mem; - total_mem -= cur_mem; - hp_used[i].num_pages[socket]++; - total_num_pages++; - break; /* we are done with this socket*/ - } - } - /* if we didn't satisfy all memory requirements per socket */ - if (memory[socket] > 0 && - internal_config.socket_mem[socket] != 0) { - /* to prevent icc errors */ - requested = (unsigned) (internal_config.socket_mem[socket] / - 0x100000); - available = requested - - ((unsigned) (memory[socket] / 0x100000)); - RTE_LOG(ERR, EAL, "Not enough memory available on socket %u! " - "Requested: %uMB, available: %uMB\n", socket, - requested, available); - return -1; - } - } - - /* if we didn't satisfy total memory requirements */ - if (total_mem > 0) { - requested = (unsigned) (internal_config.memory / 0x100000); - available = requested - (unsigned) (total_mem / 0x100000); - RTE_LOG(ERR, EAL, "Not enough memory available! Requested: %uMB," - " available: %uMB\n", requested, available); - return -1; - } - return total_num_pages; -} - -static inline size_t -eal_get_hugepage_mem_size(void) -{ - uint64_t size = 0; - unsigned i, j; - - for (i = 0; i < internal_config.num_hugepage_sizes; i++) { - struct hugepage_info *hpi = &internal_config.hugepage_info[i]; - if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0) { - for (j = 0; j < RTE_MAX_NUMA_NODES; j++) { - size += hpi->hugepage_sz * hpi->num_pages[j]; - } - } - } - - return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX; -} - -static struct sigaction huge_action_old; -static int huge_need_recover; - -static void -huge_register_sigbus(void) -{ - sigset_t mask; - struct sigaction action; - - sigemptyset(&mask); - sigaddset(&mask, SIGBUS); - action.sa_flags = 0; - action.sa_mask = mask; - action.sa_handler = huge_sigbus_handler; - - huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old); -} - -static void -huge_recover_sigbus(void) -{ - if (huge_need_recover) { - sigaction(SIGBUS, &huge_action_old, NULL); - huge_need_recover = 0; - } -} - -/* - * Prepare physical memory mapping: fill configuration structure with - * these infos, return 0 on success. - * 1. map N huge pages in separate files in hugetlbfs - * 2. 
find associated physical addr - * 3. find associated NUMA socket ID - * 4. sort all huge pages by physical address - * 5. remap these N huge pages in the correct order - * 6. unmap the first mapping - * 7. fill memsegs in configuration with contiguous zones - */ -static int -eal_legacy_hugepage_init(void) -{ - struct rte_mem_config *mcfg; - struct hugepage_file *hugepage = NULL, *tmp_hp = NULL; - struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES]; - struct rte_fbarray *arr; - struct rte_memseg *ms; - - uint64_t memory[RTE_MAX_NUMA_NODES]; - - unsigned hp_offset; - int i, j; - int nr_hugefiles, nr_hugepages = 0; - void *addr; - - memset(used_hp, 0, sizeof(used_hp)); - - /* get pointer to global configuration */ - mcfg = rte_eal_get_configuration()->mem_config; - - /* hugetlbfs can be disabled */ - if (internal_config.no_hugetlbfs) { - void *prealloc_addr; - size_t mem_sz; - struct rte_memseg_list *msl; - int n_segs, cur_seg, fd, flags; -#ifdef MEMFD_SUPPORTED - int memfd; -#endif - uint64_t page_sz; - - /* nohuge mode is legacy mode */ - internal_config.legacy_mem = 1; - - /* nohuge mode is single-file segments mode */ - internal_config.single_file_segments = 1; - - /* create a memseg list */ - msl = &mcfg->memsegs[0]; - - page_sz = RTE_PGSIZE_4K; - n_segs = internal_config.memory / page_sz; - - if (rte_fbarray_init(&msl->memseg_arr, "nohugemem", n_segs, - sizeof(struct rte_memseg))) { - RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n"); - return -1; - } - - /* set up parameters for anonymous mmap */ - fd = -1; - flags = MAP_PRIVATE | MAP_ANONYMOUS; - -#ifdef MEMFD_SUPPORTED - /* create a memfd and store it in the segment fd table */ - memfd = memfd_create("nohuge", 0); - if (memfd < 0) { - RTE_LOG(DEBUG, EAL, "Cannot create memfd: %s\n", - strerror(errno)); - RTE_LOG(DEBUG, EAL, "Falling back to anonymous map\n"); - } else { - /* we got an fd - now resize it */ - if (ftruncate(memfd, internal_config.memory) < 0) { - RTE_LOG(ERR, EAL, "Cannot resize memfd: %s\n", - strerror(errno)); - RTE_LOG(ERR, EAL, "Falling back to anonymous map\n"); - close(memfd); - } else { - /* creating memfd-backed file was successful. - * we want changes to memfd to be visible to - * other processes (such as vhost backend), so - * map it as shared memory. - */ - RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n"); - fd = memfd; - flags = MAP_SHARED; - } - } -#endif - /* preallocate address space for the memory, so that it can be - * fit into the DMA mask. - */ - mem_sz = internal_config.memory; - prealloc_addr = eal_get_virtual_area( - NULL, &mem_sz, page_sz, 0, 0); - if (prealloc_addr == NULL) { - RTE_LOG(ERR, EAL, - "%s: reserving memory area failed: " - "%s\n", - __func__, strerror(errno)); - return -1; - } - addr = mmap(prealloc_addr, mem_sz, PROT_READ | PROT_WRITE, - flags | MAP_FIXED, fd, 0); - if (addr == MAP_FAILED || addr != prealloc_addr) { - RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__, - strerror(errno)); - munmap(prealloc_addr, mem_sz); - return -1; - } - msl->base_va = addr; - msl->page_sz = page_sz; - msl->socket_id = 0; - msl->len = mem_sz; - msl->heap = 1; - - /* we're in single-file segments mode, so only the segment list - * fd needs to be set up. - */ - if (fd != -1) { - if (eal_memalloc_set_seg_list_fd(0, fd) < 0) { - RTE_LOG(ERR, EAL, "Cannot set up segment list fd\n"); - /* not a serious error, proceed */ - } - } - - /* populate memsegs. 
each memseg is one page long */ - for (cur_seg = 0; cur_seg < n_segs; cur_seg++) { - arr = &msl->memseg_arr; - - ms = rte_fbarray_get(arr, cur_seg); - if (rte_eal_iova_mode() == RTE_IOVA_VA) - ms->iova = (uintptr_t)addr; - else - ms->iova = RTE_BAD_IOVA; - ms->addr = addr; - ms->hugepage_sz = page_sz; - ms->socket_id = 0; - ms->len = page_sz; - - rte_fbarray_set_used(arr, cur_seg); - - addr = RTE_PTR_ADD(addr, (size_t)page_sz); - } - if (mcfg->dma_maskbits && - rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) { - RTE_LOG(ERR, EAL, - "%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask.\n", - __func__); - if (rte_eal_iova_mode() == RTE_IOVA_VA && - rte_eal_using_phys_addrs()) - RTE_LOG(ERR, EAL, - "%s(): Please try initializing EAL with --iova-mode=pa parameter.\n", - __func__); - goto fail; - } - return 0; - } - - /* calculate total number of hugepages available. at this point we haven't - * yet started sorting them so they all are on socket 0 */ - for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) { - /* meanwhile, also initialize used_hp hugepage sizes in used_hp */ - used_hp[i].hugepage_sz = internal_config.hugepage_info[i].hugepage_sz; - - nr_hugepages += internal_config.hugepage_info[i].num_pages[0]; - } - - /* - * allocate a memory area for hugepage table. - * this isn't shared memory yet. due to the fact that we need some - * processing done on these pages, shared memory will be created - * at a later stage. - */ - tmp_hp = malloc(nr_hugepages * sizeof(struct hugepage_file)); - if (tmp_hp == NULL) - goto fail; - - memset(tmp_hp, 0, nr_hugepages * sizeof(struct hugepage_file)); - - hp_offset = 0; /* where we start the current page size entries */ - - huge_register_sigbus(); - - /* make a copy of socket_mem, needed for balanced allocation. 
*/ - for (i = 0; i < RTE_MAX_NUMA_NODES; i++) - memory[i] = internal_config.socket_mem[i]; - - /* map all hugepages and sort them */ - for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){ - unsigned pages_old, pages_new; - struct hugepage_info *hpi; - - /* - * we don't yet mark hugepages as used at this stage, so - * we just map all hugepages available to the system - * all hugepages are still located on socket 0 - */ - hpi = &internal_config.hugepage_info[i]; - - if (hpi->num_pages[0] == 0) - continue; - - /* map all hugepages available */ - pages_old = hpi->num_pages[0]; - pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, memory); - if (pages_new < pages_old) { - RTE_LOG(DEBUG, EAL, - "%d not %d hugepages of size %u MB allocated\n", - pages_new, pages_old, - (unsigned)(hpi->hugepage_sz / 0x100000)); - - int pages = pages_old - pages_new; - - nr_hugepages -= pages; - hpi->num_pages[0] = pages_new; - if (pages_new == 0) - continue; - } - - if (rte_eal_using_phys_addrs() && - rte_eal_iova_mode() != RTE_IOVA_VA) { - /* find physical addresses for each hugepage */ - if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0) { - RTE_LOG(DEBUG, EAL, "Failed to find phys addr " - "for %u MB pages\n", - (unsigned int)(hpi->hugepage_sz / 0x100000)); - goto fail; - } - } else { - /* set physical addresses for each hugepage */ - if (set_physaddrs(&tmp_hp[hp_offset], hpi) < 0) { - RTE_LOG(DEBUG, EAL, "Failed to set phys addr " - "for %u MB pages\n", - (unsigned int)(hpi->hugepage_sz / 0x100000)); - goto fail; - } - } - - if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0){ - RTE_LOG(DEBUG, EAL, "Failed to find NUMA socket for %u MB pages\n", - (unsigned)(hpi->hugepage_sz / 0x100000)); - goto fail; - } - - qsort(&tmp_hp[hp_offset], hpi->num_pages[0], - sizeof(struct hugepage_file), cmp_physaddr); - - /* we have processed a num of hugepages of this size, so inc offset */ - hp_offset += hpi->num_pages[0]; - } - - huge_recover_sigbus(); - - if (internal_config.memory == 0 && internal_config.force_sockets == 0) - internal_config.memory = eal_get_hugepage_mem_size(); - - nr_hugefiles = nr_hugepages; - - - /* clean out the numbers of pages */ - for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) - for (j = 0; j < RTE_MAX_NUMA_NODES; j++) - internal_config.hugepage_info[i].num_pages[j] = 0; - - /* get hugepages for each socket */ - for (i = 0; i < nr_hugefiles; i++) { - int socket = tmp_hp[i].socket_id; - - /* find a hugepage info with right size and increment num_pages */ - const int nb_hpsizes = RTE_MIN(MAX_HUGEPAGE_SIZES, - (int)internal_config.num_hugepage_sizes); - for (j = 0; j < nb_hpsizes; j++) { - if (tmp_hp[i].size == - internal_config.hugepage_info[j].hugepage_sz) { - internal_config.hugepage_info[j].num_pages[socket]++; - } - } - } - - /* make a copy of socket_mem, needed for number of pages calculation */ - for (i = 0; i < RTE_MAX_NUMA_NODES; i++) - memory[i] = internal_config.socket_mem[i]; - - /* calculate final number of pages */ - nr_hugepages = calc_num_pages_per_socket(memory, - internal_config.hugepage_info, used_hp, - internal_config.num_hugepage_sizes); - - /* error if not enough memory available */ - if (nr_hugepages < 0) - goto fail; - - /* reporting in! 
*/ - for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) { - for (j = 0; j < RTE_MAX_NUMA_NODES; j++) { - if (used_hp[i].num_pages[j] > 0) { - RTE_LOG(DEBUG, EAL, - "Requesting %u pages of size %uMB" - " from socket %i\n", - used_hp[i].num_pages[j], - (unsigned) - (used_hp[i].hugepage_sz / 0x100000), - j); - } - } - } - - /* create shared memory */ - hugepage = create_shared_memory(eal_hugepage_data_path(), - nr_hugefiles * sizeof(struct hugepage_file)); - - if (hugepage == NULL) { - RTE_LOG(ERR, EAL, "Failed to create shared memory!\n"); - goto fail; - } - memset(hugepage, 0, nr_hugefiles * sizeof(struct hugepage_file)); - - /* - * unmap pages that we won't need (looks at used_hp). - * also, sets final_va to NULL on pages that were unmapped. - */ - if (unmap_unneeded_hugepages(tmp_hp, used_hp, - internal_config.num_hugepage_sizes) < 0) { - RTE_LOG(ERR, EAL, "Unmapping and locking hugepages failed!\n"); - goto fail; - } - - /* - * copy stuff from malloc'd hugepage* to the actual shared memory. - * this procedure only copies those hugepages that have orig_va - * not NULL. has overflow protection. - */ - if (copy_hugepages_to_shared_mem(hugepage, nr_hugefiles, - tmp_hp, nr_hugefiles) < 0) { - RTE_LOG(ERR, EAL, "Copying tables to shared memory failed!\n"); - goto fail; - } - -#ifndef RTE_ARCH_64 - /* for legacy 32-bit mode, we did not preallocate VA space, so do it */ - if (internal_config.legacy_mem && - prealloc_segments(hugepage, nr_hugefiles)) { - RTE_LOG(ERR, EAL, "Could not preallocate VA space for hugepages\n"); - goto fail; - } -#endif - - /* remap all pages we do need into memseg list VA space, so that those - * pages become first-class citizens in DPDK memory subsystem - */ - if (remap_needed_hugepages(hugepage, nr_hugefiles)) { - RTE_LOG(ERR, EAL, "Couldn't remap hugepage files into memseg lists\n"); - goto fail; - } - - /* free the hugepage backing files */ - if (internal_config.hugepage_unlink && - unlink_hugepage_files(tmp_hp, internal_config.num_hugepage_sizes) < 0) { - RTE_LOG(ERR, EAL, "Unlinking hugepage files failed!\n"); - goto fail; - } - - /* free the temporary hugepage table */ - free(tmp_hp); - tmp_hp = NULL; - - munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file)); - hugepage = NULL; - - /* we're not going to allocate more pages, so release VA space for - * unused memseg lists - */ - for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { - struct rte_memseg_list *msl = &mcfg->memsegs[i]; - size_t mem_sz; - - /* skip inactive lists */ - if (msl->base_va == NULL) - continue; - /* skip lists where there is at least one page allocated */ - if (msl->memseg_arr.count > 0) - continue; - /* this is an unused list, deallocate it */ - mem_sz = msl->len; - munmap(msl->base_va, mem_sz); - msl->base_va = NULL; - msl->heap = 0; - - /* destroy backing fbarray */ - rte_fbarray_destroy(&msl->memseg_arr); - } - - if (mcfg->dma_maskbits && - rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) { - RTE_LOG(ERR, EAL, - "%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask.\n", - __func__); - goto fail; - } - - return 0; - -fail: - huge_recover_sigbus(); - free(tmp_hp); - if (hugepage != NULL) - munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file)); - - return -1; -} - -static int __rte_unused -hugepage_count_walk(const struct rte_memseg_list *msl, void *arg) -{ - struct hugepage_info *hpi = arg; - - if (msl->page_sz != hpi->hugepage_sz) - return 0; - - hpi->num_pages[msl->socket_id] += msl->memseg_arr.len; - return 0; -} - -static int 
-limits_callback(int socket_id, size_t cur_limit, size_t new_len) -{ - RTE_SET_USED(socket_id); - RTE_SET_USED(cur_limit); - RTE_SET_USED(new_len); - return -1; -} - -static int -eal_hugepage_init(void) -{ - struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES]; - uint64_t memory[RTE_MAX_NUMA_NODES]; - int hp_sz_idx, socket_id; - - memset(used_hp, 0, sizeof(used_hp)); - - for (hp_sz_idx = 0; - hp_sz_idx < (int) internal_config.num_hugepage_sizes; - hp_sz_idx++) { -#ifndef RTE_ARCH_64 - struct hugepage_info dummy; - unsigned int i; -#endif - /* also initialize used_hp hugepage sizes in used_hp */ - struct hugepage_info *hpi; - hpi = &internal_config.hugepage_info[hp_sz_idx]; - used_hp[hp_sz_idx].hugepage_sz = hpi->hugepage_sz; - -#ifndef RTE_ARCH_64 - /* for 32-bit, limit number of pages on socket to whatever we've - * preallocated, as we cannot allocate more. - */ - memset(&dummy, 0, sizeof(dummy)); - dummy.hugepage_sz = hpi->hugepage_sz; - if (rte_memseg_list_walk(hugepage_count_walk, &dummy) < 0) - return -1; - - for (i = 0; i < RTE_DIM(dummy.num_pages); i++) { - hpi->num_pages[i] = RTE_MIN(hpi->num_pages[i], - dummy.num_pages[i]); - } -#endif - } - - /* make a copy of socket_mem, needed for balanced allocation. */ - for (hp_sz_idx = 0; hp_sz_idx < RTE_MAX_NUMA_NODES; hp_sz_idx++) - memory[hp_sz_idx] = internal_config.socket_mem[hp_sz_idx]; - - /* calculate final number of pages */ - if (calc_num_pages_per_socket(memory, - internal_config.hugepage_info, used_hp, - internal_config.num_hugepage_sizes) < 0) - return -1; - - for (hp_sz_idx = 0; - hp_sz_idx < (int)internal_config.num_hugepage_sizes; - hp_sz_idx++) { - for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES; - socket_id++) { - struct rte_memseg **pages; - struct hugepage_info *hpi = &used_hp[hp_sz_idx]; - unsigned int num_pages = hpi->num_pages[socket_id]; - unsigned int num_pages_alloc; - - if (num_pages == 0) - continue; - - RTE_LOG(DEBUG, EAL, "Allocating %u pages of size %" PRIu64 "M on socket %i\n", - num_pages, hpi->hugepage_sz >> 20, socket_id); - - /* we may not be able to allocate all pages in one go, - * because we break up our memory map into multiple - * memseg lists. therefore, try allocating multiple - * times and see if we can get the desired number of - * pages from multiple allocations. 
- */ - - num_pages_alloc = 0; - do { - int i, cur_pages, needed; - - needed = num_pages - num_pages_alloc; - - pages = malloc(sizeof(*pages) * needed); - - /* do not request exact number of pages */ - cur_pages = eal_memalloc_alloc_seg_bulk(pages, - needed, hpi->hugepage_sz, - socket_id, false); - if (cur_pages <= 0) { - free(pages); - return -1; - } - - /* mark preallocated pages as unfreeable */ - for (i = 0; i < cur_pages; i++) { - struct rte_memseg *ms = pages[i]; - ms->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE; - } - free(pages); - - num_pages_alloc += cur_pages; - } while (num_pages_alloc != num_pages); - } - } - /* if socket limits were specified, set them */ - if (internal_config.force_socket_limits) { - unsigned int i; - for (i = 0; i < RTE_MAX_NUMA_NODES; i++) { - uint64_t limit = internal_config.socket_limit[i]; - if (limit == 0) - continue; - if (rte_mem_alloc_validator_register("socket-limit", - limits_callback, i, limit)) - RTE_LOG(ERR, EAL, "Failed to register socket limits validator callback\n"); - } - } - return 0; -} - -/* - * uses fstat to report the size of a file on disk - */ -static off_t -getFileSize(int fd) -{ - struct stat st; - if (fstat(fd, &st) < 0) - return 0; - return st.st_size; -} - -/* - * This creates the memory mappings in the secondary process to match that of - * the server process. It goes through each memory segment in the DPDK runtime - * configuration and finds the hugepages which form that segment, mapping them - * in order to form a contiguous block in the virtual memory space - */ -static int -eal_legacy_hugepage_attach(void) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - struct hugepage_file *hp = NULL; - unsigned int num_hp = 0; - unsigned int i = 0; - unsigned int cur_seg; - off_t size = 0; - int fd, fd_hugepage = -1; - - if (aslr_enabled() > 0) { - RTE_LOG(WARNING, EAL, "WARNING: Address Space Layout Randomization " - "(ASLR) is enabled in the kernel.\n"); - RTE_LOG(WARNING, EAL, " This may cause issues with mapping memory " - "into secondary processes\n"); - } - - fd_hugepage = open(eal_hugepage_data_path(), O_RDONLY); - if (fd_hugepage < 0) { - RTE_LOG(ERR, EAL, "Could not open %s\n", - eal_hugepage_data_path()); - goto error; - } - - size = getFileSize(fd_hugepage); - hp = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd_hugepage, 0); - if (hp == MAP_FAILED) { - RTE_LOG(ERR, EAL, "Could not mmap %s\n", - eal_hugepage_data_path()); - goto error; - } - - num_hp = size / sizeof(struct hugepage_file); - RTE_LOG(DEBUG, EAL, "Analysing %u files\n", num_hp); - - /* map all segments into memory to make sure we get the addrs. the - * segments themselves are already in memseg list (which is shared and - * has its VA space already preallocated), so we just need to map - * everything into correct addresses. - */ - for (i = 0; i < num_hp; i++) { - struct hugepage_file *hf = &hp[i]; - size_t map_sz = hf->size; - void *map_addr = hf->final_va; - int msl_idx, ms_idx; - struct rte_memseg_list *msl; - struct rte_memseg *ms; - - /* if size is zero, no more pages left */ - if (map_sz == 0) - break; - - fd = open(hf->filepath, O_RDWR); - if (fd < 0) { - RTE_LOG(ERR, EAL, "Could not open %s: %s\n", - hf->filepath, strerror(errno)); - goto error; - } - - map_addr = mmap(map_addr, map_sz, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_FIXED, fd, 0); - if (map_addr == MAP_FAILED) { - RTE_LOG(ERR, EAL, "Could not map %s: %s\n", - hf->filepath, strerror(errno)); - goto fd_error; - } - - /* set shared lock on the file. 
*/ - if (flock(fd, LOCK_SH) < 0) { - RTE_LOG(DEBUG, EAL, "%s(): Locking file failed: %s\n", - __func__, strerror(errno)); - goto mmap_error; - } - - /* find segment data */ - msl = rte_mem_virt2memseg_list(map_addr); - if (msl == NULL) { - RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg list\n", - __func__); - goto mmap_error; - } - ms = rte_mem_virt2memseg(map_addr, msl); - if (ms == NULL) { - RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg\n", - __func__); - goto mmap_error; - } - - msl_idx = msl - mcfg->memsegs; - ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); - if (ms_idx < 0) { - RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg idx\n", - __func__); - goto mmap_error; - } - - /* store segment fd internally */ - if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0) - RTE_LOG(ERR, EAL, "Could not store segment fd: %s\n", - rte_strerror(rte_errno)); - } - /* unmap the hugepage config file, since we are done using it */ - munmap(hp, size); - close(fd_hugepage); - return 0; - -mmap_error: - munmap(hp[i].final_va, hp[i].size); -fd_error: - close(fd); -error: - /* unwind mmap's done so far */ - for (cur_seg = 0; cur_seg < i; cur_seg++) - munmap(hp[cur_seg].final_va, hp[cur_seg].size); - - if (hp != NULL && hp != MAP_FAILED) - munmap(hp, size); - if (fd_hugepage >= 0) - close(fd_hugepage); - return -1; -} - -static int -eal_hugepage_attach(void) -{ - if (eal_memalloc_sync_with_primary()) { - RTE_LOG(ERR, EAL, "Could not map memory from primary process\n"); - if (aslr_enabled() > 0) - RTE_LOG(ERR, EAL, "It is recommended to disable ASLR in the kernel and retry running both primary and secondary processes\n"); - return -1; - } - return 0; -} - -int -rte_eal_hugepage_init(void) -{ - return internal_config.legacy_mem ? - eal_legacy_hugepage_init() : - eal_hugepage_init(); -} - -int -rte_eal_hugepage_attach(void) -{ - return internal_config.legacy_mem ? - eal_legacy_hugepage_attach() : - eal_hugepage_attach(); -} - -int -rte_eal_using_phys_addrs(void) -{ - if (phys_addrs_available == -1) { - uint64_t tmp = 0; - - if (rte_eal_has_hugepages() != 0 && - rte_mem_virt2phy(&tmp) != RTE_BAD_PHYS_ADDR) - phys_addrs_available = 1; - else - phys_addrs_available = 0; - } - return phys_addrs_available; -} - -static int __rte_unused -memseg_primary_init_32(void) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - int active_sockets, hpi_idx, msl_idx = 0; - unsigned int socket_id, i; - struct rte_memseg_list *msl; - uint64_t extra_mem_per_socket, total_extra_mem, total_requested_mem; - uint64_t max_mem; - - /* no-huge does not need this at all */ - if (internal_config.no_hugetlbfs) - return 0; - - /* this is a giant hack, but desperate times call for desperate - * measures. in legacy 32-bit mode, we cannot preallocate VA space, - * because having upwards of 2 gigabytes of VA space already mapped will - * interfere with our ability to map and sort hugepages. - * - * therefore, in legacy 32-bit mode, we will be initializing memseg - * lists much later - in eal_memory.c, right after we unmap all the - * unneeded pages. this will not affect secondary processes, as those - * should be able to mmap the space without (too many) problems. - */ - if (internal_config.legacy_mem) - return 0; - - /* 32-bit mode is a very special case. we cannot know in advance where - * the user will want to allocate their memory, so we have to do some - * heuristics. 
- */ - active_sockets = 0; - total_requested_mem = 0; - if (internal_config.force_sockets) - for (i = 0; i < rte_socket_count(); i++) { - uint64_t mem; - - socket_id = rte_socket_id_by_idx(i); - mem = internal_config.socket_mem[socket_id]; - - if (mem == 0) - continue; - - active_sockets++; - total_requested_mem += mem; - } - else - total_requested_mem = internal_config.memory; - - max_mem = (uint64_t)RTE_MAX_MEM_MB << 20; - if (total_requested_mem > max_mem) { - RTE_LOG(ERR, EAL, "Invalid parameters: 32-bit process can at most use %uM of memory\n", - (unsigned int)(max_mem >> 20)); - return -1; - } - total_extra_mem = max_mem - total_requested_mem; - extra_mem_per_socket = active_sockets == 0 ? total_extra_mem : - total_extra_mem / active_sockets; - - /* the allocation logic is a little bit convoluted, but here's how it - * works, in a nutshell: - * - if user hasn't specified on which sockets to allocate memory via - * --socket-mem, we allocate all of our memory on master core socket. - * - if user has specified sockets to allocate memory on, there may be - * some "unused" memory left (e.g. if user has specified --socket-mem - * such that not all memory adds up to 2 gigabytes), so add it to all - * sockets that are in use equally. - * - * page sizes are sorted by size in descending order, so we can safely - * assume that we dispense with bigger page sizes first. - */ - - /* create memseg lists */ - for (i = 0; i < rte_socket_count(); i++) { - int hp_sizes = (int) internal_config.num_hugepage_sizes; - uint64_t max_socket_mem, cur_socket_mem; - unsigned int master_lcore_socket; - struct rte_config *cfg = rte_eal_get_configuration(); - bool skip; - - socket_id = rte_socket_id_by_idx(i); - -#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES - /* we can still sort pages by socket in legacy mode */ - if (!internal_config.legacy_mem && socket_id > 0) - break; -#endif - - /* if we didn't specifically request memory on this socket */ - skip = active_sockets != 0 && - internal_config.socket_mem[socket_id] == 0; - /* ...or if we didn't specifically request memory on *any* - * socket, and this is not master lcore - */ - master_lcore_socket = rte_lcore_to_socket_id(cfg->master_lcore); - skip |= active_sockets == 0 && socket_id != master_lcore_socket; - - if (skip) { - RTE_LOG(DEBUG, EAL, "Will not preallocate memory on socket %u\n", - socket_id); - continue; - } - - /* max amount of memory on this socket */ - max_socket_mem = (active_sockets != 0 ? 
- internal_config.socket_mem[socket_id] : - internal_config.memory) + - extra_mem_per_socket; - cur_socket_mem = 0; - - for (hpi_idx = 0; hpi_idx < hp_sizes; hpi_idx++) { - uint64_t max_pagesz_mem, cur_pagesz_mem = 0; - uint64_t hugepage_sz; - struct hugepage_info *hpi; - int type_msl_idx, max_segs, total_segs = 0; - - hpi = &internal_config.hugepage_info[hpi_idx]; - hugepage_sz = hpi->hugepage_sz; - - /* check if pages are actually available */ - if (hpi->num_pages[socket_id] == 0) - continue; - - max_segs = RTE_MAX_MEMSEG_PER_TYPE; - max_pagesz_mem = max_socket_mem - cur_socket_mem; - - /* make it multiple of page size */ - max_pagesz_mem = RTE_ALIGN_FLOOR(max_pagesz_mem, - hugepage_sz); - - RTE_LOG(DEBUG, EAL, "Attempting to preallocate " - "%" PRIu64 "M on socket %i\n", - max_pagesz_mem >> 20, socket_id); - - type_msl_idx = 0; - while (cur_pagesz_mem < max_pagesz_mem && - total_segs < max_segs) { - uint64_t cur_mem; - unsigned int n_segs; - - if (msl_idx >= RTE_MAX_MEMSEG_LISTS) { - RTE_LOG(ERR, EAL, - "No more space in memseg lists, please increase %s\n", - RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); - return -1; - } - - msl = &mcfg->memsegs[msl_idx]; - - cur_mem = get_mem_amount(hugepage_sz, - max_pagesz_mem); - n_segs = cur_mem / hugepage_sz; - - if (alloc_memseg_list(msl, hugepage_sz, n_segs, - socket_id, type_msl_idx)) { - /* failing to allocate a memseg list is - * a serious error. - */ - RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n"); - return -1; - } - - if (alloc_va_space(msl)) { - /* if we couldn't allocate VA space, we - * can try with smaller page sizes. - */ - RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list, retrying with different page size\n"); - /* deallocate memseg list */ - if (free_memseg_list(msl)) - return -1; - break; - } - - total_segs += msl->memseg_arr.len; - cur_pagesz_mem = total_segs * hugepage_sz; - type_msl_idx++; - msl_idx++; - } - cur_socket_mem += cur_pagesz_mem; - } - if (cur_socket_mem == 0) { - RTE_LOG(ERR, EAL, "Cannot allocate VA space on socket %u\n", - socket_id); - return -1; - } - } - - return 0; -} - -static int __rte_unused -memseg_primary_init(void) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - struct memtype { - uint64_t page_sz; - int socket_id; - } *memtypes = NULL; - int i, hpi_idx, msl_idx, ret = -1; /* fail unless told to succeed */ - struct rte_memseg_list *msl; - uint64_t max_mem, max_mem_per_type; - unsigned int max_seglists_per_type; - unsigned int n_memtypes, cur_type; - - /* no-huge does not need this at all */ - if (internal_config.no_hugetlbfs) - return 0; - - /* - * figuring out amount of memory we're going to have is a long and very - * involved process. the basic element we're operating with is a memory - * type, defined as a combination of NUMA node ID and page size (so that - * e.g. 2 sockets with 2 page sizes yield 4 memory types in total). - * - * deciding amount of memory going towards each memory type is a - * balancing act between maximum segments per type, maximum memory per - * type, and number of detected NUMA nodes. the goal is to make sure - * each memory type gets at least one memseg list. - * - * the total amount of memory is limited by RTE_MAX_MEM_MB value. - * - * the total amount of memory per type is limited by either - * RTE_MAX_MEM_MB_PER_TYPE, or by RTE_MAX_MEM_MB divided by the number - * of detected NUMA nodes. additionally, maximum number of segments per - * type is also limited by RTE_MAX_MEMSEG_PER_TYPE. 
this is because for - * smaller page sizes, it can take hundreds of thousands of segments to - * reach the above specified per-type memory limits. - * - * additionally, each type may have multiple memseg lists associated - * with it, each limited by either RTE_MAX_MEM_MB_PER_LIST for bigger - * page sizes, or RTE_MAX_MEMSEG_PER_LIST segments for smaller ones. - * - * the number of memseg lists per type is decided based on the above - * limits, and also taking number of detected NUMA nodes, to make sure - * that we don't run out of memseg lists before we populate all NUMA - * nodes with memory. - * - * we do this in three stages. first, we collect the number of types. - * then, we figure out memory constraints and populate the list of - * would-be memseg lists. then, we go ahead and allocate the memseg - * lists. - */ - - /* create space for mem types */ - n_memtypes = internal_config.num_hugepage_sizes * rte_socket_count(); - memtypes = calloc(n_memtypes, sizeof(*memtypes)); - if (memtypes == NULL) { - RTE_LOG(ERR, EAL, "Cannot allocate space for memory types\n"); - return -1; - } - - /* populate mem types */ - cur_type = 0; - for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes; - hpi_idx++) { - struct hugepage_info *hpi; - uint64_t hugepage_sz; - - hpi = &internal_config.hugepage_info[hpi_idx]; - hugepage_sz = hpi->hugepage_sz; - - for (i = 0; i < (int) rte_socket_count(); i++, cur_type++) { - int socket_id = rte_socket_id_by_idx(i); - -#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES - /* we can still sort pages by socket in legacy mode */ - if (!internal_config.legacy_mem && socket_id > 0) - break; -#endif - memtypes[cur_type].page_sz = hugepage_sz; - memtypes[cur_type].socket_id = socket_id; - - RTE_LOG(DEBUG, EAL, "Detected memory type: " - "socket_id:%u hugepage_sz:%" PRIu64 "\n", - socket_id, hugepage_sz); - } - } - /* number of memtypes could have been lower due to no NUMA support */ - n_memtypes = cur_type; - - /* set up limits for types */ - max_mem = (uint64_t)RTE_MAX_MEM_MB << 20; - max_mem_per_type = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20, - max_mem / n_memtypes); - /* - * limit maximum number of segment lists per type to ensure there's - * space for memseg lists for all NUMA nodes with all page sizes - */ - max_seglists_per_type = RTE_MAX_MEMSEG_LISTS / n_memtypes; - - if (max_seglists_per_type == 0) { - RTE_LOG(ERR, EAL, "Cannot accommodate all memory types, please increase %s\n", - RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); - goto out; - } - - /* go through all mem types and create segment lists */ - msl_idx = 0; - for (cur_type = 0; cur_type < n_memtypes; cur_type++) { - unsigned int cur_seglist, n_seglists, n_segs; - unsigned int max_segs_per_type, max_segs_per_list; - struct memtype *type = &memtypes[cur_type]; - uint64_t max_mem_per_list, pagesz; - int socket_id; - - pagesz = type->page_sz; - socket_id = type->socket_id; - - /* - * we need to create segment lists for this type. we must take - * into account the following things: - * - * 1. total amount of memory we can use for this memory type - * 2. total amount of memory per memseg list allowed - * 3. number of segments needed to fit the amount of memory - * 4. number of segments allowed per type - * 5. number of segments allowed per memseg list - * 6. 
number of memseg lists we are allowed to take up - */ - - /* calculate how much segments we will need in total */ - max_segs_per_type = max_mem_per_type / pagesz; - /* limit number of segments to maximum allowed per type */ - max_segs_per_type = RTE_MIN(max_segs_per_type, - (unsigned int)RTE_MAX_MEMSEG_PER_TYPE); - /* limit number of segments to maximum allowed per list */ - max_segs_per_list = RTE_MIN(max_segs_per_type, - (unsigned int)RTE_MAX_MEMSEG_PER_LIST); - - /* calculate how much memory we can have per segment list */ - max_mem_per_list = RTE_MIN(max_segs_per_list * pagesz, - (uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20); - - /* calculate how many segments each segment list will have */ - n_segs = RTE_MIN(max_segs_per_list, max_mem_per_list / pagesz); - - /* calculate how many segment lists we can have */ - n_seglists = RTE_MIN(max_segs_per_type / n_segs, - max_mem_per_type / max_mem_per_list); - - /* limit number of segment lists according to our maximum */ - n_seglists = RTE_MIN(n_seglists, max_seglists_per_type); - - RTE_LOG(DEBUG, EAL, "Creating %i segment lists: " - "n_segs:%i socket_id:%i hugepage_sz:%" PRIu64 "\n", - n_seglists, n_segs, socket_id, pagesz); - - /* create all segment lists */ - for (cur_seglist = 0; cur_seglist < n_seglists; cur_seglist++) { - if (msl_idx >= RTE_MAX_MEMSEG_LISTS) { - RTE_LOG(ERR, EAL, - "No more space in memseg lists, please increase %s\n", - RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); - goto out; - } - msl = &mcfg->memsegs[msl_idx++]; - - if (alloc_memseg_list(msl, pagesz, n_segs, - socket_id, cur_seglist)) - goto out; - - if (alloc_va_space(msl)) { - RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n"); - goto out; - } - } - } - /* we're successful */ - ret = 0; -out: - free(memtypes); - return ret; -} - -static int -memseg_secondary_init(void) -{ - struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; - int msl_idx = 0; - struct rte_memseg_list *msl; - - for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) { - - msl = &mcfg->memsegs[msl_idx]; - - /* skip empty memseg lists */ - if (msl->memseg_arr.len == 0) - continue; - - if (rte_fbarray_attach(&msl->memseg_arr)) { - RTE_LOG(ERR, EAL, "Cannot attach to primary process memseg lists\n"); - return -1; - } - - /* preallocate VA space */ - if (alloc_va_space(msl)) { - RTE_LOG(ERR, EAL, "Cannot preallocate VA space for hugepage memory\n"); - return -1; - } - } - - return 0; -} - -int -rte_eal_memseg_init(void) -{ - /* increase rlimit to maximum */ - struct rlimit lim; - - if (getrlimit(RLIMIT_NOFILE, &lim) == 0) { - /* set limit to maximum */ - lim.rlim_cur = lim.rlim_max; - - if (setrlimit(RLIMIT_NOFILE, &lim) < 0) { - RTE_LOG(DEBUG, EAL, "Setting maximum number of open files failed: %s\n", - strerror(errno)); - } else { - RTE_LOG(DEBUG, EAL, "Setting maximum number of open files to %" - PRIu64 "\n", - (uint64_t)lim.rlim_cur); - } - } else { - RTE_LOG(ERR, EAL, "Cannot get current resource limits\n"); - } -#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES - if (!internal_config.legacy_mem && rte_socket_count() > 1) { - RTE_LOG(WARNING, EAL, "DPDK is running on a NUMA system, but is compiled without NUMA support.\n"); - RTE_LOG(WARNING, EAL, "This will have adverse consequences for performance and usability.\n"); - RTE_LOG(WARNING, EAL, "Please use --"OPT_LEGACY_MEM" option, or recompile with NUMA support.\n"); - } -#endif - - return rte_eal_process_type() == RTE_PROC_PRIMARY ? 
-#ifndef RTE_ARCH_64 - memseg_primary_init_32() : -#else - memseg_primary_init() : -#endif - memseg_secondary_init(); -} diff --git a/lib/librte_eal/linux/eal/eal_thread.c b/lib/librte_eal/linux/eal/eal_thread.c deleted file mode 100644 index 379773b683..0000000000 --- a/lib/librte_eal/linux/eal/eal_thread.c +++ /dev/null @@ -1,188 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2014 Intel Corporation - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "eal_private.h" -#include "eal_thread.h" - -RTE_DEFINE_PER_LCORE(unsigned, _lcore_id) = LCORE_ID_ANY; -RTE_DEFINE_PER_LCORE(unsigned, _socket_id) = (unsigned)SOCKET_ID_ANY; -RTE_DEFINE_PER_LCORE(rte_cpuset_t, _cpuset); - -/* - * Send a message to a slave lcore identified by slave_id to call a - * function f with argument arg. Once the execution is done, the - * remote lcore switch in FINISHED state. - */ -int -rte_eal_remote_launch(int (*f)(void *), void *arg, unsigned slave_id) -{ - int n; - char c = 0; - int m2s = lcore_config[slave_id].pipe_master2slave[1]; - int s2m = lcore_config[slave_id].pipe_slave2master[0]; - - if (lcore_config[slave_id].state != WAIT) - return -EBUSY; - - lcore_config[slave_id].f = f; - lcore_config[slave_id].arg = arg; - - /* send message */ - n = 0; - while (n == 0 || (n < 0 && errno == EINTR)) - n = write(m2s, &c, 1); - if (n < 0) - rte_panic("cannot write on configuration pipe\n"); - - /* wait ack */ - do { - n = read(s2m, &c, 1); - } while (n < 0 && errno == EINTR); - - if (n <= 0) - rte_panic("cannot read on configuration pipe\n"); - - return 0; -} - -/* set affinity for current EAL thread */ -static int -eal_thread_set_affinity(void) -{ - unsigned lcore_id = rte_lcore_id(); - - /* acquire system unique id */ - rte_gettid(); - - /* update EAL thread core affinity */ - return rte_thread_set_affinity(&lcore_config[lcore_id].cpuset); -} - -void eal_thread_init_master(unsigned lcore_id) -{ - /* set the lcore ID in per-lcore memory area */ - RTE_PER_LCORE(_lcore_id) = lcore_id; - - /* set CPU affinity */ - if (eal_thread_set_affinity() < 0) - rte_panic("cannot set affinity\n"); -} - -/* main loop of threads */ -__attribute__((noreturn)) void * -eal_thread_loop(__attribute__((unused)) void *arg) -{ - char c; - int n, ret; - unsigned lcore_id; - pthread_t thread_id; - int m2s, s2m; - char cpuset[RTE_CPU_AFFINITY_STR_LEN]; - - thread_id = pthread_self(); - - /* retrieve our lcore_id from the configuration structure */ - RTE_LCORE_FOREACH_SLAVE(lcore_id) { - if (thread_id == lcore_config[lcore_id].thread_id) - break; - } - if (lcore_id == RTE_MAX_LCORE) - rte_panic("cannot retrieve lcore id\n"); - - m2s = lcore_config[lcore_id].pipe_master2slave[0]; - s2m = lcore_config[lcore_id].pipe_slave2master[1]; - - /* set the lcore ID in per-lcore memory area */ - RTE_PER_LCORE(_lcore_id) = lcore_id; - - /* set CPU affinity */ - if (eal_thread_set_affinity() < 0) - rte_panic("cannot set affinity\n"); - - ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset)); - - RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%zx;cpuset=[%s%s])\n", - lcore_id, (uintptr_t)thread_id, cpuset, ret == 0 ? 
"" : "..."); - - /* read on our pipe to get commands */ - while (1) { - void *fct_arg; - - /* wait command */ - do { - n = read(m2s, &c, 1); - } while (n < 0 && errno == EINTR); - - if (n <= 0) - rte_panic("cannot read on configuration pipe\n"); - - lcore_config[lcore_id].state = RUNNING; - - /* send ack */ - n = 0; - while (n == 0 || (n < 0 && errno == EINTR)) - n = write(s2m, &c, 1); - if (n < 0) - rte_panic("cannot write on configuration pipe\n"); - - if (lcore_config[lcore_id].f == NULL) - rte_panic("NULL function pointer\n"); - - /* call the function and store the return value */ - fct_arg = lcore_config[lcore_id].arg; - ret = lcore_config[lcore_id].f(fct_arg); - lcore_config[lcore_id].ret = ret; - rte_wmb(); - - /* when a service core returns, it should go directly to WAIT - * state, because the application will not lcore_wait() for it. - */ - if (lcore_config[lcore_id].core_role == ROLE_SERVICE) - lcore_config[lcore_id].state = WAIT; - else - lcore_config[lcore_id].state = FINISHED; - } - - /* never reached */ - /* pthread_exit(NULL); */ - /* return NULL; */ -} - -/* require calling thread tid by gettid() */ -int rte_sys_gettid(void) -{ - return (int)syscall(SYS_gettid); -} - -int rte_thread_setname(pthread_t id, const char *name) -{ - int ret = ENOSYS; -#if defined(__GLIBC__) && defined(__GLIBC_PREREQ) -#if __GLIBC_PREREQ(2, 12) - ret = pthread_setname_np(id, name); -#endif -#endif - RTE_SET_USED(id); - RTE_SET_USED(name); - return -ret; -} diff --git a/lib/librte_eal/linux/eal/eal_timer.c b/lib/librte_eal/linux/eal/eal_timer.c deleted file mode 100644 index a904a8297c..0000000000 --- a/lib/librte_eal/linux/eal/eal_timer.c +++ /dev/null @@ -1,232 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2014 Intel Corporation. - * Copyright(c) 2012-2013 6WIND S.A. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "eal_private.h" -#include "eal_internal_cfg.h" - -enum timer_source eal_timer_source = EAL_TIMER_HPET; - -#ifdef RTE_LIBEAL_USE_HPET - -#define DEV_HPET "/dev/hpet" - -/* Maximum number of counters. */ -#define HPET_TIMER_NUM 3 - -/* General capabilities register */ -#define CLK_PERIOD_SHIFT 32 /* Clock period shift. */ -#define CLK_PERIOD_MASK 0xffffffff00000000ULL /* Clock period mask. */ - -/** - * HPET timer registers. From the Intel IA-PC HPET (High Precision Event - * Timers) Specification. - */ -struct eal_hpet_regs { - /* Memory-mapped, software visible registers */ - uint64_t capabilities; /**< RO General Capabilities Register. */ - uint64_t reserved0; /**< Reserved for future use. */ - uint64_t config; /**< RW General Configuration Register. */ - uint64_t reserved1; /**< Reserved for future use. */ - uint64_t isr; /**< RW Clear General Interrupt Status. */ - uint64_t reserved2[25]; /**< Reserved for future use. */ - union { - uint64_t counter; /**< RW Main Counter Value Register. */ - struct { - uint32_t counter_l; /**< RW Main Counter Low. */ - uint32_t counter_h; /**< RW Main Counter High. */ - }; - }; - uint64_t reserved3; /**< Reserved for future use. */ - struct { - uint64_t config; /**< RW Timer Config and Capability Reg. */ - uint64_t comp; /**< RW Timer Comparator Value Register. */ - uint64_t fsb; /**< RW FSB Interrupt Route Register. */ - uint64_t reserved4; /**< Reserved for future use. */ - } timers[HPET_TIMER_NUM]; /**< Set of HPET timers. 
*/ -}; - -/* Mmap'd hpet registers */ -static volatile struct eal_hpet_regs *eal_hpet = NULL; - -/* Period at which the HPET counter increments in - * femtoseconds (10^-15 seconds). */ -static uint32_t eal_hpet_resolution_fs = 0; - -/* Frequency of the HPET counter in Hz */ -static uint64_t eal_hpet_resolution_hz = 0; - -/* Incremented 4 times during one 32bits hpet full count */ -static uint32_t eal_hpet_msb; - -static pthread_t msb_inc_thread_id; - -/* - * This function runs on a specific thread to update a global variable - * containing used to process MSB of the HPET (unfortunately, we need - * this because hpet is 32 bits by default under linux). - */ -static void * -hpet_msb_inc(__attribute__((unused)) void *arg) -{ - uint32_t t; - - while (1) { - t = (eal_hpet->counter_l >> 30); - if (t != (eal_hpet_msb & 3)) - eal_hpet_msb ++; - sleep(10); - } - return NULL; -} - -uint64_t -rte_get_hpet_hz(void) -{ - if(internal_config.no_hpet) - rte_panic("Error, HPET called, but no HPET present\n"); - - return eal_hpet_resolution_hz; -} - -uint64_t -rte_get_hpet_cycles(void) -{ - uint32_t t, msb; - uint64_t ret; - - if(internal_config.no_hpet) - rte_panic("Error, HPET called, but no HPET present\n"); - - t = eal_hpet->counter_l; - msb = eal_hpet_msb; - ret = (msb + 2 - (t >> 30)) / 4; - ret <<= 32; - ret += t; - return ret; -} - -#endif - -#ifdef RTE_LIBEAL_USE_HPET -/* - * Open and mmap /dev/hpet (high precision event timer) that will - * provide our time reference. - */ -int -rte_eal_hpet_init(int make_default) -{ - int fd, ret; - - if (internal_config.no_hpet) { - RTE_LOG(NOTICE, EAL, "HPET is disabled\n"); - return -1; - } - - fd = open(DEV_HPET, O_RDONLY); - if (fd < 0) { - RTE_LOG(ERR, EAL, "ERROR: Cannot open "DEV_HPET": %s!\n", - strerror(errno)); - internal_config.no_hpet = 1; - return -1; - } - eal_hpet = mmap(NULL, 1024, PROT_READ, MAP_SHARED, fd, 0); - if (eal_hpet == MAP_FAILED) { - RTE_LOG(ERR, EAL, "ERROR: Cannot mmap "DEV_HPET"!\n" - "Please enable CONFIG_HPET_MMAP in your kernel configuration " - "to allow HPET support.\n" - "To run without using HPET, set CONFIG_RTE_LIBEAL_USE_HPET=n " - "in your build configuration or use '--no-hpet' EAL flag.\n"); - close(fd); - internal_config.no_hpet = 1; - return -1; - } - close(fd); - - eal_hpet_resolution_fs = (uint32_t)((eal_hpet->capabilities & - CLK_PERIOD_MASK) >> - CLK_PERIOD_SHIFT); - - eal_hpet_resolution_hz = (1000ULL*1000ULL*1000ULL*1000ULL*1000ULL) / - (uint64_t)eal_hpet_resolution_fs; - - RTE_LOG(INFO, EAL, "HPET frequency is ~%"PRIu64" kHz\n", - eal_hpet_resolution_hz/1000); - - eal_hpet_msb = (eal_hpet->counter_l >> 30); - - /* create a thread that will increment a global variable for - * msb (hpet is 32 bits by default under linux) */ - ret = rte_ctrl_thread_create(&msb_inc_thread_id, "hpet-msb-inc", NULL, - hpet_msb_inc, NULL); - if (ret != 0) { - RTE_LOG(ERR, EAL, "ERROR: Cannot create HPET timer thread!\n"); - internal_config.no_hpet = 1; - return -1; - } - - if (make_default) - eal_timer_source = EAL_TIMER_HPET; - return 0; -} -#endif - -uint64_t -get_tsc_freq(void) -{ -#ifdef CLOCK_MONOTONIC_RAW -#define NS_PER_SEC 1E9 -#define CYC_PER_10MHZ 1E7 - - struct timespec sleeptime = {.tv_nsec = NS_PER_SEC / 10 }; /* 1/10 second */ - - struct timespec t_start, t_end; - uint64_t tsc_hz; - - if (clock_gettime(CLOCK_MONOTONIC_RAW, &t_start) == 0) { - uint64_t ns, end, start = rte_rdtsc(); - nanosleep(&sleeptime,NULL); - clock_gettime(CLOCK_MONOTONIC_RAW, &t_end); - end = rte_rdtsc(); - ns = ((t_end.tv_sec - t_start.tv_sec) * 
NS_PER_SEC); - ns += (t_end.tv_nsec - t_start.tv_nsec); - - double secs = (double)ns/NS_PER_SEC; - tsc_hz = (uint64_t)((end - start)/secs); - /* Round up to 10Mhz. 1E7 ~ 10Mhz */ - return RTE_ALIGN_MUL_NEAR(tsc_hz, CYC_PER_10MHZ); - } -#endif - return 0; -} - -int -rte_eal_timer_init(void) -{ - - eal_timer_source = EAL_TIMER_TSC; - - set_tsc_freq(); - return 0; -} diff --git a/lib/librte_eal/linux/eal/eal_vfio.c b/lib/librte_eal/linux/eal/eal_vfio.c deleted file mode 100644 index 4502aefed3..0000000000 --- a/lib/librte_eal/linux/eal/eal_vfio.c +++ /dev/null @@ -1,2184 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2018 Intel Corporation - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "eal_filesystem.h" -#include "eal_memcfg.h" -#include "eal_vfio.h" -#include "eal_private.h" - -#ifdef VFIO_PRESENT - -#define VFIO_MEM_EVENT_CLB_NAME "vfio_mem_event_clb" - -/* hot plug/unplug of VFIO groups may cause all DMA maps to be dropped. we can - * recreate the mappings for DPDK segments, but we cannot do so for memory that - * was registered by the user themselves, so we need to store the user mappings - * somewhere, to recreate them later. - */ -#define VFIO_MAX_USER_MEM_MAPS 256 -struct user_mem_map { - uint64_t addr; - uint64_t iova; - uint64_t len; -}; - -struct user_mem_maps { - rte_spinlock_recursive_t lock; - int n_maps; - struct user_mem_map maps[VFIO_MAX_USER_MEM_MAPS]; -}; - -struct vfio_config { - int vfio_enabled; - int vfio_container_fd; - int vfio_active_groups; - const struct vfio_iommu_type *vfio_iommu_type; - struct vfio_group vfio_groups[VFIO_MAX_GROUPS]; - struct user_mem_maps mem_maps; -}; - -/* per-process VFIO config */ -static struct vfio_config vfio_cfgs[VFIO_MAX_CONTAINERS]; -static struct vfio_config *default_vfio_cfg = &vfio_cfgs[0]; - -static int vfio_type1_dma_map(int); -static int vfio_type1_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int); -static int vfio_spapr_dma_map(int); -static int vfio_spapr_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int); -static int vfio_noiommu_dma_map(int); -static int vfio_noiommu_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int); -static int vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr, - uint64_t iova, uint64_t len, int do_map); - -/* IOMMU types we support */ -static const struct vfio_iommu_type iommu_types[] = { - /* x86 IOMMU, otherwise known as type 1 */ - { - .type_id = RTE_VFIO_TYPE1, - .name = "Type 1", - .dma_map_func = &vfio_type1_dma_map, - .dma_user_map_func = &vfio_type1_dma_mem_map - }, - /* ppc64 IOMMU, otherwise known as spapr */ - { - .type_id = RTE_VFIO_SPAPR, - .name = "sPAPR", - .dma_map_func = &vfio_spapr_dma_map, - .dma_user_map_func = &vfio_spapr_dma_mem_map - }, - /* IOMMU-less mode */ - { - .type_id = RTE_VFIO_NOIOMMU, - .name = "No-IOMMU", - .dma_map_func = &vfio_noiommu_dma_map, - .dma_user_map_func = &vfio_noiommu_dma_mem_map - }, -}; - -static int -is_null_map(const struct user_mem_map *map) -{ - return map->addr == 0 && map->iova == 0 && map->len == 0; -} - -/* we may need to merge user mem maps together in case of user mapping/unmapping - * chunks of memory, so we'll need a comparator function to sort segments. 
- */ -static int -user_mem_map_cmp(const void *a, const void *b) -{ - const struct user_mem_map *umm_a = a; - const struct user_mem_map *umm_b = b; - - /* move null entries to end */ - if (is_null_map(umm_a)) - return 1; - if (is_null_map(umm_b)) - return -1; - - /* sort by iova first */ - if (umm_a->iova < umm_b->iova) - return -1; - if (umm_a->iova > umm_b->iova) - return 1; - - if (umm_a->addr < umm_b->addr) - return -1; - if (umm_a->addr > umm_b->addr) - return 1; - - if (umm_a->len < umm_b->len) - return -1; - if (umm_a->len > umm_b->len) - return 1; - - return 0; -} - -/* adjust user map entry. this may result in shortening of existing map, or in - * splitting existing map in two pieces. - */ -static void -adjust_map(struct user_mem_map *src, struct user_mem_map *end, - uint64_t remove_va_start, uint64_t remove_len) -{ - /* if va start is same as start address, we're simply moving start */ - if (remove_va_start == src->addr) { - src->addr += remove_len; - src->iova += remove_len; - src->len -= remove_len; - } else if (remove_va_start + remove_len == src->addr + src->len) { - /* we're shrinking mapping from the end */ - src->len -= remove_len; - } else { - /* we're blowing a hole in the middle */ - struct user_mem_map tmp; - uint64_t total_len = src->len; - - /* adjust source segment length */ - src->len = remove_va_start - src->addr; - - /* create temporary segment in the middle */ - tmp.addr = src->addr + src->len; - tmp.iova = src->iova + src->len; - tmp.len = remove_len; - - /* populate end segment - this one we will be keeping */ - end->addr = tmp.addr + tmp.len; - end->iova = tmp.iova + tmp.len; - end->len = total_len - src->len - tmp.len; - } -} - -/* try merging two maps into one, return 1 if succeeded */ -static int -merge_map(struct user_mem_map *left, struct user_mem_map *right) -{ - if (left->addr + left->len != right->addr) - return 0; - if (left->iova + left->len != right->iova) - return 0; - - left->len += right->len; - - memset(right, 0, sizeof(*right)); - - return 1; -} - -static struct user_mem_map * -find_user_mem_map(struct user_mem_maps *user_mem_maps, uint64_t addr, - uint64_t iova, uint64_t len) -{ - uint64_t va_end = addr + len; - uint64_t iova_end = iova + len; - int i; - - for (i = 0; i < user_mem_maps->n_maps; i++) { - struct user_mem_map *map = &user_mem_maps->maps[i]; - uint64_t map_va_end = map->addr + map->len; - uint64_t map_iova_end = map->iova + map->len; - - /* check start VA */ - if (addr < map->addr || addr >= map_va_end) - continue; - /* check if VA end is within boundaries */ - if (va_end <= map->addr || va_end > map_va_end) - continue; - - /* check start IOVA */ - if (iova < map->iova || iova >= map_iova_end) - continue; - /* check if IOVA end is within boundaries */ - if (iova_end <= map->iova || iova_end > map_iova_end) - continue; - - /* we've found our map */ - return map; - } - return NULL; -} - -/* this will sort all user maps, and merge/compact any adjacent maps */ -static void -compact_user_maps(struct user_mem_maps *user_mem_maps) -{ - int i, n_merged, cur_idx; - - qsort(user_mem_maps->maps, user_mem_maps->n_maps, - sizeof(user_mem_maps->maps[0]), user_mem_map_cmp); - - /* we'll go over the list backwards when merging */ - n_merged = 0; - for (i = user_mem_maps->n_maps - 2; i >= 0; i--) { - struct user_mem_map *l, *r; - - l = &user_mem_maps->maps[i]; - r = &user_mem_maps->maps[i + 1]; - - if (is_null_map(l) || is_null_map(r)) - continue; - - if (merge_map(l, r)) - n_merged++; - } - - /* the entries are still sorted, but now they have 
holes in them, so - * walk through the list and remove the holes - */ - if (n_merged > 0) { - cur_idx = 0; - for (i = 0; i < user_mem_maps->n_maps; i++) { - if (!is_null_map(&user_mem_maps->maps[i])) { - struct user_mem_map *src, *dst; - - src = &user_mem_maps->maps[i]; - dst = &user_mem_maps->maps[cur_idx++]; - - if (src != dst) { - memcpy(dst, src, sizeof(*src)); - memset(src, 0, sizeof(*src)); - } - } - } - user_mem_maps->n_maps = cur_idx; - } -} - -static int -vfio_open_group_fd(int iommu_group_num) -{ - int vfio_group_fd; - char filename[PATH_MAX]; - struct rte_mp_msg mp_req, *mp_rep; - struct rte_mp_reply mp_reply = {0}; - struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; - struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; - - /* if primary, try to open the group */ - if (internal_config.process_type == RTE_PROC_PRIMARY) { - /* try regular group format */ - snprintf(filename, sizeof(filename), - VFIO_GROUP_FMT, iommu_group_num); - vfio_group_fd = open(filename, O_RDWR); - if (vfio_group_fd < 0) { - /* if file not found, it's not an error */ - if (errno != ENOENT) { - RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename, - strerror(errno)); - return -1; - } - - /* special case: try no-IOMMU path as well */ - snprintf(filename, sizeof(filename), - VFIO_NOIOMMU_GROUP_FMT, - iommu_group_num); - vfio_group_fd = open(filename, O_RDWR); - if (vfio_group_fd < 0) { - if (errno != ENOENT) { - RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename, - strerror(errno)); - return -1; - } - return 0; - } - /* noiommu group found */ - } - - return vfio_group_fd; - } - /* if we're in a secondary process, request group fd from the primary - * process via mp channel. - */ - p->req = SOCKET_REQ_GROUP; - p->group_num = iommu_group_num; - strcpy(mp_req.name, EAL_VFIO_MP); - mp_req.len_param = sizeof(*p); - mp_req.num_fds = 0; - - vfio_group_fd = -1; - if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && - mp_reply.nb_received == 1) { - mp_rep = &mp_reply.msgs[0]; - p = (struct vfio_mp_param *)mp_rep->param; - if (p->result == SOCKET_OK && mp_rep->num_fds == 1) { - vfio_group_fd = mp_rep->fds[0]; - } else if (p->result == SOCKET_NO_FD) { - RTE_LOG(ERR, EAL, " bad VFIO group fd\n"); - vfio_group_fd = 0; - } - } - - free(mp_reply.msgs); - if (vfio_group_fd < 0) - RTE_LOG(ERR, EAL, " cannot request group fd\n"); - return vfio_group_fd; -} - -static struct vfio_config * -get_vfio_cfg_by_group_num(int iommu_group_num) -{ - struct vfio_config *vfio_cfg; - int i, j; - - for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { - vfio_cfg = &vfio_cfgs[i]; - for (j = 0; j < VFIO_MAX_GROUPS; j++) { - if (vfio_cfg->vfio_groups[j].group_num == - iommu_group_num) - return vfio_cfg; - } - } - - return NULL; -} - -static int -vfio_get_group_fd(struct vfio_config *vfio_cfg, - int iommu_group_num) -{ - int i; - int vfio_group_fd; - struct vfio_group *cur_grp; - - /* check if we already have the group descriptor open */ - for (i = 0; i < VFIO_MAX_GROUPS; i++) - if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num) - return vfio_cfg->vfio_groups[i].fd; - - /* Lets see first if there is room for a new group */ - if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) { - RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n"); - return -1; - } - - /* Now lets get an index for the new group */ - for (i = 0; i < VFIO_MAX_GROUPS; i++) - if (vfio_cfg->vfio_groups[i].group_num == -1) { - cur_grp = &vfio_cfg->vfio_groups[i]; - break; - } - - /* This should not happen */ - if (i == VFIO_MAX_GROUPS) { - RTE_LOG(ERR, EAL, "No 
VFIO group free slot found\n"); - return -1; - } - - vfio_group_fd = vfio_open_group_fd(iommu_group_num); - if (vfio_group_fd < 0) { - RTE_LOG(ERR, EAL, "Failed to open group %d\n", iommu_group_num); - return -1; - } - - cur_grp->group_num = iommu_group_num; - cur_grp->fd = vfio_group_fd; - vfio_cfg->vfio_active_groups++; - - return vfio_group_fd; -} - -static struct vfio_config * -get_vfio_cfg_by_group_fd(int vfio_group_fd) -{ - struct vfio_config *vfio_cfg; - int i, j; - - for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { - vfio_cfg = &vfio_cfgs[i]; - for (j = 0; j < VFIO_MAX_GROUPS; j++) - if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd) - return vfio_cfg; - } - - return NULL; -} - -static struct vfio_config * -get_vfio_cfg_by_container_fd(int container_fd) -{ - int i; - - if (container_fd == RTE_VFIO_DEFAULT_CONTAINER_FD) - return default_vfio_cfg; - - for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { - if (vfio_cfgs[i].vfio_container_fd == container_fd) - return &vfio_cfgs[i]; - } - - return NULL; -} - -int -rte_vfio_get_group_fd(int iommu_group_num) -{ - struct vfio_config *vfio_cfg; - - /* get the vfio_config it belongs to */ - vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num); - vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg; - - return vfio_get_group_fd(vfio_cfg, iommu_group_num); -} - -static int -get_vfio_group_idx(int vfio_group_fd) -{ - struct vfio_config *vfio_cfg; - int i, j; - - for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { - vfio_cfg = &vfio_cfgs[i]; - for (j = 0; j < VFIO_MAX_GROUPS; j++) - if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd) - return j; - } - - return -1; -} - -static void -vfio_group_device_get(int vfio_group_fd) -{ - struct vfio_config *vfio_cfg; - int i; - - vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd); - if (vfio_cfg == NULL) { - RTE_LOG(ERR, EAL, " invalid group fd!\n"); - return; - } - - i = get_vfio_group_idx(vfio_group_fd); - if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) - RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i); - else - vfio_cfg->vfio_groups[i].devices++; -} - -static void -vfio_group_device_put(int vfio_group_fd) -{ - struct vfio_config *vfio_cfg; - int i; - - vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd); - if (vfio_cfg == NULL) { - RTE_LOG(ERR, EAL, " invalid group fd!\n"); - return; - } - - i = get_vfio_group_idx(vfio_group_fd); - if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) - RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i); - else - vfio_cfg->vfio_groups[i].devices--; -} - -static int -vfio_group_device_count(int vfio_group_fd) -{ - struct vfio_config *vfio_cfg; - int i; - - vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd); - if (vfio_cfg == NULL) { - RTE_LOG(ERR, EAL, " invalid group fd!\n"); - return -1; - } - - i = get_vfio_group_idx(vfio_group_fd); - if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) { - RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i); - return -1; - } - - return vfio_cfg->vfio_groups[i].devices; -} - -static void -vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len, - void *arg __rte_unused) -{ - rte_iova_t iova_start, iova_expected; - struct rte_memseg_list *msl; - struct rte_memseg *ms; - size_t cur_len = 0; - uint64_t va_start; - - msl = rte_mem_virt2memseg_list(addr); - - /* for IOVA as VA mode, no need to care for IOVA addresses */ - if (rte_eal_iova_mode() == RTE_IOVA_VA && msl->external == 0) { - uint64_t vfio_va = (uint64_t)(uintptr_t)addr; - if (type == RTE_MEM_EVENT_ALLOC) - vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va, - len, 1); - else - 
vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va, - len, 0); - return; - } - -#ifdef RTE_ARCH_PPC_64 - ms = rte_mem_virt2memseg(addr, msl); - while (cur_len < len) { - int idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); - - rte_fbarray_set_free(&msl->memseg_arr, idx); - cur_len += ms->len; - ++ms; - } - cur_len = 0; -#endif - /* memsegs are contiguous in memory */ - ms = rte_mem_virt2memseg(addr, msl); - - /* - * This memory is not guaranteed to be contiguous, but it still could - * be, or it could have some small contiguous chunks. Since the number - * of VFIO mappings is limited, and VFIO appears to not concatenate - * adjacent mappings, we have to do this ourselves. - * - * So, find contiguous chunks, then map them. - */ - va_start = ms->addr_64; - iova_start = iova_expected = ms->iova; - while (cur_len < len) { - bool new_contig_area = ms->iova != iova_expected; - bool last_seg = (len - cur_len) == ms->len; - bool skip_last = false; - - /* only do mappings when current contiguous area ends */ - if (new_contig_area) { - if (type == RTE_MEM_EVENT_ALLOC) - vfio_dma_mem_map(default_vfio_cfg, va_start, - iova_start, - iova_expected - iova_start, 1); - else - vfio_dma_mem_map(default_vfio_cfg, va_start, - iova_start, - iova_expected - iova_start, 0); - va_start = ms->addr_64; - iova_start = ms->iova; - } - /* some memory segments may have invalid IOVA */ - if (ms->iova == RTE_BAD_IOVA) { - RTE_LOG(DEBUG, EAL, "Memory segment at %p has bad IOVA, skipping\n", - ms->addr); - skip_last = true; - } - iova_expected = ms->iova + ms->len; - cur_len += ms->len; - ++ms; - - /* - * don't count previous segment, and don't attempt to - * dereference a potentially invalid pointer. - */ - if (skip_last && !last_seg) { - iova_expected = iova_start = ms->iova; - va_start = ms->addr_64; - } else if (!skip_last && last_seg) { - /* this is the last segment and we're not skipping */ - if (type == RTE_MEM_EVENT_ALLOC) - vfio_dma_mem_map(default_vfio_cfg, va_start, - iova_start, - iova_expected - iova_start, 1); - else - vfio_dma_mem_map(default_vfio_cfg, va_start, - iova_start, - iova_expected - iova_start, 0); - } - } -#ifdef RTE_ARCH_PPC_64 - cur_len = 0; - ms = rte_mem_virt2memseg(addr, msl); - while (cur_len < len) { - int idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); - - rte_fbarray_set_used(&msl->memseg_arr, idx); - cur_len += ms->len; - ++ms; - } -#endif -} - -static int -vfio_sync_default_container(void) -{ - struct rte_mp_msg mp_req, *mp_rep; - struct rte_mp_reply mp_reply = {0}; - struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; - struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; - int iommu_type_id; - unsigned int i; - - /* cannot be called from primary */ - if (rte_eal_process_type() != RTE_PROC_SECONDARY) - return -1; - - /* default container fd should have been opened in rte_vfio_enable() */ - if (!default_vfio_cfg->vfio_enabled || - default_vfio_cfg->vfio_container_fd < 0) { - RTE_LOG(ERR, EAL, "VFIO support is not initialized\n"); - return -1; - } - - /* find default container's IOMMU type */ - p->req = SOCKET_REQ_IOMMU_TYPE; - strcpy(mp_req.name, EAL_VFIO_MP); - mp_req.len_param = sizeof(*p); - mp_req.num_fds = 0; - - iommu_type_id = -1; - if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && - mp_reply.nb_received == 1) { - mp_rep = &mp_reply.msgs[0]; - p = (struct vfio_mp_param *)mp_rep->param; - if (p->result == SOCKET_OK) - iommu_type_id = p->iommu_type_id; - } - free(mp_reply.msgs); - if (iommu_type_id < 0) { - RTE_LOG(ERR, EAL, "Could not get IOMMU type for 
default container\n"); - return -1; - } - - /* we now have an fd for default container, as well as its IOMMU type. - * now, set up default VFIO container config to match. - */ - for (i = 0; i < RTE_DIM(iommu_types); i++) { - const struct vfio_iommu_type *t = &iommu_types[i]; - if (t->type_id != iommu_type_id) - continue; - - /* we found our IOMMU type */ - default_vfio_cfg->vfio_iommu_type = t; - - return 0; - } - RTE_LOG(ERR, EAL, "Could not find IOMMU type id (%i)\n", - iommu_type_id); - return -1; -} - -int -rte_vfio_clear_group(int vfio_group_fd) -{ - int i; - struct vfio_config *vfio_cfg; - - vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd); - if (vfio_cfg == NULL) { - RTE_LOG(ERR, EAL, " invalid group fd!\n"); - return -1; - } - - i = get_vfio_group_idx(vfio_group_fd); - if (i < 0) - return -1; - vfio_cfg->vfio_groups[i].group_num = -1; - vfio_cfg->vfio_groups[i].fd = -1; - vfio_cfg->vfio_groups[i].devices = 0; - vfio_cfg->vfio_active_groups--; - - return 0; -} - -int -rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr, - int *vfio_dev_fd, struct vfio_device_info *device_info) -{ - struct vfio_group_status group_status = { - .argsz = sizeof(group_status) - }; - struct vfio_config *vfio_cfg; - struct user_mem_maps *user_mem_maps; - int vfio_container_fd; - int vfio_group_fd; - int iommu_group_num; - int i, ret; - - /* get group number */ - ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num); - if (ret == 0) { - RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n", - dev_addr); - return 1; - } - - /* if negative, something failed */ - if (ret < 0) - return -1; - - /* get the actual group fd */ - vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num); - if (vfio_group_fd < 0) - return -1; - - /* if group_fd == 0, that means the device isn't managed by VFIO */ - if (vfio_group_fd == 0) { - RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n", - dev_addr); - return 1; - } - - /* - * at this point, we know that this group is viable (meaning, all devices - * are either bound to VFIO or not bound to anything) - */ - - /* check if the group is viable */ - ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status); - if (ret) { - RTE_LOG(ERR, EAL, " %s cannot get group status, " - "error %i (%s)\n", dev_addr, errno, strerror(errno)); - close(vfio_group_fd); - rte_vfio_clear_group(vfio_group_fd); - return -1; - } else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) { - RTE_LOG(ERR, EAL, " %s VFIO group is not viable! " - "Not all devices in IOMMU group bound to VFIO or unbound\n", - dev_addr); - close(vfio_group_fd); - rte_vfio_clear_group(vfio_group_fd); - return -1; - } - - /* get the vfio_config it belongs to */ - vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num); - vfio_cfg = vfio_cfg ? 
vfio_cfg : default_vfio_cfg; - vfio_container_fd = vfio_cfg->vfio_container_fd; - user_mem_maps = &vfio_cfg->mem_maps; - - /* check if group does not have a container yet */ - if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) { - - /* add group to a container */ - ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER, - &vfio_container_fd); - if (ret) { - RTE_LOG(ERR, EAL, " %s cannot add VFIO group to container, " - "error %i (%s)\n", dev_addr, errno, strerror(errno)); - close(vfio_group_fd); - rte_vfio_clear_group(vfio_group_fd); - return -1; - } - - /* - * pick an IOMMU type and set up DMA mappings for container - * - * needs to be done only once, only when first group is - * assigned to a container and only in primary process. - * Note this can happen several times with the hotplug - * functionality. - */ - if (internal_config.process_type == RTE_PROC_PRIMARY && - vfio_cfg->vfio_active_groups == 1 && - vfio_group_device_count(vfio_group_fd) == 0) { - const struct vfio_iommu_type *t; - - /* select an IOMMU type which we will be using */ - t = vfio_set_iommu_type(vfio_container_fd); - if (!t) { - RTE_LOG(ERR, EAL, - " %s failed to select IOMMU type\n", - dev_addr); - close(vfio_group_fd); - rte_vfio_clear_group(vfio_group_fd); - return -1; - } - /* lock memory hotplug before mapping and release it - * after registering callback, to prevent races - */ - rte_mcfg_mem_read_lock(); - if (vfio_cfg == default_vfio_cfg) - ret = t->dma_map_func(vfio_container_fd); - else - ret = 0; - if (ret) { - RTE_LOG(ERR, EAL, - " %s DMA remapping failed, error %i (%s)\n", - dev_addr, errno, strerror(errno)); - close(vfio_group_fd); - rte_vfio_clear_group(vfio_group_fd); - rte_mcfg_mem_read_unlock(); - return -1; - } - - vfio_cfg->vfio_iommu_type = t; - - /* re-map all user-mapped segments */ - rte_spinlock_recursive_lock(&user_mem_maps->lock); - - /* this IOMMU type may not support DMA mapping, but - * if we have mappings in the list - that means we have - * previously mapped something successfully, so we can - * be sure that DMA mapping is supported. - */ - for (i = 0; i < user_mem_maps->n_maps; i++) { - struct user_mem_map *map; - map = &user_mem_maps->maps[i]; - - ret = t->dma_user_map_func( - vfio_container_fd, - map->addr, map->iova, map->len, - 1); - if (ret) { - RTE_LOG(ERR, EAL, "Couldn't map user memory for DMA: " - "va: 0x%" PRIx64 " " - "iova: 0x%" PRIx64 " " - "len: 0x%" PRIu64 "\n", - map->addr, map->iova, - map->len); - rte_spinlock_recursive_unlock( - &user_mem_maps->lock); - rte_mcfg_mem_read_unlock(); - return -1; - } - } - rte_spinlock_recursive_unlock(&user_mem_maps->lock); - - /* register callback for mem events */ - if (vfio_cfg == default_vfio_cfg) - ret = rte_mem_event_callback_register( - VFIO_MEM_EVENT_CLB_NAME, - vfio_mem_event_callback, NULL); - else - ret = 0; - /* unlock memory hotplug */ - rte_mcfg_mem_read_unlock(); - - if (ret && rte_errno != ENOTSUP) { - RTE_LOG(ERR, EAL, "Could not install memory event callback for VFIO\n"); - return -1; - } - if (ret) - RTE_LOG(DEBUG, EAL, "Memory event callbacks not supported\n"); - else - RTE_LOG(DEBUG, EAL, "Installed memory event callback for VFIO\n"); - } - } else if (rte_eal_process_type() != RTE_PROC_PRIMARY && - vfio_cfg == default_vfio_cfg && - vfio_cfg->vfio_iommu_type == NULL) { - /* if we're not a primary process, we do not set up the VFIO - * container because it's already been set up by the primary - * process. 
instead, we simply ask the primary about VFIO type - * we are using, and set the VFIO config up appropriately. - */ - ret = vfio_sync_default_container(); - if (ret < 0) { - RTE_LOG(ERR, EAL, "Could not sync default VFIO container\n"); - close(vfio_group_fd); - rte_vfio_clear_group(vfio_group_fd); - return -1; - } - /* we have successfully initialized VFIO, notify user */ - const struct vfio_iommu_type *t = - default_vfio_cfg->vfio_iommu_type; - RTE_LOG(NOTICE, EAL, " using IOMMU type %d (%s)\n", - t->type_id, t->name); - } - - /* get a file descriptor for the device */ - *vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr); - if (*vfio_dev_fd < 0) { - /* if we cannot get a device fd, this implies a problem with - * the VFIO group or the container not having IOMMU configured. - */ - - RTE_LOG(WARNING, EAL, "Getting a vfio_dev_fd for %s failed\n", - dev_addr); - close(vfio_group_fd); - rte_vfio_clear_group(vfio_group_fd); - return -1; - } - - /* test and setup the device */ - ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info); - if (ret) { - RTE_LOG(ERR, EAL, " %s cannot get device info, " - "error %i (%s)\n", dev_addr, errno, - strerror(errno)); - close(*vfio_dev_fd); - close(vfio_group_fd); - rte_vfio_clear_group(vfio_group_fd); - return -1; - } - vfio_group_device_get(vfio_group_fd); - - return 0; -} - -int -rte_vfio_release_device(const char *sysfs_base, const char *dev_addr, - int vfio_dev_fd) -{ - struct vfio_group_status group_status = { - .argsz = sizeof(group_status) - }; - struct vfio_config *vfio_cfg; - int vfio_group_fd; - int iommu_group_num; - int ret; - - /* we don't want any DMA mapping messages to come while we're detaching - * VFIO device, because this might be the last device and we might need - * to unregister the callback. - */ - rte_mcfg_mem_read_lock(); - - /* get group number */ - ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num); - if (ret <= 0) { - RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver\n", - dev_addr); - /* This is an error at this point. */ - ret = -1; - goto out; - } - - /* get the actual group fd */ - vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num); - if (vfio_group_fd <= 0) { - RTE_LOG(INFO, EAL, "rte_vfio_get_group_fd failed for %s\n", - dev_addr); - ret = -1; - goto out; - } - - /* get the vfio_config it belongs to */ - vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num); - vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg; - - /* At this point we got an active group. Closing it will make the - * container detachment. If this is the last active group, VFIO kernel - * code will unset the container and the IOMMU mappings. - */ - - /* Closing a device */ - if (close(vfio_dev_fd) < 0) { - RTE_LOG(INFO, EAL, "Error when closing vfio_dev_fd for %s\n", - dev_addr); - ret = -1; - goto out; - } - - /* An VFIO group can have several devices attached. Just when there is - * no devices remaining should the group be closed. - */ - vfio_group_device_put(vfio_group_fd); - if (!vfio_group_device_count(vfio_group_fd)) { - - if (close(vfio_group_fd) < 0) { - RTE_LOG(INFO, EAL, "Error when closing vfio_group_fd for %s\n", - dev_addr); - ret = -1; - goto out; - } - - if (rte_vfio_clear_group(vfio_group_fd) < 0) { - RTE_LOG(INFO, EAL, "Error when clearing group for %s\n", - dev_addr); - ret = -1; - goto out; - } - } - - /* if there are no active device groups, unregister the callback to - * avoid spurious attempts to map/unmap memory from VFIO. 
- */ - if (vfio_cfg == default_vfio_cfg && vfio_cfg->vfio_active_groups == 0 && - rte_eal_process_type() != RTE_PROC_SECONDARY) - rte_mem_event_callback_unregister(VFIO_MEM_EVENT_CLB_NAME, - NULL); - - /* success */ - ret = 0; - -out: - rte_mcfg_mem_read_unlock(); - return ret; -} - -int -rte_vfio_enable(const char *modname) -{ - /* initialize group list */ - int i, j; - int vfio_available; - - rte_spinlock_recursive_t lock = RTE_SPINLOCK_RECURSIVE_INITIALIZER; - - for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { - vfio_cfgs[i].vfio_container_fd = -1; - vfio_cfgs[i].vfio_active_groups = 0; - vfio_cfgs[i].vfio_iommu_type = NULL; - vfio_cfgs[i].mem_maps.lock = lock; - - for (j = 0; j < VFIO_MAX_GROUPS; j++) { - vfio_cfgs[i].vfio_groups[j].fd = -1; - vfio_cfgs[i].vfio_groups[j].group_num = -1; - vfio_cfgs[i].vfio_groups[j].devices = 0; - } - } - - /* inform the user that we are probing for VFIO */ - RTE_LOG(INFO, EAL, "Probing VFIO support...\n"); - - /* check if vfio module is loaded */ - vfio_available = rte_eal_check_module(modname); - - /* return error directly */ - if (vfio_available == -1) { - RTE_LOG(INFO, EAL, "Could not get loaded module details!\n"); - return -1; - } - - /* return 0 if VFIO modules not loaded */ - if (vfio_available == 0) { - RTE_LOG(DEBUG, EAL, "VFIO modules not loaded, " - "skipping VFIO support...\n"); - return 0; - } - - if (internal_config.process_type == RTE_PROC_PRIMARY) { - /* open a new container */ - default_vfio_cfg->vfio_container_fd = - rte_vfio_get_container_fd(); - } else { - /* get the default container from the primary process */ - default_vfio_cfg->vfio_container_fd = - vfio_get_default_container_fd(); - } - - /* check if we have VFIO driver enabled */ - if (default_vfio_cfg->vfio_container_fd != -1) { - RTE_LOG(NOTICE, EAL, "VFIO support initialized\n"); - default_vfio_cfg->vfio_enabled = 1; - } else { - RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n"); - } - - return 0; -} - -int -rte_vfio_is_enabled(const char *modname) -{ - const int mod_available = rte_eal_check_module(modname) > 0; - return default_vfio_cfg->vfio_enabled && mod_available; -} - -int -vfio_get_default_container_fd(void) -{ - struct rte_mp_msg mp_req, *mp_rep; - struct rte_mp_reply mp_reply = {0}; - struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; - struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; - - if (default_vfio_cfg->vfio_enabled) - return default_vfio_cfg->vfio_container_fd; - - if (internal_config.process_type == RTE_PROC_PRIMARY) { - /* if we were secondary process we would try requesting - * container fd from the primary, but we're the primary - * process so just exit here - */ - return -1; - } - - p->req = SOCKET_REQ_DEFAULT_CONTAINER; - strcpy(mp_req.name, EAL_VFIO_MP); - mp_req.len_param = sizeof(*p); - mp_req.num_fds = 0; - - if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && - mp_reply.nb_received == 1) { - mp_rep = &mp_reply.msgs[0]; - p = (struct vfio_mp_param *)mp_rep->param; - if (p->result == SOCKET_OK && mp_rep->num_fds == 1) { - free(mp_reply.msgs); - return mp_rep->fds[0]; - } - } - - free(mp_reply.msgs); - RTE_LOG(ERR, EAL, " cannot request default container fd\n"); - return -1; -} - -int -vfio_get_iommu_type(void) -{ - if (default_vfio_cfg->vfio_iommu_type == NULL) - return -1; - - return default_vfio_cfg->vfio_iommu_type->type_id; -} - -const struct vfio_iommu_type * -vfio_set_iommu_type(int vfio_container_fd) -{ - unsigned idx; - for (idx = 0; idx < RTE_DIM(iommu_types); idx++) { - const struct vfio_iommu_type *t = 
&iommu_types[idx]; - - int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU, - t->type_id); - if (!ret) { - RTE_LOG(NOTICE, EAL, " using IOMMU type %d (%s)\n", - t->type_id, t->name); - return t; - } - /* not an error, there may be more supported IOMMU types */ - RTE_LOG(DEBUG, EAL, " set IOMMU type %d (%s) failed, " - "error %i (%s)\n", t->type_id, t->name, errno, - strerror(errno)); - } - /* if we didn't find a suitable IOMMU type, fail */ - return NULL; -} - -int -vfio_has_supported_extensions(int vfio_container_fd) -{ - int ret; - unsigned idx, n_extensions = 0; - for (idx = 0; idx < RTE_DIM(iommu_types); idx++) { - const struct vfio_iommu_type *t = &iommu_types[idx]; - - ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION, - t->type_id); - if (ret < 0) { - RTE_LOG(ERR, EAL, " could not get IOMMU type, " - "error %i (%s)\n", errno, - strerror(errno)); - close(vfio_container_fd); - return -1; - } else if (ret == 1) { - /* we found a supported extension */ - n_extensions++; - } - RTE_LOG(DEBUG, EAL, " IOMMU type %d (%s) is %s\n", - t->type_id, t->name, - ret ? "supported" : "not supported"); - } - - /* if we didn't find any supported IOMMU types, fail */ - if (!n_extensions) { - close(vfio_container_fd); - return -1; - } - - return 0; -} - -int -rte_vfio_get_container_fd(void) -{ - int ret, vfio_container_fd; - struct rte_mp_msg mp_req, *mp_rep; - struct rte_mp_reply mp_reply = {0}; - struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; - struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; - - - /* if we're in a primary process, try to open the container */ - if (internal_config.process_type == RTE_PROC_PRIMARY) { - vfio_container_fd = open(VFIO_CONTAINER_PATH, O_RDWR); - if (vfio_container_fd < 0) { - RTE_LOG(ERR, EAL, " cannot open VFIO container, " - "error %i (%s)\n", errno, strerror(errno)); - return -1; - } - - /* check VFIO API version */ - ret = ioctl(vfio_container_fd, VFIO_GET_API_VERSION); - if (ret != VFIO_API_VERSION) { - if (ret < 0) - RTE_LOG(ERR, EAL, " could not get VFIO API version, " - "error %i (%s)\n", errno, strerror(errno)); - else - RTE_LOG(ERR, EAL, " unsupported VFIO API version!\n"); - close(vfio_container_fd); - return -1; - } - - ret = vfio_has_supported_extensions(vfio_container_fd); - if (ret) { - RTE_LOG(ERR, EAL, " no supported IOMMU " - "extensions found!\n"); - return -1; - } - - return vfio_container_fd; - } - /* - * if we're in a secondary process, request container fd from the - * primary process via mp channel - */ - p->req = SOCKET_REQ_CONTAINER; - strcpy(mp_req.name, EAL_VFIO_MP); - mp_req.len_param = sizeof(*p); - mp_req.num_fds = 0; - - vfio_container_fd = -1; - if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && - mp_reply.nb_received == 1) { - mp_rep = &mp_reply.msgs[0]; - p = (struct vfio_mp_param *)mp_rep->param; - if (p->result == SOCKET_OK && mp_rep->num_fds == 1) { - vfio_container_fd = mp_rep->fds[0]; - free(mp_reply.msgs); - return vfio_container_fd; - } - } - - free(mp_reply.msgs); - RTE_LOG(ERR, EAL, " cannot request container fd\n"); - return -1; -} - -int -rte_vfio_get_group_num(const char *sysfs_base, - const char *dev_addr, int *iommu_group_num) -{ - char linkname[PATH_MAX]; - char filename[PATH_MAX]; - char *tok[16], *group_tok, *end; - int ret; - - memset(linkname, 0, sizeof(linkname)); - memset(filename, 0, sizeof(filename)); - - /* try to find out IOMMU group for this device */ - snprintf(linkname, sizeof(linkname), - "%s/%s/iommu_group", sysfs_base, dev_addr); - - ret = readlink(linkname, filename, 
sizeof(filename)); - - /* if the link doesn't exist, no VFIO for us */ - if (ret < 0) - return 0; - - ret = rte_strsplit(filename, sizeof(filename), - tok, RTE_DIM(tok), '/'); - - if (ret <= 0) { - RTE_LOG(ERR, EAL, " %s cannot get IOMMU group\n", dev_addr); - return -1; - } - - /* IOMMU group is always the last token */ - errno = 0; - group_tok = tok[ret - 1]; - end = group_tok; - *iommu_group_num = strtol(group_tok, &end, 10); - if ((end != group_tok && *end != '\0') || errno != 0) { - RTE_LOG(ERR, EAL, " %s error parsing IOMMU number!\n", dev_addr); - return -1; - } - - return 1; -} - -static int -type1_map_contig(const struct rte_memseg_list *msl, const struct rte_memseg *ms, - size_t len, void *arg) -{ - int *vfio_container_fd = arg; - - if (msl->external) - return 0; - - return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova, - len, 1); -} - -static int -type1_map(const struct rte_memseg_list *msl, const struct rte_memseg *ms, - void *arg) -{ - int *vfio_container_fd = arg; - - /* skip external memory that isn't a heap */ - if (msl->external && !msl->heap) - return 0; - - /* skip any segments with invalid IOVA addresses */ - if (ms->iova == RTE_BAD_IOVA) - return 0; - - /* if IOVA mode is VA, we've already mapped the internal segments */ - if (!msl->external && rte_eal_iova_mode() == RTE_IOVA_VA) - return 0; - - return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova, - ms->len, 1); -} - -static int -vfio_type1_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, - uint64_t len, int do_map) -{ - struct vfio_iommu_type1_dma_map dma_map; - struct vfio_iommu_type1_dma_unmap dma_unmap; - int ret; - - if (do_map != 0) { - memset(&dma_map, 0, sizeof(dma_map)); - dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); - dma_map.vaddr = vaddr; - dma_map.size = len; - dma_map.iova = iova; - dma_map.flags = VFIO_DMA_MAP_FLAG_READ | - VFIO_DMA_MAP_FLAG_WRITE; - - ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); - if (ret) { - /** - * In case the mapping was already done EEXIST will be - * returned from kernel. - */ - if (errno == EEXIST) { - RTE_LOG(DEBUG, EAL, - " Memory segment is already mapped," - " skipping"); - } else { - RTE_LOG(ERR, EAL, - " cannot set up DMA remapping," - " error %i (%s)\n", - errno, strerror(errno)); - return -1; - } - } - } else { - memset(&dma_unmap, 0, sizeof(dma_unmap)); - dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap); - dma_unmap.size = len; - dma_unmap.iova = iova; - - ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA, - &dma_unmap); - if (ret) { - RTE_LOG(ERR, EAL, " cannot clear DMA remapping, error %i (%s)\n", - errno, strerror(errno)); - return -1; - } - } - - return 0; -} - -static int -vfio_type1_dma_map(int vfio_container_fd) -{ - if (rte_eal_iova_mode() == RTE_IOVA_VA) { - /* with IOVA as VA mode, we can get away with mapping contiguous - * chunks rather than going page-by-page. - */ - int ret = rte_memseg_contig_walk(type1_map_contig, - &vfio_container_fd); - if (ret) - return ret; - /* we have to continue the walk because we've skipped the - * external segments during the config walk. 
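vfio_type1_dma_mem_map() above is a thin wrapper around the type1 map/unmap ioctls.
Reduced to the map case only, the kernel interface looks roughly like this
(illustrative sketch; the EAL additionally treats EEXIST as "already mapped"):

#include <string.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int type1_map_one(int container_fd, uint64_t vaddr, uint64_t iova,
		uint64_t len)
{
	struct vfio_iommu_type1_dma_map dma_map;

	memset(&dma_map, 0, sizeof(dma_map));
	dma_map.argsz = sizeof(dma_map);
	dma_map.vaddr = vaddr;
	dma_map.iova = iova;
	dma_map.size = len;
	dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;

	return ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
}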
- */ - } - return rte_memseg_walk(type1_map, &vfio_container_fd); -} - -static int -vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, - uint64_t len, int do_map) -{ - struct vfio_iommu_type1_dma_map dma_map; - struct vfio_iommu_type1_dma_unmap dma_unmap; - int ret; - struct vfio_iommu_spapr_register_memory reg = { - .argsz = sizeof(reg), - .flags = 0 - }; - reg.vaddr = (uintptr_t) vaddr; - reg.size = len; - - if (do_map != 0) { - ret = ioctl(vfio_container_fd, - VFIO_IOMMU_SPAPR_REGISTER_MEMORY, ®); - if (ret) { - RTE_LOG(ERR, EAL, " cannot register vaddr for IOMMU, " - "error %i (%s)\n", errno, strerror(errno)); - return -1; - } - - memset(&dma_map, 0, sizeof(dma_map)); - dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); - dma_map.vaddr = vaddr; - dma_map.size = len; - dma_map.iova = iova; - dma_map.flags = VFIO_DMA_MAP_FLAG_READ | - VFIO_DMA_MAP_FLAG_WRITE; - - ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); - if (ret) { - /** - * In case the mapping was already done EBUSY will be - * returned from kernel. - */ - if (errno == EBUSY) { - RTE_LOG(DEBUG, EAL, - " Memory segment is already mapped," - " skipping"); - } else { - RTE_LOG(ERR, EAL, - " cannot set up DMA remapping," - " error %i (%s)\n", errno, - strerror(errno)); - return -1; - } - } - - } else { - memset(&dma_unmap, 0, sizeof(dma_unmap)); - dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap); - dma_unmap.size = len; - dma_unmap.iova = iova; - - ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA, - &dma_unmap); - if (ret) { - RTE_LOG(ERR, EAL, " cannot clear DMA remapping, error %i (%s)\n", - errno, strerror(errno)); - return -1; - } - - ret = ioctl(vfio_container_fd, - VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, ®); - if (ret) { - RTE_LOG(ERR, EAL, " cannot unregister vaddr for IOMMU, error %i (%s)\n", - errno, strerror(errno)); - return -1; - } - } - - return 0; -} - -static int -vfio_spapr_map_walk(const struct rte_memseg_list *msl, - const struct rte_memseg *ms, void *arg) -{ - int *vfio_container_fd = arg; - - /* skip external memory that isn't a heap */ - if (msl->external && !msl->heap) - return 0; - - /* skip any segments with invalid IOVA addresses */ - if (ms->iova == RTE_BAD_IOVA) - return 0; - - return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova, - ms->len, 1); -} - -static int -vfio_spapr_unmap_walk(const struct rte_memseg_list *msl, - const struct rte_memseg *ms, void *arg) -{ - int *vfio_container_fd = arg; - - /* skip external memory that isn't a heap */ - if (msl->external && !msl->heap) - return 0; - - /* skip any segments with invalid IOVA addresses */ - if (ms->iova == RTE_BAD_IOVA) - return 0; - - return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova, - ms->len, 0); -} - -struct spapr_walk_param { - uint64_t window_size; - uint64_t hugepage_sz; -}; - -static int -vfio_spapr_window_size_walk(const struct rte_memseg_list *msl, - const struct rte_memseg *ms, void *arg) -{ - struct spapr_walk_param *param = arg; - uint64_t max = ms->iova + ms->len; - - /* skip external memory that isn't a heap */ - if (msl->external && !msl->heap) - return 0; - - /* skip any segments with invalid IOVA addresses */ - if (ms->iova == RTE_BAD_IOVA) - return 0; - - if (max > param->window_size) { - param->hugepage_sz = ms->hugepage_sz; - param->window_size = max; - } - - return 0; -} - -static int -vfio_spapr_create_new_dma_window(int vfio_container_fd, - struct vfio_iommu_spapr_tce_create *create) { - struct vfio_iommu_spapr_tce_remove remove = { - 
.argsz = sizeof(remove), - }; - struct vfio_iommu_spapr_tce_info info = { - .argsz = sizeof(info), - }; - int ret; - - /* query spapr iommu info */ - ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info); - if (ret) { - RTE_LOG(ERR, EAL, " cannot get iommu info, " - "error %i (%s)\n", errno, strerror(errno)); - return -1; - } - - /* remove default DMA of 32 bit window */ - remove.start_addr = info.dma32_window_start; - ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); - if (ret) { - RTE_LOG(ERR, EAL, " cannot remove default DMA window, " - "error %i (%s)\n", errno, strerror(errno)); - return -1; - } - - /* create new DMA window */ - ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, create); - if (ret) { -#ifdef VFIO_IOMMU_SPAPR_INFO_DDW - /* try possible page_shift and levels for workaround */ - uint32_t levels; - - for (levels = create->levels + 1; - ret && levels <= info.ddw.levels; levels++) { - create->levels = levels; - ret = ioctl(vfio_container_fd, - VFIO_IOMMU_SPAPR_TCE_CREATE, create); - } -#endif - if (ret) { - RTE_LOG(ERR, EAL, " cannot create new DMA window, " - "error %i (%s)\n", errno, strerror(errno)); - return -1; - } - } - - if (create->start_addr != 0) { - RTE_LOG(ERR, EAL, " DMA window start address != 0\n"); - return -1; - } - - return 0; -} - -static int -vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, - uint64_t len, int do_map) -{ - struct spapr_walk_param param; - struct vfio_iommu_spapr_tce_create create = { - .argsz = sizeof(create), - }; - struct vfio_config *vfio_cfg; - struct user_mem_maps *user_mem_maps; - int i, ret = 0; - - vfio_cfg = get_vfio_cfg_by_container_fd(vfio_container_fd); - if (vfio_cfg == NULL) { - RTE_LOG(ERR, EAL, " invalid container fd!\n"); - return -1; - } - - user_mem_maps = &vfio_cfg->mem_maps; - rte_spinlock_recursive_lock(&user_mem_maps->lock); - - /* check if window size needs to be adjusted */ - memset(¶m, 0, sizeof(param)); - - /* we're inside a callback so use thread-unsafe version */ - if (rte_memseg_walk_thread_unsafe(vfio_spapr_window_size_walk, - ¶m) < 0) { - RTE_LOG(ERR, EAL, "Could not get window size\n"); - ret = -1; - goto out; - } - - /* also check user maps */ - for (i = 0; i < user_mem_maps->n_maps; i++) { - uint64_t max = user_mem_maps->maps[i].iova + - user_mem_maps->maps[i].len; - param.window_size = RTE_MAX(param.window_size, max); - } - - /* sPAPR requires window size to be a power of 2 */ - create.window_size = rte_align64pow2(param.window_size); - create.page_shift = __builtin_ctzll(param.hugepage_sz); - create.levels = 1; - - if (do_map) { - /* re-create window and remap the entire memory */ - if (iova + len > create.window_size) { - /* release all maps before recreating the window */ - if (rte_memseg_walk_thread_unsafe(vfio_spapr_unmap_walk, - &vfio_container_fd) < 0) { - RTE_LOG(ERR, EAL, "Could not release DMA maps\n"); - ret = -1; - goto out; - } - /* release all user maps */ - for (i = 0; i < user_mem_maps->n_maps; i++) { - struct user_mem_map *map = - &user_mem_maps->maps[i]; - if (vfio_spapr_dma_do_map(vfio_container_fd, - map->addr, map->iova, map->len, - 0)) { - RTE_LOG(ERR, EAL, "Could not release user DMA maps\n"); - ret = -1; - goto out; - } - } - create.window_size = rte_align64pow2(iova + len); - if (vfio_spapr_create_new_dma_window(vfio_container_fd, - &create) < 0) { - RTE_LOG(ERR, EAL, "Could not create new DMA window\n"); - ret = -1; - goto out; - } - /* we're inside a callback, so use thread-unsafe version - */ - if 
(rte_memseg_walk_thread_unsafe(vfio_spapr_map_walk, - &vfio_container_fd) < 0) { - RTE_LOG(ERR, EAL, "Could not recreate DMA maps\n"); - ret = -1; - goto out; - } - /* remap all user maps */ - for (i = 0; i < user_mem_maps->n_maps; i++) { - struct user_mem_map *map = - &user_mem_maps->maps[i]; - if (vfio_spapr_dma_do_map(vfio_container_fd, - map->addr, map->iova, map->len, - 1)) { - RTE_LOG(ERR, EAL, "Could not recreate user DMA maps\n"); - ret = -1; - goto out; - } - } - } - if (vfio_spapr_dma_do_map(vfio_container_fd, vaddr, iova, len, 1)) { - RTE_LOG(ERR, EAL, "Failed to map DMA\n"); - ret = -1; - goto out; - } - } else { - /* for unmap, check if iova within DMA window */ - if (iova > create.window_size) { - RTE_LOG(ERR, EAL, "iova beyond DMA window for unmap"); - ret = -1; - goto out; - } - - vfio_spapr_dma_do_map(vfio_container_fd, vaddr, iova, len, 0); - } -out: - rte_spinlock_recursive_unlock(&user_mem_maps->lock); - return ret; -} - -static int -vfio_spapr_dma_map(int vfio_container_fd) -{ - struct vfio_iommu_spapr_tce_create create = { - .argsz = sizeof(create), - }; - struct spapr_walk_param param; - - memset(¶m, 0, sizeof(param)); - - /* create DMA window from 0 to max(phys_addr + len) */ - rte_memseg_walk(vfio_spapr_window_size_walk, ¶m); - - /* sPAPR requires window size to be a power of 2 */ - create.window_size = rte_align64pow2(param.window_size); - create.page_shift = __builtin_ctzll(param.hugepage_sz); - create.levels = 1; - - if (vfio_spapr_create_new_dma_window(vfio_container_fd, &create) < 0) { - RTE_LOG(ERR, EAL, "Could not create new DMA window\n"); - return -1; - } - - /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */ - if (rte_memseg_walk(vfio_spapr_map_walk, &vfio_container_fd) < 0) - return -1; - - return 0; -} - -static int -vfio_noiommu_dma_map(int __rte_unused vfio_container_fd) -{ - /* No-IOMMU mode does not need DMA mapping */ - return 0; -} - -static int -vfio_noiommu_dma_mem_map(int __rte_unused vfio_container_fd, - uint64_t __rte_unused vaddr, - uint64_t __rte_unused iova, uint64_t __rte_unused len, - int __rte_unused do_map) -{ - /* No-IOMMU mode does not need DMA mapping */ - return 0; -} - -static int -vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova, - uint64_t len, int do_map) -{ - const struct vfio_iommu_type *t = vfio_cfg->vfio_iommu_type; - - if (!t) { - RTE_LOG(ERR, EAL, " VFIO support not initialized\n"); - rte_errno = ENODEV; - return -1; - } - - if (!t->dma_user_map_func) { - RTE_LOG(ERR, EAL, - " VFIO custom DMA region maping not supported by IOMMU %s\n", - t->name); - rte_errno = ENOTSUP; - return -1; - } - - return t->dma_user_map_func(vfio_cfg->vfio_container_fd, vaddr, iova, - len, do_map); -} - -static int -container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova, - uint64_t len) -{ - struct user_mem_map *new_map; - struct user_mem_maps *user_mem_maps; - int ret = 0; - - user_mem_maps = &vfio_cfg->mem_maps; - rte_spinlock_recursive_lock(&user_mem_maps->lock); - if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) { - RTE_LOG(ERR, EAL, "No more space for user mem maps\n"); - rte_errno = ENOMEM; - ret = -1; - goto out; - } - /* map the entry */ - if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 1)) { - /* technically, this will fail if there are currently no devices - * plugged in, even if a device were added later, this mapping - * might have succeeded. 
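From an application's perspective, the container_dma_map()/container_dma_unmap()
pair defined here is reached through the public rte_vfio_container_*() API. A
hedged usage sketch for mapping one externally allocated buffer (the IOMMU group
number, addresses and length are placeholders supplied by the caller):

#include <stdint.h>
#include <rte_vfio.h>

static int map_external_buf(void *va, uint64_t iova, uint64_t len,
		int iommu_group_num)
{
	int container_fd = rte_vfio_container_create();

	if (container_fd < 0)
		return -1;
	if (rte_vfio_container_group_bind(container_fd, iommu_group_num) < 0)
		return -1;
	/* records the range in user_mem_maps and programs the IOMMU */
	return rte_vfio_container_dma_map(container_fd,
			(uint64_t)(uintptr_t)va, iova, len);
}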
however, since we cannot verify if this - * is a valid mapping without having a device attached, consider - * this to be unsupported, because we can't just store any old - * mapping and pollute list of active mappings willy-nilly. - */ - RTE_LOG(ERR, EAL, "Couldn't map new region for DMA\n"); - ret = -1; - goto out; - } - /* create new user mem map entry */ - new_map = &user_mem_maps->maps[user_mem_maps->n_maps++]; - new_map->addr = vaddr; - new_map->iova = iova; - new_map->len = len; - - compact_user_maps(user_mem_maps); -out: - rte_spinlock_recursive_unlock(&user_mem_maps->lock); - return ret; -} - -static int -container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova, - uint64_t len) -{ - struct user_mem_map *map, *new_map = NULL; - struct user_mem_maps *user_mem_maps; - int ret = 0; - - user_mem_maps = &vfio_cfg->mem_maps; - rte_spinlock_recursive_lock(&user_mem_maps->lock); - - /* find our mapping */ - map = find_user_mem_map(user_mem_maps, vaddr, iova, len); - if (!map) { - RTE_LOG(ERR, EAL, "Couldn't find previously mapped region\n"); - rte_errno = EINVAL; - ret = -1; - goto out; - } - if (map->addr != vaddr || map->iova != iova || map->len != len) { - /* we're partially unmapping a previously mapped region, so we - * need to split entry into two. - */ - if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) { - RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n"); - rte_errno = ENOMEM; - ret = -1; - goto out; - } - new_map = &user_mem_maps->maps[user_mem_maps->n_maps++]; - } - - /* unmap the entry */ - if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 0)) { - /* there may not be any devices plugged in, so unmapping will - * fail with ENODEV/ENOTSUP rte_errno values, but that doesn't - * stop us from removing the mapping, as the assumption is we - * won't be needing this memory any more and thus will want to - * prevent it from being remapped again on hotplug. so, only - * fail if we indeed failed to unmap (e.g. if the mapping was - * within our mapped range but had invalid alignment). - */ - if (rte_errno != ENODEV && rte_errno != ENOTSUP) { - RTE_LOG(ERR, EAL, "Couldn't unmap region for DMA\n"); - ret = -1; - goto out; - } else { - RTE_LOG(DEBUG, EAL, "DMA unmapping failed, but removing mappings anyway\n"); - } - } - /* remove map from the list of active mappings */ - if (new_map != NULL) { - adjust_map(map, new_map, vaddr, len); - - /* if we've created a new map by splitting, sort everything */ - if (!is_null_map(new_map)) { - compact_user_maps(user_mem_maps); - } else { - /* we've created a new mapping, but it was unused */ - user_mem_maps->n_maps--; - } - } else { - memset(map, 0, sizeof(*map)); - compact_user_maps(user_mem_maps); - user_mem_maps->n_maps--; - } - -out: - rte_spinlock_recursive_unlock(&user_mem_maps->lock); - return ret; -} - -int -rte_vfio_noiommu_is_enabled(void) -{ - int fd; - ssize_t cnt; - char c; - - fd = open(VFIO_NOIOMMU_MODE, O_RDONLY); - if (fd < 0) { - if (errno != ENOENT) { - RTE_LOG(ERR, EAL, " cannot open vfio noiommu file %i (%s)\n", - errno, strerror(errno)); - return -1; - } - /* - * else the file does not exists - * i.e. 
noiommu is not enabled - */ - return 0; - } - - cnt = read(fd, &c, 1); - close(fd); - if (cnt != 1) { - RTE_LOG(ERR, EAL, " unable to read from vfio noiommu " - "file %i (%s)\n", errno, strerror(errno)); - return -1; - } - - return c == 'Y'; -} - -int -rte_vfio_container_create(void) -{ - int i; - - /* Find an empty slot to store new vfio config */ - for (i = 1; i < VFIO_MAX_CONTAINERS; i++) { - if (vfio_cfgs[i].vfio_container_fd == -1) - break; - } - - if (i == VFIO_MAX_CONTAINERS) { - RTE_LOG(ERR, EAL, "exceed max vfio container limit\n"); - return -1; - } - - vfio_cfgs[i].vfio_container_fd = rte_vfio_get_container_fd(); - if (vfio_cfgs[i].vfio_container_fd < 0) { - RTE_LOG(NOTICE, EAL, "fail to create a new container\n"); - return -1; - } - - return vfio_cfgs[i].vfio_container_fd; -} - -int -rte_vfio_container_destroy(int container_fd) -{ - struct vfio_config *vfio_cfg; - int i; - - vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); - if (vfio_cfg == NULL) { - RTE_LOG(ERR, EAL, "Invalid container fd\n"); - return -1; - } - - for (i = 0; i < VFIO_MAX_GROUPS; i++) - if (vfio_cfg->vfio_groups[i].group_num != -1) - rte_vfio_container_group_unbind(container_fd, - vfio_cfg->vfio_groups[i].group_num); - - close(container_fd); - vfio_cfg->vfio_container_fd = -1; - vfio_cfg->vfio_active_groups = 0; - vfio_cfg->vfio_iommu_type = NULL; - - return 0; -} - -int -rte_vfio_container_group_bind(int container_fd, int iommu_group_num) -{ - struct vfio_config *vfio_cfg; - - vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); - if (vfio_cfg == NULL) { - RTE_LOG(ERR, EAL, "Invalid container fd\n"); - return -1; - } - - return vfio_get_group_fd(vfio_cfg, iommu_group_num); -} - -int -rte_vfio_container_group_unbind(int container_fd, int iommu_group_num) -{ - struct vfio_config *vfio_cfg; - struct vfio_group *cur_grp = NULL; - int i; - - vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); - if (vfio_cfg == NULL) { - RTE_LOG(ERR, EAL, "Invalid container fd\n"); - return -1; - } - - for (i = 0; i < VFIO_MAX_GROUPS; i++) { - if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num) { - cur_grp = &vfio_cfg->vfio_groups[i]; - break; - } - } - - /* This should not happen */ - if (i == VFIO_MAX_GROUPS || cur_grp == NULL) { - RTE_LOG(ERR, EAL, "Specified group number not found\n"); - return -1; - } - - if (cur_grp->fd >= 0 && close(cur_grp->fd) < 0) { - RTE_LOG(ERR, EAL, "Error when closing vfio_group_fd for" - " iommu_group_num %d\n", iommu_group_num); - return -1; - } - cur_grp->group_num = -1; - cur_grp->fd = -1; - cur_grp->devices = 0; - vfio_cfg->vfio_active_groups--; - - return 0; -} - -int -rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova, - uint64_t len) -{ - struct vfio_config *vfio_cfg; - - if (len == 0) { - rte_errno = EINVAL; - return -1; - } - - vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); - if (vfio_cfg == NULL) { - RTE_LOG(ERR, EAL, "Invalid container fd\n"); - return -1; - } - - return container_dma_map(vfio_cfg, vaddr, iova, len); -} - -int -rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova, - uint64_t len) -{ - struct vfio_config *vfio_cfg; - - if (len == 0) { - rte_errno = EINVAL; - return -1; - } - - vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); - if (vfio_cfg == NULL) { - RTE_LOG(ERR, EAL, "Invalid container fd\n"); - return -1; - } - - return container_dma_unmap(vfio_cfg, vaddr, iova, len); -} - -#else - -int -rte_vfio_setup_device(__rte_unused const char *sysfs_base, - __rte_unused const char *dev_addr, - 
__rte_unused int *vfio_dev_fd, - __rte_unused struct vfio_device_info *device_info) -{ - return -1; -} - -int -rte_vfio_release_device(__rte_unused const char *sysfs_base, - __rte_unused const char *dev_addr, __rte_unused int fd) -{ - return -1; -} - -int -rte_vfio_enable(__rte_unused const char *modname) -{ - return -1; -} - -int -rte_vfio_is_enabled(__rte_unused const char *modname) -{ - return -1; -} - -int -rte_vfio_noiommu_is_enabled(void) -{ - return -1; -} - -int -rte_vfio_clear_group(__rte_unused int vfio_group_fd) -{ - return -1; -} - -int -rte_vfio_get_group_num(__rte_unused const char *sysfs_base, - __rte_unused const char *dev_addr, - __rte_unused int *iommu_group_num) -{ - return -1; -} - -int -rte_vfio_get_container_fd(void) -{ - return -1; -} - -int -rte_vfio_get_group_fd(__rte_unused int iommu_group_num) -{ - return -1; -} - -int -rte_vfio_container_create(void) -{ - return -1; -} - -int -rte_vfio_container_destroy(__rte_unused int container_fd) -{ - return -1; -} - -int -rte_vfio_container_group_bind(__rte_unused int container_fd, - __rte_unused int iommu_group_num) -{ - return -1; -} - -int -rte_vfio_container_group_unbind(__rte_unused int container_fd, - __rte_unused int iommu_group_num) -{ - return -1; -} - -int -rte_vfio_container_dma_map(__rte_unused int container_fd, - __rte_unused uint64_t vaddr, - __rte_unused uint64_t iova, - __rte_unused uint64_t len) -{ - return -1; -} - -int -rte_vfio_container_dma_unmap(__rte_unused int container_fd, - __rte_unused uint64_t vaddr, - __rte_unused uint64_t iova, - __rte_unused uint64_t len) -{ - return -1; -} - -#endif /* VFIO_PRESENT */ diff --git a/lib/librte_eal/linux/eal/eal_vfio.h b/lib/librte_eal/linux/eal/eal_vfio.h deleted file mode 100644 index cb2d35fb12..0000000000 --- a/lib/librte_eal/linux/eal/eal_vfio.h +++ /dev/null @@ -1,158 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2014 Intel Corporation - */ - -#ifndef EAL_VFIO_H_ -#define EAL_VFIO_H_ - -#include - -/* - * determine if VFIO is present on the system - */ -#if !defined(VFIO_PRESENT) && defined(RTE_EAL_VFIO) -#include -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0) -#define VFIO_PRESENT -#else -#pragma message("VFIO configured but not supported by this kernel, disabling.") -#endif /* kernel version >= 3.6.0 */ -#endif /* RTE_EAL_VFIO */ - -#ifdef VFIO_PRESENT - -#include -#include - -#define RTE_VFIO_TYPE1 VFIO_TYPE1_IOMMU - -#ifndef VFIO_SPAPR_TCE_v2_IOMMU -#define RTE_VFIO_SPAPR 7 -#define VFIO_IOMMU_SPAPR_REGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 17) -#define VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 18) -#define VFIO_IOMMU_SPAPR_TCE_CREATE _IO(VFIO_TYPE, VFIO_BASE + 19) -#define VFIO_IOMMU_SPAPR_TCE_REMOVE _IO(VFIO_TYPE, VFIO_BASE + 20) - -struct vfio_iommu_spapr_register_memory { - uint32_t argsz; - uint32_t flags; - uint64_t vaddr; - uint64_t size; -}; - -struct vfio_iommu_spapr_tce_create { - uint32_t argsz; - uint32_t flags; - /* in */ - uint32_t page_shift; - uint32_t __resv1; - uint64_t window_size; - uint32_t levels; - uint32_t __resv2; - /* out */ - uint64_t start_addr; -}; - -struct vfio_iommu_spapr_tce_remove { - uint32_t argsz; - uint32_t flags; - /* in */ - uint64_t start_addr; -}; - -struct vfio_iommu_spapr_tce_ddw_info { - uint64_t pgsizes; - uint32_t max_dynamic_windows_supported; - uint32_t levels; -}; - -/* SPAPR_v2 is not present, but SPAPR might be */ -#ifndef VFIO_SPAPR_TCE_IOMMU -#define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) - -struct vfio_iommu_spapr_tce_info { - 
uint32_t argsz; - uint32_t flags; - uint32_t dma32_window_start; - uint32_t dma32_window_size; - struct vfio_iommu_spapr_tce_ddw_info ddw; -}; -#endif /* VFIO_SPAPR_TCE_IOMMU */ - -#else /* VFIO_SPAPR_TCE_v2_IOMMU */ -#define RTE_VFIO_SPAPR VFIO_SPAPR_TCE_v2_IOMMU -#endif - -#define VFIO_MAX_GROUPS RTE_MAX_VFIO_GROUPS -#define VFIO_MAX_CONTAINERS RTE_MAX_VFIO_CONTAINERS - -/* - * we don't need to store device fd's anywhere since they can be obtained from - * the group fd via an ioctl() call. - */ -struct vfio_group { - int group_num; - int fd; - int devices; -}; - -/* DMA mapping function prototype. - * Takes VFIO container fd as a parameter. - * Returns 0 on success, -1 on error. - * */ -typedef int (*vfio_dma_func_t)(int); - -/* Custom memory region DMA mapping function prototype. - * Takes VFIO container fd, virtual address, phisical address, length and - * operation type (0 to unmap 1 for map) as a parameters. - * Returns 0 on success, -1 on error. - **/ -typedef int (*vfio_dma_user_func_t)(int fd, uint64_t vaddr, uint64_t iova, - uint64_t len, int do_map); - -struct vfio_iommu_type { - int type_id; - const char *name; - vfio_dma_user_func_t dma_user_map_func; - vfio_dma_func_t dma_map_func; -}; - -/* get the vfio container that devices are bound to by default */ -int vfio_get_default_container_fd(void); - -/* pick IOMMU type. returns a pointer to vfio_iommu_type or NULL for error */ -const struct vfio_iommu_type * -vfio_set_iommu_type(int vfio_container_fd); - -int -vfio_get_iommu_type(void); - -/* check if we have any supported extensions */ -int -vfio_has_supported_extensions(int vfio_container_fd); - -int vfio_mp_sync_setup(void); - -#define EAL_VFIO_MP "eal_vfio_mp_sync" - -#define SOCKET_REQ_CONTAINER 0x100 -#define SOCKET_REQ_GROUP 0x200 -#define SOCKET_REQ_DEFAULT_CONTAINER 0x400 -#define SOCKET_REQ_IOMMU_TYPE 0x800 -#define SOCKET_OK 0x0 -#define SOCKET_NO_FD 0x1 -#define SOCKET_ERR 0xFF - -struct vfio_mp_param { - int req; - int result; - RTE_STD_C11 - union { - int group_num; - int iommu_type_id; - }; -}; - -#endif /* VFIO_PRESENT */ - -#endif /* EAL_VFIO_H_ */ diff --git a/lib/librte_eal/linux/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linux/eal/eal_vfio_mp_sync.c deleted file mode 100644 index 5f2a5fc1d9..0000000000 --- a/lib/librte_eal/linux/eal/eal_vfio_mp_sync.c +++ /dev/null @@ -1,123 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2018 Intel Corporation - */ - -#include -#include - -#include -#include -#include -#include -#include - -#include "eal_vfio.h" - -/** - * @file - * VFIO socket for communication between primary and secondary processes. - * - * This file is only compiled if CONFIG_RTE_EAL_VFIO is set to "y". 
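The vfio_dma_func_t/vfio_dma_user_func_t typedefs and struct vfio_iommu_type in the
header above exist so that eal_vfio.c can keep one table of supported IOMMU types
and dispatch through function pointers. The sketch below shows the shape of such a
table with placeholder callbacks; the names are invented for illustration, not the
ones used by the EAL.

#include <stdint.h>
#include "eal_vfio.h"

static int example_map(int container_fd)
{
	(void)container_fd;
	return 0;	/* map all DPDK segments here */
}

static int example_user_map(int fd, uint64_t vaddr, uint64_t iova,
		uint64_t len, int do_map)
{
	(void)fd; (void)vaddr; (void)iova; (void)len; (void)do_map;
	return 0;	/* map or unmap one user-supplied range here */
}

static const struct vfio_iommu_type example_types[] = {
	{
		.type_id = RTE_VFIO_TYPE1,
		.name = "Type 1",
		.dma_map_func = example_map,
		.dma_user_map_func = example_user_map,
	},
};

static int example_setup(int container_fd)
{
	/* once a type is selected, everything goes through its callbacks */
	return example_types[0].dma_map_func(container_fd);
}

Selecting an entry then amounts to probing each type with VFIO_SET_IOMMU and
calling the winner's dma_map_func(), which is what vfio_set_iommu_type() shown
earlier does.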
- */ - -#ifdef VFIO_PRESENT - -static int -vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer) -{ - int fd = -1; - int ret; - struct rte_mp_msg reply; - struct vfio_mp_param *r = (struct vfio_mp_param *)reply.param; - const struct vfio_mp_param *m = - (const struct vfio_mp_param *)msg->param; - - if (msg->len_param != sizeof(*m)) { - RTE_LOG(ERR, EAL, "vfio received invalid message!\n"); - return -1; - } - - memset(&reply, 0, sizeof(reply)); - - switch (m->req) { - case SOCKET_REQ_GROUP: - r->req = SOCKET_REQ_GROUP; - r->group_num = m->group_num; - fd = rte_vfio_get_group_fd(m->group_num); - if (fd < 0) - r->result = SOCKET_ERR; - else if (fd == 0) - /* if VFIO group exists but isn't bound to VFIO driver */ - r->result = SOCKET_NO_FD; - else { - /* if group exists and is bound to VFIO driver */ - r->result = SOCKET_OK; - reply.num_fds = 1; - reply.fds[0] = fd; - } - break; - case SOCKET_REQ_CONTAINER: - r->req = SOCKET_REQ_CONTAINER; - fd = rte_vfio_get_container_fd(); - if (fd < 0) - r->result = SOCKET_ERR; - else { - r->result = SOCKET_OK; - reply.num_fds = 1; - reply.fds[0] = fd; - } - break; - case SOCKET_REQ_DEFAULT_CONTAINER: - r->req = SOCKET_REQ_DEFAULT_CONTAINER; - fd = vfio_get_default_container_fd(); - if (fd < 0) - r->result = SOCKET_ERR; - else { - r->result = SOCKET_OK; - reply.num_fds = 1; - reply.fds[0] = fd; - } - break; - case SOCKET_REQ_IOMMU_TYPE: - { - int iommu_type_id; - - r->req = SOCKET_REQ_IOMMU_TYPE; - - iommu_type_id = vfio_get_iommu_type(); - - if (iommu_type_id < 0) - r->result = SOCKET_ERR; - else { - r->iommu_type_id = iommu_type_id; - r->result = SOCKET_OK; - } - break; - } - default: - RTE_LOG(ERR, EAL, "vfio received invalid message!\n"); - return -1; - } - - strcpy(reply.name, EAL_VFIO_MP); - reply.len_param = sizeof(*r); - - ret = rte_mp_reply(&reply, peer); - if (m->req == SOCKET_REQ_CONTAINER && fd >= 0) - close(fd); - return ret; -} - -int -vfio_mp_sync_setup(void) -{ - if (rte_eal_process_type() == RTE_PROC_PRIMARY) { - int ret = rte_mp_action_register(EAL_VFIO_MP, vfio_mp_primary); - if (ret && rte_errno != ENOTSUP) - return -1; - } - - return 0; -} - -#endif diff --git a/lib/librte_eal/linux/eal/include/rte_kni_common.h b/lib/librte_eal/linux/eal/include/rte_kni_common.h deleted file mode 100644 index 7313ef504e..0000000000 --- a/lib/librte_eal/linux/eal/include/rte_kni_common.h +++ /dev/null @@ -1,137 +0,0 @@ -/* SPDX-License-Identifier: (BSD-3-Clause OR LGPL-2.1) */ -/* - * Copyright(c) 2007-2014 Intel Corporation. - */ - -#ifndef _RTE_KNI_COMMON_H_ -#define _RTE_KNI_COMMON_H_ - -#ifdef __KERNEL__ -#include -#include -#define RTE_STD_C11 -#else -#include -#include -#endif - -/* - * KNI name is part of memzone name. Must not exceed IFNAMSIZ. - */ -#define RTE_KNI_NAMESIZE 16 - -#define RTE_CACHE_LINE_MIN_SIZE 64 - -/* - * Request id. - */ -enum rte_kni_req_id { - RTE_KNI_REQ_UNKNOWN = 0, - RTE_KNI_REQ_CHANGE_MTU, - RTE_KNI_REQ_CFG_NETWORK_IF, - RTE_KNI_REQ_CHANGE_MAC_ADDR, - RTE_KNI_REQ_CHANGE_PROMISC, - RTE_KNI_REQ_CHANGE_ALLMULTI, - RTE_KNI_REQ_MAX, -}; - -/* - * Structure for KNI request. 
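vfio_mp_primary() above is the server half of the exchange; the requesting side in
a secondary process builds an rte_mp_msg on the same "eal_vfio_mp_sync" channel and
pulls the fd out of the reply. A hedged sketch of that client side, mirroring
vfio_get_default_container_fd() shown earlier:

#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <rte_eal.h>

#include "eal_vfio.h"

static int request_container_fd(void)
{
	struct rte_mp_msg req;
	struct rte_mp_reply reply;
	struct timespec ts = { .tv_sec = 5, .tv_nsec = 0 };
	struct vfio_mp_param *p = (struct vfio_mp_param *)req.param;
	int fd = -1;

	memset(&req, 0, sizeof(req));
	memset(&reply, 0, sizeof(reply));
	strcpy(req.name, EAL_VFIO_MP);
	req.len_param = sizeof(*p);
	p->req = SOCKET_REQ_CONTAINER;

	if (rte_mp_request_sync(&req, &reply, &ts) == 0 &&
			reply.nb_received == 1) {
		const struct vfio_mp_param *r =
			(const struct vfio_mp_param *)reply.msgs[0].param;

		if (r->result == SOCKET_OK && reply.msgs[0].num_fds == 1)
			fd = reply.msgs[0].fds[0];
	}
	free(reply.msgs);
	return fd;
}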
- */ -struct rte_kni_request { - uint32_t req_id; /**< Request id */ - RTE_STD_C11 - union { - uint32_t new_mtu; /**< New MTU */ - uint8_t if_up; /**< 1: interface up, 0: interface down */ - uint8_t mac_addr[6]; /**< MAC address for interface */ - uint8_t promiscusity;/**< 1: promisc mode enable, 0: disable */ - uint8_t allmulti; /**< 1: all-multicast mode enable, 0: disable */ - }; - int32_t result; /**< Result for processing request */ -} __attribute__((__packed__)); - -/* - * Fifo struct mapped in a shared memory. It describes a circular buffer FIFO - * Write and read should wrap around. Fifo is empty when write == read - * Writing should never overwrite the read position - */ -struct rte_kni_fifo { -#ifdef RTE_USE_C11_MEM_MODEL - unsigned write; /**< Next position to be written*/ - unsigned read; /**< Next position to be read */ -#else - volatile unsigned write; /**< Next position to be written*/ - volatile unsigned read; /**< Next position to be read */ -#endif - unsigned len; /**< Circular buffer length */ - unsigned elem_size; /**< Pointer size - for 32/64 bit OS */ - void *volatile buffer[]; /**< The buffer contains mbuf pointers */ -}; - -/* - * The kernel image of the rte_mbuf struct, with only the relevant fields. - * Padding is necessary to assure the offsets of these fields - */ -struct rte_kni_mbuf { - void *buf_addr __attribute__((__aligned__(RTE_CACHE_LINE_SIZE))); - uint64_t buf_physaddr; - uint16_t data_off; /**< Start address of data in segment buffer. */ - char pad1[2]; - uint16_t nb_segs; /**< Number of segments. */ - char pad4[2]; - uint64_t ol_flags; /**< Offload features. */ - char pad2[4]; - uint32_t pkt_len; /**< Total pkt len: sum of all segment data_len. */ - uint16_t data_len; /**< Amount of data in segment buffer. */ - - /* fields on second cache line */ - char pad3[8] __attribute__((__aligned__(RTE_CACHE_LINE_MIN_SIZE))); - void *pool; - void *next; /**< Physical address of next mbuf in kernel. */ -}; - -/* - * Struct used to create a KNI device. Passed to the kernel in IOCTL call - */ - -struct rte_kni_device_info { - char name[RTE_KNI_NAMESIZE]; /**< Network device name for KNI */ - - phys_addr_t tx_phys; - phys_addr_t rx_phys; - phys_addr_t alloc_phys; - phys_addr_t free_phys; - - /* Used by Ethtool */ - phys_addr_t req_phys; - phys_addr_t resp_phys; - phys_addr_t sync_phys; - void * sync_va; - - /* mbuf mempool */ - void * mbuf_va; - phys_addr_t mbuf_phys; - - uint16_t group_id; /**< Group ID */ - uint32_t core_id; /**< core ID to bind for kernel thread */ - - __extension__ - uint8_t force_bind : 1; /**< Flag for kernel thread binding */ - - /* mbuf size */ - unsigned mbuf_size; - unsigned int mtu; - unsigned int min_mtu; - unsigned int max_mtu; - uint8_t mac_addr[6]; - uint8_t iova_mode; -}; - -#define KNI_DEVICE "kni" - -#define RTE_KNI_IOCTL_TEST _IOWR(0, 1, int) -#define RTE_KNI_IOCTL_CREATE _IOWR(0, 2, struct rte_kni_device_info) -#define RTE_KNI_IOCTL_RELEASE _IOWR(0, 3, struct rte_kni_device_info) - -#endif /* _RTE_KNI_COMMON_H_ */ diff --git a/lib/librte_eal/linux/eal/include/rte_os.h b/lib/librte_eal/linux/eal/include/rte_os.h deleted file mode 100644 index 218d4fa86e..0000000000 --- a/lib/librte_eal/linux/eal/include/rte_os.h +++ /dev/null @@ -1,33 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2019 Intel Corporation - */ - -#ifndef _RTE_OS_H_ -#define _RTE_OS_H_ - -/** - * This is header should contain any function/macro definition - * which are not supported natively or named differently in the - * linux OS. 
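The rte_kni_fifo layout above is a single-producer/single-consumer ring shared with
the kernel: it is empty when write == read, and the writer must never catch up with
the reader. A simplified producer sketch (illustrative only; the real
implementation adds memory barriers or C11 atomics around the index updates):

#include <stdint.h>

/* assumes the struct rte_kni_fifo definition shown above */
static inline int
example_kni_fifo_put_one(struct rte_kni_fifo *fifo, void *mbuf)
{
	unsigned int write = fifo->write;
	unsigned int next = (write + 1) % fifo->len;

	if (next == fifo->read)
		return 0;		/* full: would overwrite unread entries */
	fifo->buffer[write] = mbuf;
	fifo->write = next;		/* publish only after the slot is filled */
	return 1;
}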
Functions will be added in future releases. - */ - -#include - -typedef cpu_set_t rte_cpuset_t; -#define RTE_CPU_AND(dst, src1, src2) CPU_AND(dst, src1, src2) -#define RTE_CPU_OR(dst, src1, src2) CPU_OR(dst, src1, src2) -#define RTE_CPU_FILL(set) do \ -{ \ - unsigned int i; \ - CPU_ZERO(set); \ - for (i = 0; i < CPU_SETSIZE; i++) \ - CPU_SET(i, set); \ -} while (0) -#define RTE_CPU_NOT(dst, src) do \ -{ \ - cpu_set_t tmp; \ - RTE_CPU_FILL(&tmp); \ - CPU_XOR(dst, &tmp, src); \ -} while (0) - -#endif /* _RTE_OS_H_ */ diff --git a/lib/librte_eal/linux/eal/meson.build b/lib/librte_eal/linux/eal/meson.build deleted file mode 100644 index b02b0695f5..0000000000 --- a/lib/librte_eal/linux/eal/meson.build +++ /dev/null @@ -1,31 +0,0 @@ -# SPDX-License-Identifier: BSD-3-Clause -# Copyright(c) 2017 Intel Corporation - -eal_inc += include_directories('include') - -env_objs = [] -env_headers = files( - 'include/rte_kni_common.h', - 'include/rte_os.h', -) -env_sources = files('eal_alarm.c', - 'eal_cpuflags.c', - 'eal_debug.c', - 'eal_hugepage_info.c', - 'eal_interrupts.c', - 'eal_memalloc.c', - 'eal_lcore.c', - 'eal_log.c', - 'eal_thread.c', - 'eal_timer.c', - 'eal_vfio.c', - 'eal_vfio_mp_sync.c', - 'eal.c', - 'eal_memory.c', - 'eal_dev.c', -) - -deps += ['kvargs'] -if has_libnuma == 1 - dpdk_conf.set10('RTE_EAL_NUMA_AWARE_HUGEPAGES', true) -endif diff --git a/lib/librte_eal/linux/eal_alarm.c b/lib/librte_eal/linux/eal_alarm.c new file mode 100644 index 0000000000..0924c9205c --- /dev/null +++ b/lib/librte_eal/linux/eal_alarm.c @@ -0,0 +1,244 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef TFD_NONBLOCK +#include +#define TFD_NONBLOCK O_NONBLOCK +#endif + +#define NS_PER_US 1000 +#define US_PER_MS 1000 +#define MS_PER_S 1000 +#ifndef US_PER_S +#define US_PER_S (US_PER_MS * MS_PER_S) +#endif + +#ifdef CLOCK_MONOTONIC_RAW /* Defined in glibc bits/time.h */ +#define CLOCK_TYPE_ID CLOCK_MONOTONIC_RAW +#else +#define CLOCK_TYPE_ID CLOCK_MONOTONIC +#endif + +struct alarm_entry { + LIST_ENTRY(alarm_entry) next; + struct timeval time; + rte_eal_alarm_callback cb_fn; + void *cb_arg; + volatile uint8_t executing; + volatile pthread_t executing_id; +}; + +static LIST_HEAD(alarm_list, alarm_entry) alarm_list = LIST_HEAD_INITIALIZER(); +static rte_spinlock_t alarm_list_lk = RTE_SPINLOCK_INITIALIZER; + +static struct rte_intr_handle intr_handle = {.fd = -1 }; +static int handler_registered = 0; +static void eal_alarm_callback(void *arg); + +int +rte_eal_alarm_init(void) +{ + intr_handle.type = RTE_INTR_HANDLE_ALARM; + /* create a timerfd file descriptor */ + intr_handle.fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK); + if (intr_handle.fd == -1) + goto error; + + return 0; + +error: + rte_errno = errno; + return -1; +} + +static void +eal_alarm_callback(void *arg __rte_unused) +{ + struct timespec now; + struct alarm_entry *ap; + + rte_spinlock_lock(&alarm_list_lk); + while ((ap = LIST_FIRST(&alarm_list)) !=NULL && + clock_gettime(CLOCK_TYPE_ID, &now) == 0 && + (ap->time.tv_sec < now.tv_sec || (ap->time.tv_sec == now.tv_sec && + (ap->time.tv_usec * NS_PER_US) <= now.tv_nsec))) { + ap->executing = 1; + ap->executing_id = pthread_self(); + rte_spinlock_unlock(&alarm_list_lk); + + ap->cb_fn(ap->cb_arg); + + rte_spinlock_lock(&alarm_list_lk); + + LIST_REMOVE(ap, next); + 
free(ap); + } + + if (!LIST_EMPTY(&alarm_list)) { + struct itimerspec atime = { .it_interval = { 0, 0 } }; + + ap = LIST_FIRST(&alarm_list); + atime.it_value.tv_sec = ap->time.tv_sec; + atime.it_value.tv_nsec = ap->time.tv_usec * NS_PER_US; + /* perform borrow for subtraction if necessary */ + if (now.tv_nsec > (ap->time.tv_usec * NS_PER_US)) + atime.it_value.tv_sec--, atime.it_value.tv_nsec += US_PER_S * NS_PER_US; + + atime.it_value.tv_sec -= now.tv_sec; + atime.it_value.tv_nsec -= now.tv_nsec; + timerfd_settime(intr_handle.fd, 0, &atime, NULL); + } + rte_spinlock_unlock(&alarm_list_lk); +} + +int +rte_eal_alarm_set(uint64_t us, rte_eal_alarm_callback cb_fn, void *cb_arg) +{ + struct timespec now; + int ret = 0; + struct alarm_entry *ap, *new_alarm; + + /* Check parameters, including that us won't cause a uint64_t overflow */ + if (us < 1 || us > (UINT64_MAX - US_PER_S) || cb_fn == NULL) + return -EINVAL; + + new_alarm = calloc(1, sizeof(*new_alarm)); + if (new_alarm == NULL) + return -ENOMEM; + + /* use current time to calculate absolute time of alarm */ + clock_gettime(CLOCK_TYPE_ID, &now); + + new_alarm->cb_fn = cb_fn; + new_alarm->cb_arg = cb_arg; + new_alarm->time.tv_usec = ((now.tv_nsec / NS_PER_US) + us) % US_PER_S; + new_alarm->time.tv_sec = now.tv_sec + (((now.tv_nsec / NS_PER_US) + us) / US_PER_S); + + rte_spinlock_lock(&alarm_list_lk); + if (!handler_registered) { + /* registration can fail, callback can be registered later */ + if (rte_intr_callback_register(&intr_handle, + eal_alarm_callback, NULL) == 0) + handler_registered = 1; + } + + if (LIST_EMPTY(&alarm_list)) + LIST_INSERT_HEAD(&alarm_list, new_alarm, next); + else { + LIST_FOREACH(ap, &alarm_list, next) { + if (ap->time.tv_sec > new_alarm->time.tv_sec || + (ap->time.tv_sec == new_alarm->time.tv_sec && + ap->time.tv_usec > new_alarm->time.tv_usec)){ + LIST_INSERT_BEFORE(ap, new_alarm, next); + break; + } + if (LIST_NEXT(ap, next) == NULL) { + LIST_INSERT_AFTER(ap, new_alarm, next); + break; + } + } + } + + if (LIST_FIRST(&alarm_list) == new_alarm) { + struct itimerspec alarm_time = { + .it_interval = {0, 0}, + .it_value = { + .tv_sec = us / US_PER_S, + .tv_nsec = (us % US_PER_S) * NS_PER_US, + }, + }; + ret |= timerfd_settime(intr_handle.fd, 0, &alarm_time, NULL); + } + rte_spinlock_unlock(&alarm_list_lk); + + return ret; +} + +int +rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn, void *cb_arg) +{ + struct alarm_entry *ap, *ap_prev; + int count = 0; + int err = 0; + int executing; + + if (!cb_fn) { + rte_errno = EINVAL; + return -1; + } + + do { + executing = 0; + rte_spinlock_lock(&alarm_list_lk); + /* remove any matches at the start of the list */ + while ((ap = LIST_FIRST(&alarm_list)) != NULL && + cb_fn == ap->cb_fn && + (cb_arg == (void *)-1 || cb_arg == ap->cb_arg)) { + + if (ap->executing == 0) { + LIST_REMOVE(ap, next); + free(ap); + count++; + } else { + /* If calling from other context, mark that alarm is executing + * so loop can spin till it finish. 
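For callers, the alarm implementation above boils down to rte_eal_alarm_set() and
rte_eal_alarm_cancel(). A small usage sketch arming a one-shot 100 ms callback and
cancelling it again; the callback and its argument are placeholders:

#include <rte_alarm.h>
#include <rte_log.h>

static void timeout_cb(void *arg)
{
	RTE_LOG(INFO, EAL, "alarm fired, arg=%p\n", arg);
}

static void arm_and_cancel(void *ctx)
{
	if (rte_eal_alarm_set(100 * 1000 /* us */, timeout_cb, ctx) < 0)
		return;
	/* ... */
	/* returns the number of entries removed, 0 with rte_errno=ENOENT */
	rte_eal_alarm_cancel(timeout_cb, ctx);
}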
Otherwise we are trying to + * cancel our self - mark it by EINPROGRESS */ + if (pthread_equal(ap->executing_id, pthread_self()) == 0) + executing++; + else + err = EINPROGRESS; + + break; + } + } + ap_prev = ap; + + /* now go through list, removing entries not at start */ + LIST_FOREACH(ap, &alarm_list, next) { + /* this won't be true first time through */ + if (cb_fn == ap->cb_fn && + (cb_arg == (void *)-1 || cb_arg == ap->cb_arg)) { + + if (ap->executing == 0) { + LIST_REMOVE(ap, next); + free(ap); + count++; + ap = ap_prev; + } else if (pthread_equal(ap->executing_id, pthread_self()) == 0) + executing++; + else + err = EINPROGRESS; + } + ap_prev = ap; + } + rte_spinlock_unlock(&alarm_list_lk); + } while (executing != 0); + + if (count == 0 && err == 0) + rte_errno = ENOENT; + else if (err) + rte_errno = err; + + return count; +} diff --git a/lib/librte_eal/linux/eal_cpuflags.c b/lib/librte_eal/linux/eal_cpuflags.c new file mode 100644 index 0000000000..d38296e1e5 --- /dev/null +++ b/lib/librte_eal/linux/eal_cpuflags.c @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2018 Red Hat, Inc. + */ + +#include +#include +#include +#include +#include +#include + +#if defined(__GLIBC__) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 16) +#include +#define HAS_AUXV 1 +#endif +#endif + +#include + +#ifndef HAS_AUXV +static unsigned long +getauxval(unsigned long type __rte_unused) +{ + errno = ENOTSUP; + return 0; +} +#endif + +#ifdef RTE_ARCH_64 +typedef Elf64_auxv_t Internal_Elfx_auxv_t; +#else +typedef Elf32_auxv_t Internal_Elfx_auxv_t; +#endif + +/** + * Provides a method for retrieving values from the auxiliary vector and + * possibly running a string comparison. + * + * @return Always returns a result. When the result is 0, check errno + * to see if an error occurred during processing. 
+ */ +static unsigned long +_rte_cpu_getauxval(unsigned long type, const char *str) +{ + unsigned long val; + + errno = 0; + val = getauxval(type); + + if (!val && (errno == ENOTSUP || errno == ENOENT)) { + int auxv_fd = open("/proc/self/auxv", O_RDONLY); + Internal_Elfx_auxv_t auxv; + + if (auxv_fd == -1) + return 0; + + errno = ENOENT; + while (read(auxv_fd, &auxv, sizeof(auxv)) == sizeof(auxv)) { + if (auxv.a_type == type) { + errno = 0; + val = auxv.a_un.a_val; + if (str) + val = strcmp((const char *)val, str); + break; + } + } + close(auxv_fd); + } + + return val; +} + +unsigned long +rte_cpu_getauxval(unsigned long type) +{ + return _rte_cpu_getauxval(type, NULL); +} + +int +rte_cpu_strcmp_auxval(unsigned long type, const char *str) +{ + return _rte_cpu_getauxval(type, str); +} diff --git a/lib/librte_eal/linux/eal_debug.c b/lib/librte_eal/linux/eal_debug.c new file mode 100644 index 0000000000..5d92500bf5 --- /dev/null +++ b/lib/librte_eal/linux/eal_debug.c @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifdef RTE_BACKTRACE +#include +#endif +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define BACKTRACE_SIZE 256 + +/* dump the stack of the calling core */ +void rte_dump_stack(void) +{ +#ifdef RTE_BACKTRACE + void *func[BACKTRACE_SIZE]; + char **symb = NULL; + int size; + + size = backtrace(func, BACKTRACE_SIZE); + symb = backtrace_symbols(func, size); + + if (symb == NULL) + return; + + while (size > 0) { + rte_log(RTE_LOG_ERR, RTE_LOGTYPE_EAL, + "%d: [%s]\n", size, symb[size - 1]); + size --; + } + + free(symb); +#endif /* RTE_BACKTRACE */ +} + +/* not implemented in this environment */ +void rte_dump_registers(void) +{ + return; +} + +/* call abort(), it will generate a coredump if enabled */ +void __rte_panic(const char *funcname, const char *format, ...) +{ + va_list ap; + + rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, "PANIC in %s():\n", funcname); + va_start(ap, format); + rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap); + va_end(ap); + rte_dump_stack(); + rte_dump_registers(); + abort(); +} + +/* + * Like rte_panic this terminates the application. However, no traceback is + * provided and no core-dump is generated. + */ +void +rte_exit(int exit_code, const char *format, ...) +{ + va_list ap; + + if (exit_code != 0) + RTE_LOG(CRIT, EAL, "Error - exiting with code: %d\n" + " Cause: ", exit_code); + + va_start(ap, format); + rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap); + va_end(ap); + +#ifndef RTE_EAL_ALWAYS_PANIC_ON_ERROR + if (rte_eal_cleanup() != 0) + RTE_LOG(CRIT, EAL, + "EAL could not release all resources\n"); + exit(exit_code); +#else + rte_dump_stack(); + rte_dump_registers(); + abort(); +#endif +} diff --git a/lib/librte_eal/linux/eal_dev.c b/lib/librte_eal/linux/eal_dev.c new file mode 100644 index 0000000000..83c9cd6607 --- /dev/null +++ b/lib/librte_eal/linux/eal_dev.c @@ -0,0 +1,396 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2018 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" + +static struct rte_intr_handle intr_handle = {.fd = -1 }; +static bool monitor_started; +static bool hotplug_handle; + +#define EAL_UEV_MSG_LEN 4096 +#define EAL_UEV_MSG_ELEM_LEN 128 + +/* + * spinlock for device hot-unplug failure handling. 
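The rte_cpu_getauxval()/rte_cpu_strcmp_auxval() wrappers defined in eal_cpuflags.c
above give the rest of the EAL a portable way to read the ELF auxiliary vector even
on pre-2.16 glibc. A hedged usage sketch; AT_HWCAP, AT_PLATFORM and the "power9"
string are only example inputs:

#include <errno.h>
#include <sys/auxv.h>

#include <rte_cpuflags.h>

static int platform_is_power9(void)
{
	unsigned long hwcap = rte_cpu_getauxval(AT_HWCAP);

	if (hwcap == 0 && errno != 0)
		return -1;	/* per the note above, 0 plus errno means failure */
	/* strcmp-style result: 0 when AT_PLATFORM equals "power9" */
	return rte_cpu_strcmp_auxval(AT_PLATFORM, "power9") == 0;
}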
If it try to access bus or + * device, such as handle sigbus on bus or handle memory failure for device + * just need to use this lock. It could protect the bus and the device to avoid + * race condition. + */ +static rte_spinlock_t failure_handle_lock = RTE_SPINLOCK_INITIALIZER; + +static struct sigaction sigbus_action_old; + +static int sigbus_need_recover; + +static void dev_uev_handler(__rte_unused void *param); + +/* identify the system layer which reports this event. */ +enum eal_dev_event_subsystem { + EAL_DEV_EVENT_SUBSYSTEM_PCI, /* PCI bus device event */ + EAL_DEV_EVENT_SUBSYSTEM_UIO, /* UIO driver device event */ + EAL_DEV_EVENT_SUBSYSTEM_VFIO, /* VFIO driver device event */ + EAL_DEV_EVENT_SUBSYSTEM_MAX +}; + +static void +sigbus_action_recover(void) +{ + if (sigbus_need_recover) { + sigaction(SIGBUS, &sigbus_action_old, NULL); + sigbus_need_recover = 0; + } +} + +static void sigbus_handler(int signum, siginfo_t *info, + void *ctx __rte_unused) +{ + int ret; + + RTE_LOG(DEBUG, EAL, "Thread catch SIGBUS, fault address:%p\n", + info->si_addr); + + rte_spinlock_lock(&failure_handle_lock); + ret = rte_bus_sigbus_handler(info->si_addr); + rte_spinlock_unlock(&failure_handle_lock); + if (ret == -1) { + rte_exit(EXIT_FAILURE, + "Failed to handle SIGBUS for hot-unplug, " + "(rte_errno: %s)!", strerror(rte_errno)); + } else if (ret == 1) { + if (sigbus_action_old.sa_flags == SA_SIGINFO + && sigbus_action_old.sa_sigaction) { + (*(sigbus_action_old.sa_sigaction))(signum, + info, ctx); + } else if (sigbus_action_old.sa_flags != SA_SIGINFO + && sigbus_action_old.sa_handler) { + (*(sigbus_action_old.sa_handler))(signum); + } else { + rte_exit(EXIT_FAILURE, + "Failed to handle generic SIGBUS!"); + } + } + + RTE_LOG(DEBUG, EAL, "Success to handle SIGBUS for hot-unplug!\n"); +} + +static int cmp_dev_name(const struct rte_device *dev, + const void *_name) +{ + const char *name = _name; + + return strcmp(dev->name, name); +} + +static int +dev_uev_socket_fd_create(void) +{ + struct sockaddr_nl addr; + int ret; + + intr_handle.fd = socket(PF_NETLINK, SOCK_RAW | SOCK_CLOEXEC | + SOCK_NONBLOCK, + NETLINK_KOBJECT_UEVENT); + if (intr_handle.fd < 0) { + RTE_LOG(ERR, EAL, "create uevent fd failed.\n"); + return -1; + } + + memset(&addr, 0, sizeof(addr)); + addr.nl_family = AF_NETLINK; + addr.nl_pid = 0; + addr.nl_groups = 0xffffffff; + + ret = bind(intr_handle.fd, (struct sockaddr *) &addr, sizeof(addr)); + if (ret < 0) { + RTE_LOG(ERR, EAL, "Failed to bind uevent socket.\n"); + goto err; + } + + return 0; +err: + close(intr_handle.fd); + intr_handle.fd = -1; + return ret; +} + +static int +dev_uev_parse(const char *buf, struct rte_dev_event *event, int length) +{ + char action[EAL_UEV_MSG_ELEM_LEN]; + char subsystem[EAL_UEV_MSG_ELEM_LEN]; + char pci_slot_name[EAL_UEV_MSG_ELEM_LEN]; + int i = 0; + + memset(action, 0, EAL_UEV_MSG_ELEM_LEN); + memset(subsystem, 0, EAL_UEV_MSG_ELEM_LEN); + memset(pci_slot_name, 0, EAL_UEV_MSG_ELEM_LEN); + + while (i < length) { + for (; i < length; i++) { + if (*buf) + break; + buf++; + } + /** + * check device uevent from kernel side, no need to check + * uevent from udev. 
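The uevent socket and parser in this file feed the public hotplug API. A hedged
sketch of how an application would consume it: enable hot-unplug handling, start
the monitor, then register one callback for all devices (NULL name); the callback
body is a placeholder.

#include <rte_dev.h>
#include <rte_log.h>

static void on_dev_event(const char *name, enum rte_dev_event_type type,
		void *arg)
{
	(void)arg;
	RTE_LOG(INFO, EAL, "device %s %s\n", name,
		type == RTE_DEV_EVENT_ADD ? "added" : "removed");
}

static int start_hotplug_monitor(void)
{
	if (rte_dev_hotplug_handle_enable() < 0)
		return -1;
	if (rte_dev_event_monitor_start() < 0)
		return -1;
	return rte_dev_event_callback_register(NULL, on_dev_event, NULL);
}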
+ */ + if (!strncmp(buf, "libudev", 7)) { + buf += 7; + i += 7; + return -1; + } + if (!strncmp(buf, "ACTION=", 7)) { + buf += 7; + i += 7; + strlcpy(action, buf, sizeof(action)); + } else if (!strncmp(buf, "SUBSYSTEM=", 10)) { + buf += 10; + i += 10; + strlcpy(subsystem, buf, sizeof(subsystem)); + } else if (!strncmp(buf, "PCI_SLOT_NAME=", 14)) { + buf += 14; + i += 14; + strlcpy(pci_slot_name, buf, sizeof(subsystem)); + event->devname = strdup(pci_slot_name); + } + for (; i < length; i++) { + if (*buf == '\0') + break; + buf++; + } + } + + /* parse the subsystem layer */ + if (!strncmp(subsystem, "uio", 3)) + event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_UIO; + else if (!strncmp(subsystem, "pci", 3)) + event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_PCI; + else if (!strncmp(subsystem, "vfio", 4)) + event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_VFIO; + else + return -1; + + /* parse the action type */ + if (!strncmp(action, "add", 3)) + event->type = RTE_DEV_EVENT_ADD; + else if (!strncmp(action, "remove", 6)) + event->type = RTE_DEV_EVENT_REMOVE; + else + return -1; + return 0; +} + +static void +dev_delayed_unregister(void *param) +{ + rte_intr_callback_unregister(&intr_handle, dev_uev_handler, param); + close(intr_handle.fd); + intr_handle.fd = -1; +} + +static void +dev_uev_handler(__rte_unused void *param) +{ + struct rte_dev_event uevent; + int ret; + char buf[EAL_UEV_MSG_LEN]; + struct rte_bus *bus; + struct rte_device *dev; + const char *busname = ""; + + memset(&uevent, 0, sizeof(struct rte_dev_event)); + memset(buf, 0, EAL_UEV_MSG_LEN); + + ret = recv(intr_handle.fd, buf, EAL_UEV_MSG_LEN, MSG_DONTWAIT); + if (ret < 0 && errno == EAGAIN) + return; + else if (ret <= 0) { + /* connection is closed or broken, can not up again. */ + RTE_LOG(ERR, EAL, "uevent socket connection is broken.\n"); + rte_eal_alarm_set(1, dev_delayed_unregister, NULL); + return; + } + + ret = dev_uev_parse(buf, &uevent, EAL_UEV_MSG_LEN); + if (ret < 0) { + RTE_LOG(DEBUG, EAL, "It is not an valid event " + "that need to be handle.\n"); + return; + } + + RTE_LOG(DEBUG, EAL, "receive uevent(name:%s, type:%d, subsystem:%d)\n", + uevent.devname, uevent.type, uevent.subsystem); + + switch (uevent.subsystem) { + case EAL_DEV_EVENT_SUBSYSTEM_PCI: + case EAL_DEV_EVENT_SUBSYSTEM_UIO: + busname = "pci"; + break; + default: + break; + } + + if (uevent.devname) { + if (uevent.type == RTE_DEV_EVENT_REMOVE && hotplug_handle) { + rte_spinlock_lock(&failure_handle_lock); + bus = rte_bus_find_by_name(busname); + if (bus == NULL) { + RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n", + busname); + goto failure_handle_err; + } + + dev = bus->find_device(NULL, cmp_dev_name, + uevent.devname); + if (dev == NULL) { + RTE_LOG(ERR, EAL, "Cannot find device (%s) on " + "bus (%s)\n", uevent.devname, busname); + goto failure_handle_err; + } + + ret = bus->hot_unplug_handler(dev); + if (ret) { + RTE_LOG(ERR, EAL, "Can not handle hot-unplug " + "for device (%s)\n", dev->name); + } + rte_spinlock_unlock(&failure_handle_lock); + } + rte_dev_event_callback_process(uevent.devname, uevent.type); + } + + return; + +failure_handle_err: + rte_spinlock_unlock(&failure_handle_lock); +} + +int +rte_dev_event_monitor_start(void) +{ + int ret; + + if (monitor_started) + return 0; + + ret = dev_uev_socket_fd_create(); + if (ret) { + RTE_LOG(ERR, EAL, "error create device event fd.\n"); + return -1; + } + + intr_handle.type = RTE_INTR_HANDLE_DEV_EVENT; + ret = rte_intr_callback_register(&intr_handle, dev_uev_handler, NULL); + + if (ret) { + RTE_LOG(ERR, EAL, "fail to 
register uevent callback.\n"); + return -1; + } + + monitor_started = true; + + return 0; +} + +int +rte_dev_event_monitor_stop(void) +{ + int ret; + + if (!monitor_started) + return 0; + + ret = rte_intr_callback_unregister(&intr_handle, dev_uev_handler, + (void *)-1); + if (ret < 0) { + RTE_LOG(ERR, EAL, "fail to unregister uevent callback.\n"); + return ret; + } + + close(intr_handle.fd); + intr_handle.fd = -1; + monitor_started = false; + + return 0; +} + +int +dev_sigbus_handler_register(void) +{ + sigset_t mask; + struct sigaction action; + + rte_errno = 0; + + if (sigbus_need_recover) + return 0; + + sigemptyset(&mask); + sigaddset(&mask, SIGBUS); + action.sa_flags = SA_SIGINFO; + action.sa_mask = mask; + action.sa_sigaction = sigbus_handler; + sigbus_need_recover = !sigaction(SIGBUS, &action, &sigbus_action_old); + + return rte_errno; +} + +int +dev_sigbus_handler_unregister(void) +{ + rte_errno = 0; + + sigbus_action_recover(); + + return rte_errno; +} + +int +rte_dev_hotplug_handle_enable(void) +{ + int ret = 0; + + ret = dev_sigbus_handler_register(); + if (ret < 0) + RTE_LOG(ERR, EAL, + "fail to register sigbus handler for devices.\n"); + + hotplug_handle = true; + + return ret; +} + +int +rte_dev_hotplug_handle_disable(void) +{ + int ret = 0; + + ret = dev_sigbus_handler_unregister(); + if (ret < 0) + RTE_LOG(ERR, EAL, + "fail to unregister sigbus handler for devices.\n"); + + hotplug_handle = false; + + return ret; +} diff --git a/lib/librte_eal/linux/eal_hugepage_info.c b/lib/librte_eal/linux/eal_hugepage_info.c new file mode 100644 index 0000000000..91a4fede76 --- /dev/null +++ b/lib/librte_eal/linux/eal_hugepage_info.c @@ -0,0 +1,547 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include /* for hugetlb-related flags */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "rte_string_fns.h" +#include "eal_internal_cfg.h" +#include "eal_hugepages.h" +#include "eal_filesystem.h" + +static const char sys_dir_path[] = "/sys/kernel/mm/hugepages"; +static const char sys_pages_numa_dir_path[] = "/sys/devices/system/node"; + +/* + * Uses mmap to create a shared memory area for storage of data + * Used in this file to store the hugepage file map on disk + */ +static void * +map_shared_memory(const char *filename, const size_t mem_size, int flags) +{ + void *retval; + int fd = open(filename, flags, 0600); + if (fd < 0) + return NULL; + if (ftruncate(fd, mem_size) < 0) { + close(fd); + return NULL; + } + retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + close(fd); + return retval; +} + +static void * +open_shared_memory(const char *filename, const size_t mem_size) +{ + return map_shared_memory(filename, mem_size, O_RDWR); +} + +static void * +create_shared_memory(const char *filename, const size_t mem_size) +{ + return map_shared_memory(filename, mem_size, O_RDWR | O_CREAT); +} + +static int get_hp_sysfs_value(const char *subdir, const char *file, unsigned long *val) +{ + char path[PATH_MAX]; + + snprintf(path, sizeof(path), "%s/%s/%s", + sys_dir_path, subdir, file); + return eal_parse_sysfs_value(path, val); +} + +/* this function is only called from eal_hugepage_info_init which itself + * is only called from a primary process */ +static uint32_t +get_num_hugepages(const char *subdir) +{ + unsigned long 
resv_pages, num_pages, over_pages, surplus_pages; + const char *nr_hp_file = "free_hugepages"; + const char *nr_rsvd_file = "resv_hugepages"; + const char *nr_over_file = "nr_overcommit_hugepages"; + const char *nr_splus_file = "surplus_hugepages"; + + /* first, check how many reserved pages kernel reports */ + if (get_hp_sysfs_value(subdir, nr_rsvd_file, &resv_pages) < 0) + return 0; + + if (get_hp_sysfs_value(subdir, nr_hp_file, &num_pages) < 0) + return 0; + + if (get_hp_sysfs_value(subdir, nr_over_file, &over_pages) < 0) + over_pages = 0; + + if (get_hp_sysfs_value(subdir, nr_splus_file, &surplus_pages) < 0) + surplus_pages = 0; + + /* adjust num_pages */ + if (num_pages >= resv_pages) + num_pages -= resv_pages; + else if (resv_pages) + num_pages = 0; + + if (over_pages >= surplus_pages) + over_pages -= surplus_pages; + else + over_pages = 0; + + if (num_pages == 0 && over_pages == 0) + RTE_LOG(WARNING, EAL, "No available hugepages reported in %s\n", + subdir); + + num_pages += over_pages; + if (num_pages < over_pages) /* overflow */ + num_pages = UINT32_MAX; + + /* we want to return a uint32_t and more than this looks suspicious + * anyway ... */ + if (num_pages > UINT32_MAX) + num_pages = UINT32_MAX; + + return num_pages; +} + +static uint32_t +get_num_hugepages_on_node(const char *subdir, unsigned int socket) +{ + char path[PATH_MAX], socketpath[PATH_MAX]; + DIR *socketdir; + unsigned long num_pages = 0; + const char *nr_hp_file = "free_hugepages"; + + snprintf(socketpath, sizeof(socketpath), "%s/node%u/hugepages", + sys_pages_numa_dir_path, socket); + + socketdir = opendir(socketpath); + if (socketdir) { + /* Keep calm and carry on */ + closedir(socketdir); + } else { + /* Can't find socket dir, so ignore it */ + return 0; + } + + snprintf(path, sizeof(path), "%s/%s/%s", + socketpath, subdir, nr_hp_file); + if (eal_parse_sysfs_value(path, &num_pages) < 0) + return 0; + + if (num_pages == 0) + RTE_LOG(WARNING, EAL, "No free hugepages reported in %s\n", + subdir); + + /* + * we want to return a uint32_t and more than this looks suspicious + * anyway ... 
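+	 * (For example, with 2 MB pages on socket 0 the value read above
+	 * comes from
+	 * /sys/devices/system/node/node0/hugepages/hugepages-2048kB/free_hugepages.)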
+ */ + if (num_pages > UINT32_MAX) + num_pages = UINT32_MAX; + + return num_pages; +} + +static uint64_t +get_default_hp_size(void) +{ + const char proc_meminfo[] = "/proc/meminfo"; + const char str_hugepagesz[] = "Hugepagesize:"; + unsigned hugepagesz_len = sizeof(str_hugepagesz) - 1; + char buffer[256]; + unsigned long long size = 0; + + FILE *fd = fopen(proc_meminfo, "r"); + if (fd == NULL) + rte_panic("Cannot open %s\n", proc_meminfo); + while(fgets(buffer, sizeof(buffer), fd)){ + if (strncmp(buffer, str_hugepagesz, hugepagesz_len) == 0){ + size = rte_str_to_size(&buffer[hugepagesz_len]); + break; + } + } + fclose(fd); + if (size == 0) + rte_panic("Cannot get default hugepage size from %s\n", proc_meminfo); + return size; +} + +static int +get_hugepage_dir(uint64_t hugepage_sz, char *hugedir, int len) +{ + enum proc_mount_fieldnames { + DEVICE = 0, + MOUNTPT, + FSTYPE, + OPTIONS, + _FIELDNAME_MAX + }; + static uint64_t default_size = 0; + const char proc_mounts[] = "/proc/mounts"; + const char hugetlbfs_str[] = "hugetlbfs"; + const size_t htlbfs_str_len = sizeof(hugetlbfs_str) - 1; + const char pagesize_opt[] = "pagesize="; + const size_t pagesize_opt_len = sizeof(pagesize_opt) - 1; + const char split_tok = ' '; + char *splitstr[_FIELDNAME_MAX]; + char buf[BUFSIZ]; + int retval = -1; + + FILE *fd = fopen(proc_mounts, "r"); + if (fd == NULL) + rte_panic("Cannot open %s\n", proc_mounts); + + if (default_size == 0) + default_size = get_default_hp_size(); + + while (fgets(buf, sizeof(buf), fd)){ + if (rte_strsplit(buf, sizeof(buf), splitstr, _FIELDNAME_MAX, + split_tok) != _FIELDNAME_MAX) { + RTE_LOG(ERR, EAL, "Error parsing %s\n", proc_mounts); + break; /* return NULL */ + } + + /* we have a specified --huge-dir option, only examine that dir */ + if (internal_config.hugepage_dir != NULL && + strcmp(splitstr[MOUNTPT], internal_config.hugepage_dir) != 0) + continue; + + if (strncmp(splitstr[FSTYPE], hugetlbfs_str, htlbfs_str_len) == 0){ + const char *pagesz_str = strstr(splitstr[OPTIONS], pagesize_opt); + + /* if no explicit page size, the default page size is compared */ + if (pagesz_str == NULL){ + if (hugepage_sz == default_size){ + strlcpy(hugedir, splitstr[MOUNTPT], len); + retval = 0; + break; + } + } + /* there is an explicit page size, so check it */ + else { + uint64_t pagesz = rte_str_to_size(&pagesz_str[pagesize_opt_len]); + if (pagesz == hugepage_sz) { + strlcpy(hugedir, splitstr[MOUNTPT], len); + retval = 0; + break; + } + } + } /* end if strncmp hugetlbfs */ + } /* end while fgets */ + + fclose(fd); + return retval; +} + +/* + * Clear the hugepage directory of whatever hugepage files + * there are. Checks if the file is locked (i.e. + * if it's in use by another DPDK process). 
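+ * Hugepage files created by DPDK match the "*map_*" pattern used below;
+ * with the default file prefix they are typically named along the lines
+ * of /dev/hugepages/rtemap_0 (the exact mountpoint and prefix are
+ * configuration dependent).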
+ */ +static int +clear_hugedir(const char * hugedir) +{ + DIR *dir; + struct dirent *dirent; + int dir_fd, fd, lck_result; + const char filter[] = "*map_*"; /* matches hugepage files */ + + /* open directory */ + dir = opendir(hugedir); + if (!dir) { + RTE_LOG(ERR, EAL, "Unable to open hugepage directory %s\n", + hugedir); + goto error; + } + dir_fd = dirfd(dir); + + dirent = readdir(dir); + if (!dirent) { + RTE_LOG(ERR, EAL, "Unable to read hugepage directory %s\n", + hugedir); + goto error; + } + + while(dirent != NULL){ + /* skip files that don't match the hugepage pattern */ + if (fnmatch(filter, dirent->d_name, 0) > 0) { + dirent = readdir(dir); + continue; + } + + /* try and lock the file */ + fd = openat(dir_fd, dirent->d_name, O_RDONLY); + + /* skip to next file */ + if (fd == -1) { + dirent = readdir(dir); + continue; + } + + /* non-blocking lock */ + lck_result = flock(fd, LOCK_EX | LOCK_NB); + + /* if lock succeeds, remove the file */ + if (lck_result != -1) + unlinkat(dir_fd, dirent->d_name, 0); + close (fd); + dirent = readdir(dir); + } + + closedir(dir); + return 0; + +error: + if (dir) + closedir(dir); + + RTE_LOG(ERR, EAL, "Error while clearing hugepage dir: %s\n", + strerror(errno)); + + return -1; +} + +static int +compare_hpi(const void *a, const void *b) +{ + const struct hugepage_info *hpi_a = a; + const struct hugepage_info *hpi_b = b; + + return hpi_b->hugepage_sz - hpi_a->hugepage_sz; +} + +static void +calc_num_pages(struct hugepage_info *hpi, struct dirent *dirent) +{ + uint64_t total_pages = 0; + unsigned int i; + + /* + * first, try to put all hugepages into relevant sockets, but + * if first attempts fails, fall back to collecting all pages + * in one socket and sorting them later + */ + total_pages = 0; + /* we also don't want to do this for legacy init */ + if (!internal_config.legacy_mem) + for (i = 0; i < rte_socket_count(); i++) { + int socket = rte_socket_id_by_idx(i); + unsigned int num_pages = + get_num_hugepages_on_node( + dirent->d_name, socket); + hpi->num_pages[socket] = num_pages; + total_pages += num_pages; + } + /* + * we failed to sort memory from the get go, so fall + * back to old way + */ + if (total_pages == 0) { + hpi->num_pages[0] = get_num_hugepages(dirent->d_name); + +#ifndef RTE_ARCH_64 + /* for 32-bit systems, limit number of hugepages to + * 1GB per page size */ + hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0], + RTE_PGSIZE_1G / hpi->hugepage_sz); +#endif + } +} + +static int +hugepage_info_init(void) +{ const char dirent_start_text[] = "hugepages-"; + const size_t dirent_start_len = sizeof(dirent_start_text) - 1; + unsigned int i, num_sizes = 0; + DIR *dir; + struct dirent *dirent; + + dir = opendir(sys_dir_path); + if (dir == NULL) { + RTE_LOG(ERR, EAL, + "Cannot open directory %s to read system hugepage info\n", + sys_dir_path); + return -1; + } + + for (dirent = readdir(dir); dirent != NULL; dirent = readdir(dir)) { + struct hugepage_info *hpi; + + if (strncmp(dirent->d_name, dirent_start_text, + dirent_start_len) != 0) + continue; + + if (num_sizes >= MAX_HUGEPAGE_SIZES) + break; + + hpi = &internal_config.hugepage_info[num_sizes]; + hpi->hugepage_sz = + rte_str_to_size(&dirent->d_name[dirent_start_len]); + + /* first, check if we have a mountpoint */ + if (get_hugepage_dir(hpi->hugepage_sz, + hpi->hugedir, sizeof(hpi->hugedir)) < 0) { + uint32_t num_pages; + + num_pages = get_num_hugepages(dirent->d_name); + if (num_pages > 0) + RTE_LOG(NOTICE, EAL, + "%" PRIu32 " hugepages of size " + "%" PRIu64 " reserved, but no mounted " + 
"hugetlbfs found for that size\n", + num_pages, hpi->hugepage_sz); + /* if we have kernel support for reserving hugepages + * through mmap, and we're in in-memory mode, treat this + * page size as valid. we cannot be in legacy mode at + * this point because we've checked this earlier in the + * init process. + */ +#ifdef MAP_HUGE_SHIFT + if (internal_config.in_memory) { + RTE_LOG(DEBUG, EAL, "In-memory mode enabled, " + "hugepages of size %" PRIu64 " bytes " + "will be allocated anonymously\n", + hpi->hugepage_sz); + calc_num_pages(hpi, dirent); + num_sizes++; + } +#endif + continue; + } + + /* try to obtain a writelock */ + hpi->lock_descriptor = open(hpi->hugedir, O_RDONLY); + + /* if blocking lock failed */ + if (flock(hpi->lock_descriptor, LOCK_EX) == -1) { + RTE_LOG(CRIT, EAL, + "Failed to lock hugepage directory!\n"); + break; + } + /* clear out the hugepages dir from unused pages */ + if (clear_hugedir(hpi->hugedir) == -1) + break; + + calc_num_pages(hpi, dirent); + + num_sizes++; + } + closedir(dir); + + /* something went wrong, and we broke from the for loop above */ + if (dirent != NULL) + return -1; + + internal_config.num_hugepage_sizes = num_sizes; + + /* sort the page directory entries by size, largest to smallest */ + qsort(&internal_config.hugepage_info[0], num_sizes, + sizeof(internal_config.hugepage_info[0]), compare_hpi); + + /* now we have all info, check we have at least one valid size */ + for (i = 0; i < num_sizes; i++) { + /* pages may no longer all be on socket 0, so check all */ + unsigned int j, num_pages = 0; + struct hugepage_info *hpi = &internal_config.hugepage_info[i]; + + for (j = 0; j < RTE_MAX_NUMA_NODES; j++) + num_pages += hpi->num_pages[j]; + if (num_pages > 0) + return 0; + } + + /* no valid hugepage mounts available, return error */ + return -1; +} + +/* + * when we initialize the hugepage info, everything goes + * to socket 0 by default. it will later get sorted by memory + * initialization procedure. 
+ */ +int +eal_hugepage_info_init(void) +{ + struct hugepage_info *hpi, *tmp_hpi; + unsigned int i; + + if (hugepage_info_init() < 0) + return -1; + + /* for no shared files mode, we're done */ + if (internal_config.no_shconf) + return 0; + + hpi = &internal_config.hugepage_info[0]; + + tmp_hpi = create_shared_memory(eal_hugepage_info_path(), + sizeof(internal_config.hugepage_info)); + if (tmp_hpi == NULL) { + RTE_LOG(ERR, EAL, "Failed to create shared memory!\n"); + return -1; + } + + memcpy(tmp_hpi, hpi, sizeof(internal_config.hugepage_info)); + + /* we've copied file descriptors along with everything else, but they + * will be invalid in secondary process, so overwrite them + */ + for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) { + struct hugepage_info *tmp = &tmp_hpi[i]; + tmp->lock_descriptor = -1; + } + + if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) { + RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n"); + return -1; + } + return 0; +} + +int eal_hugepage_info_read(void) +{ + struct hugepage_info *hpi = &internal_config.hugepage_info[0]; + struct hugepage_info *tmp_hpi; + + tmp_hpi = open_shared_memory(eal_hugepage_info_path(), + sizeof(internal_config.hugepage_info)); + if (tmp_hpi == NULL) { + RTE_LOG(ERR, EAL, "Failed to open shared memory!\n"); + return -1; + } + + memcpy(hpi, tmp_hpi, sizeof(internal_config.hugepage_info)); + + if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) { + RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n"); + return -1; + } + return 0; +} diff --git a/lib/librte_eal/linux/eal_interrupts.c b/lib/librte_eal/linux/eal_interrupts.c new file mode 100644 index 0000000000..cb8e107098 --- /dev/null +++ b/lib/librte_eal/linux/eal_interrupts.c @@ -0,0 +1,1495 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_vfio.h" +#include "eal_thread.h" + +#define EAL_INTR_EPOLL_WAIT_FOREVER (-1) +#define NB_OTHER_INTR 1 + +static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */ + +/** + * union for pipe fds. 
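+ * Both anonymous structs alias the same pair of descriptors, so the fds
+ * filled in by pipe(intr_pipe.pipefd) can later be used directly as
+ * intr_pipe.readfd and intr_pipe.writefd.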
+ */ +union intr_pipefds{ + struct { + int pipefd[2]; + }; + struct { + int readfd; + int writefd; + }; +}; + +/** + * union buffer for reading on different devices + */ +union rte_intr_read_buffer { + int uio_intr_count; /* for uio device */ +#ifdef VFIO_PRESENT + uint64_t vfio_intr_count; /* for vfio device */ +#endif + uint64_t timerfd_num; /* for timerfd */ + char charbuf[16]; /* for others */ +}; + +TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback); +TAILQ_HEAD(rte_intr_source_list, rte_intr_source); + +struct rte_intr_callback { + TAILQ_ENTRY(rte_intr_callback) next; + rte_intr_callback_fn cb_fn; /**< callback address */ + void *cb_arg; /**< parameter for callback */ + uint8_t pending_delete; /**< delete after callback is called */ + rte_intr_unregister_callback_fn ucb_fn; /**< fn to call before cb is deleted */ +}; + +struct rte_intr_source { + TAILQ_ENTRY(rte_intr_source) next; + struct rte_intr_handle intr_handle; /**< interrupt handle */ + struct rte_intr_cb_list callbacks; /**< user callbacks */ + uint32_t active; +}; + +/* global spinlock for interrupt data operation */ +static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER; + +/* union buffer for pipe read/write */ +static union intr_pipefds intr_pipe; + +/* interrupt sources list */ +static struct rte_intr_source_list intr_sources; + +/* interrupt handling thread */ +static pthread_t intr_thread; + +/* VFIO interrupts */ +#ifdef VFIO_PRESENT + +#define IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + sizeof(int)) +/* irq set buffer length for queue interrupts and LSC interrupt */ +#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \ + sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1)) + +/* enable legacy (INTx) interrupts */ +static int +vfio_enable_intx(const struct rte_intr_handle *intr_handle) { + struct vfio_irq_set *irq_set; + char irq_set_buf[IRQ_SET_BUF_LEN]; + int len, ret; + int *fd_ptr; + + len = sizeof(irq_set_buf); + + /* enable INTx */ + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 1; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; + irq_set->start = 0; + fd_ptr = (int *) &irq_set->data; + *fd_ptr = intr_handle->fd; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error enabling INTx interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + + /* unmask INTx after enabling */ + memset(irq_set, 0, len); + len = sizeof(struct vfio_irq_set); + irq_set->argsz = len; + irq_set->count = 1; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK; + irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + return 0; +} + +/* disable legacy (INTx) interrupts */ +static int +vfio_disable_intx(const struct rte_intr_handle *intr_handle) { + struct vfio_irq_set *irq_set; + char irq_set_buf[IRQ_SET_BUF_LEN]; + int len, ret; + + len = sizeof(struct vfio_irq_set); + + /* mask interrupts before disabling */ + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 1; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK; + irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + 
RTE_LOG(ERR, EAL, "Error masking INTx interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + + /* disable INTx*/ + memset(irq_set, 0, len); + irq_set->argsz = len; + irq_set->count = 0; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_INTX_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, + "Error disabling INTx interrupts for fd %d\n", intr_handle->fd); + return -1; + } + return 0; +} + +/* unmask/ack legacy (INTx) interrupts */ +static int +vfio_ack_intx(const struct rte_intr_handle *intr_handle) +{ + struct vfio_irq_set irq_set; + + /* unmask INTx */ + memset(&irq_set, 0, sizeof(irq_set)); + irq_set.argsz = sizeof(irq_set); + irq_set.count = 1; + irq_set.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK; + irq_set.index = VFIO_PCI_INTX_IRQ_INDEX; + irq_set.start = 0; + + if (ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, &irq_set)) { + RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + return 0; +} + +/* enable MSI interrupts */ +static int +vfio_enable_msi(const struct rte_intr_handle *intr_handle) { + int len, ret; + char irq_set_buf[IRQ_SET_BUF_LEN]; + struct vfio_irq_set *irq_set; + int *fd_ptr; + + len = sizeof(irq_set_buf); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 1; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSI_IRQ_INDEX; + irq_set->start = 0; + fd_ptr = (int *) &irq_set->data; + *fd_ptr = intr_handle->fd; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error enabling MSI interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + return 0; +} + +/* disable MSI interrupts */ +static int +vfio_disable_msi(const struct rte_intr_handle *intr_handle) { + struct vfio_irq_set *irq_set; + char irq_set_buf[IRQ_SET_BUF_LEN]; + int len, ret; + + len = sizeof(struct vfio_irq_set); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 0; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSI_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) + RTE_LOG(ERR, EAL, + "Error disabling MSI interrupts for fd %d\n", intr_handle->fd); + + return ret; +} + +/* enable MSI-X interrupts */ +static int +vfio_enable_msix(const struct rte_intr_handle *intr_handle) { + int len, ret; + char irq_set_buf[MSIX_IRQ_SET_BUF_LEN]; + struct vfio_irq_set *irq_set; + int *fd_ptr; + + len = sizeof(irq_set_buf); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + /* 0 < irq_set->count < RTE_MAX_RXTX_INTR_VEC_ID + 1 */ + irq_set->count = intr_handle->max_intr ? + (intr_handle->max_intr > RTE_MAX_RXTX_INTR_VEC_ID + 1 ? 
+ RTE_MAX_RXTX_INTR_VEC_ID + 1 : intr_handle->max_intr) : 1; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; + irq_set->start = 0; + fd_ptr = (int *) &irq_set->data; + /* INTR vector offset 0 reserve for non-efds mapping */ + fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = intr_handle->fd; + memcpy(&fd_ptr[RTE_INTR_VEC_RXTX_OFFSET], intr_handle->efds, + sizeof(*intr_handle->efds) * intr_handle->nb_efd); + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error enabling MSI-X interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + + return 0; +} + +/* disable MSI-X interrupts */ +static int +vfio_disable_msix(const struct rte_intr_handle *intr_handle) { + struct vfio_irq_set *irq_set; + char irq_set_buf[MSIX_IRQ_SET_BUF_LEN]; + int len, ret; + + len = sizeof(struct vfio_irq_set); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 0; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) + RTE_LOG(ERR, EAL, + "Error disabling MSI-X interrupts for fd %d\n", intr_handle->fd); + + return ret; +} + +#ifdef HAVE_VFIO_DEV_REQ_INTERFACE +/* enable req notifier */ +static int +vfio_enable_req(const struct rte_intr_handle *intr_handle) +{ + int len, ret; + char irq_set_buf[IRQ_SET_BUF_LEN]; + struct vfio_irq_set *irq_set; + int *fd_ptr; + + len = sizeof(irq_set_buf); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 1; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | + VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_REQ_IRQ_INDEX; + irq_set->start = 0; + fd_ptr = (int *) &irq_set->data; + *fd_ptr = intr_handle->fd; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) { + RTE_LOG(ERR, EAL, "Error enabling req interrupts for fd %d\n", + intr_handle->fd); + return -1; + } + + return 0; +} + +/* disable req notifier */ +static int +vfio_disable_req(const struct rte_intr_handle *intr_handle) +{ + struct vfio_irq_set *irq_set; + char irq_set_buf[IRQ_SET_BUF_LEN]; + int len, ret; + + len = sizeof(struct vfio_irq_set); + + irq_set = (struct vfio_irq_set *) irq_set_buf; + irq_set->argsz = len; + irq_set->count = 0; + irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_REQ_IRQ_INDEX; + irq_set->start = 0; + + ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set); + + if (ret) + RTE_LOG(ERR, EAL, "Error disabling req interrupts for fd %d\n", + intr_handle->fd); + + return ret; +} +#endif +#endif + +static int +uio_intx_intr_disable(const struct rte_intr_handle *intr_handle) +{ + unsigned char command_high; + + /* use UIO config file descriptor for uio_pci_generic */ + if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) { + RTE_LOG(ERR, EAL, + "Error reading interrupts status for fd %d\n", + intr_handle->uio_cfg_fd); + return -1; + } + /* disable interrupts */ + command_high |= 0x4; + if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) { + RTE_LOG(ERR, EAL, + "Error disabling interrupts for fd %d\n", + intr_handle->uio_cfg_fd); + return -1; + } + + return 0; +} + +static int +uio_intx_intr_enable(const struct rte_intr_handle *intr_handle) +{ + unsigned char command_high; + + /* use UIO config file descriptor for 
uio_pci_generic */ + if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) { + RTE_LOG(ERR, EAL, + "Error reading interrupts status for fd %d\n", + intr_handle->uio_cfg_fd); + return -1; + } + /* enable interrupts */ + command_high &= ~0x4; + if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) { + RTE_LOG(ERR, EAL, + "Error enabling interrupts for fd %d\n", + intr_handle->uio_cfg_fd); + return -1; + } + + return 0; +} + +static int +uio_intr_disable(const struct rte_intr_handle *intr_handle) +{ + const int value = 0; + + if (write(intr_handle->fd, &value, sizeof(value)) < 0) { + RTE_LOG(ERR, EAL, + "Error disabling interrupts for fd %d (%s)\n", + intr_handle->fd, strerror(errno)); + return -1; + } + return 0; +} + +static int +uio_intr_enable(const struct rte_intr_handle *intr_handle) +{ + const int value = 1; + + if (write(intr_handle->fd, &value, sizeof(value)) < 0) { + RTE_LOG(ERR, EAL, + "Error enabling interrupts for fd %d (%s)\n", + intr_handle->fd, strerror(errno)); + return -1; + } + return 0; +} + +int +rte_intr_callback_register(const struct rte_intr_handle *intr_handle, + rte_intr_callback_fn cb, void *cb_arg) +{ + int ret, wake_thread; + struct rte_intr_source *src; + struct rte_intr_callback *callback; + + wake_thread = 0; + + /* first do parameter checking */ + if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) { + RTE_LOG(ERR, EAL, + "Registering with invalid input parameter\n"); + return -EINVAL; + } + + /* allocate a new interrupt callback entity */ + callback = calloc(1, sizeof(*callback)); + if (callback == NULL) { + RTE_LOG(ERR, EAL, "Can not allocate memory\n"); + return -ENOMEM; + } + callback->cb_fn = cb; + callback->cb_arg = cb_arg; + callback->pending_delete = 0; + callback->ucb_fn = NULL; + + rte_spinlock_lock(&intr_lock); + + /* check if there is at least one callback registered for the fd */ + TAILQ_FOREACH(src, &intr_sources, next) { + if (src->intr_handle.fd == intr_handle->fd) { + /* we had no interrupts for this */ + if (TAILQ_EMPTY(&src->callbacks)) + wake_thread = 1; + + TAILQ_INSERT_TAIL(&(src->callbacks), callback, next); + ret = 0; + break; + } + } + + /* no existing callbacks for this - add new source */ + if (src == NULL) { + src = calloc(1, sizeof(*src)); + if (src == NULL) { + RTE_LOG(ERR, EAL, "Can not allocate memory\n"); + free(callback); + ret = -ENOMEM; + } else { + src->intr_handle = *intr_handle; + TAILQ_INIT(&src->callbacks); + TAILQ_INSERT_TAIL(&(src->callbacks), callback, next); + TAILQ_INSERT_TAIL(&intr_sources, src, next); + wake_thread = 1; + ret = 0; + } + } + + rte_spinlock_unlock(&intr_lock); + + /** + * check if need to notify the pipe fd waited by epoll_wait to + * rebuild the wait list. 
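+	 * A minimal usage sketch for this API (names are illustrative,
+	 * not part of this patch):
+	 *   static void my_handler(void *cb_arg) { ... }
+	 *   ...
+	 *   rte_intr_callback_register(&dev->intr_handle, my_handler, dev);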
+ */ + if (wake_thread) + if (write(intr_pipe.writefd, "1", 1) < 0) + return -EPIPE; + + return ret; +} + +int +rte_intr_callback_unregister_pending(const struct rte_intr_handle *intr_handle, + rte_intr_callback_fn cb_fn, void *cb_arg, + rte_intr_unregister_callback_fn ucb_fn) +{ + int ret; + struct rte_intr_source *src; + struct rte_intr_callback *cb, *next; + + /* do parameter checking first */ + if (intr_handle == NULL || intr_handle->fd < 0) { + RTE_LOG(ERR, EAL, + "Unregistering with invalid input parameter\n"); + return -EINVAL; + } + + rte_spinlock_lock(&intr_lock); + + /* check if the insterrupt source for the fd is existent */ + TAILQ_FOREACH(src, &intr_sources, next) + if (src->intr_handle.fd == intr_handle->fd) + break; + + /* No interrupt source registered for the fd */ + if (src == NULL) { + ret = -ENOENT; + + /* only usable if the source is active */ + } else if (src->active == 0) { + ret = -EAGAIN; + + } else { + ret = 0; + + /* walk through the callbacks and mark all that match. */ + for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) { + next = TAILQ_NEXT(cb, next); + if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 || + cb->cb_arg == cb_arg)) { + cb->pending_delete = 1; + cb->ucb_fn = ucb_fn; + ret++; + } + } + } + + rte_spinlock_unlock(&intr_lock); + + return ret; +} + +int +rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle, + rte_intr_callback_fn cb_fn, void *cb_arg) +{ + int ret; + struct rte_intr_source *src; + struct rte_intr_callback *cb, *next; + + /* do parameter checking first */ + if (intr_handle == NULL || intr_handle->fd < 0) { + RTE_LOG(ERR, EAL, + "Unregistering with invalid input parameter\n"); + return -EINVAL; + } + + rte_spinlock_lock(&intr_lock); + + /* check if the insterrupt source for the fd is existent */ + TAILQ_FOREACH(src, &intr_sources, next) + if (src->intr_handle.fd == intr_handle->fd) + break; + + /* No interrupt source registered for the fd */ + if (src == NULL) { + ret = -ENOENT; + + /* interrupt source has some active callbacks right now. */ + } else if (src->active != 0) { + ret = -EAGAIN; + + /* ok to remove. */ + } else { + ret = 0; + + /*walk through the callbacks and remove all that match. */ + for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) { + + next = TAILQ_NEXT(cb, next); + + if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 || + cb->cb_arg == cb_arg)) { + TAILQ_REMOVE(&src->callbacks, cb, next); + free(cb); + ret++; + } + } + + /* all callbacks for that source are removed. 
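+		 * (As coded above, passing cb_arg == (void *)-1, e.g.
+		 *  rte_intr_callback_unregister(handle, my_handler, (void *)-1),
+		 *  removes every callback registered with my_handler for this
+		 *  source regardless of its argument; my_handler is illustrative.)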
*/ + if (TAILQ_EMPTY(&src->callbacks)) { + TAILQ_REMOVE(&intr_sources, src, next); + free(src); + } + } + + rte_spinlock_unlock(&intr_lock); + + /* notify the pipe fd waited by epoll_wait to rebuild the wait list */ + if (ret >= 0 && write(intr_pipe.writefd, "1", 1) < 0) { + ret = -EPIPE; + } + + return ret; +} + +int +rte_intr_enable(const struct rte_intr_handle *intr_handle) +{ + if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV) + return 0; + + if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) + return -1; + + switch (intr_handle->type){ + /* write to the uio fd to enable the interrupt */ + case RTE_INTR_HANDLE_UIO: + if (uio_intr_enable(intr_handle)) + return -1; + break; + case RTE_INTR_HANDLE_UIO_INTX: + if (uio_intx_intr_enable(intr_handle)) + return -1; + break; + /* not used at this moment */ + case RTE_INTR_HANDLE_ALARM: + return -1; +#ifdef VFIO_PRESENT + case RTE_INTR_HANDLE_VFIO_MSIX: + if (vfio_enable_msix(intr_handle)) + return -1; + break; + case RTE_INTR_HANDLE_VFIO_MSI: + if (vfio_enable_msi(intr_handle)) + return -1; + break; + case RTE_INTR_HANDLE_VFIO_LEGACY: + if (vfio_enable_intx(intr_handle)) + return -1; + break; +#ifdef HAVE_VFIO_DEV_REQ_INTERFACE + case RTE_INTR_HANDLE_VFIO_REQ: + if (vfio_enable_req(intr_handle)) + return -1; + break; +#endif +#endif + /* not used at this moment */ + case RTE_INTR_HANDLE_DEV_EVENT: + return -1; + /* unknown handle type */ + default: + RTE_LOG(ERR, EAL, + "Unknown handle type of fd %d\n", + intr_handle->fd); + return -1; + } + + return 0; +} + +/** + * PMD generally calls this function at the end of its IRQ callback. + * Internally, it unmasks the interrupt if possible. + * + * For INTx, unmasking is required as the interrupt is auto-masked prior to + * invoking callback. + * + * For MSI/MSI-X, unmasking is typically not needed as the interrupt is not + * auto-masked. In fact, for interrupt handle types VFIO_MSIX and VFIO_MSI, + * this function is no-op. 
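+ * Illustrative use in a driver interrupt callback (names made up):
+ *   static void my_lsc_handler(void *param)
+ *   {
+ *           struct my_adapter *ad = param;
+ *           ... process the event ...
+ *           rte_intr_ack(ad->intr_handle);
+ *   }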
+ */ +int +rte_intr_ack(const struct rte_intr_handle *intr_handle) +{ + if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV) + return 0; + + if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) + return -1; + + switch (intr_handle->type) { + /* Both acking and enabling are same for UIO */ + case RTE_INTR_HANDLE_UIO: + if (uio_intr_enable(intr_handle)) + return -1; + break; + case RTE_INTR_HANDLE_UIO_INTX: + if (uio_intx_intr_enable(intr_handle)) + return -1; + break; + /* not used at this moment */ + case RTE_INTR_HANDLE_ALARM: + return -1; +#ifdef VFIO_PRESENT + /* VFIO MSI* is implicitly acked unlike INTx, nothing to do */ + case RTE_INTR_HANDLE_VFIO_MSIX: + case RTE_INTR_HANDLE_VFIO_MSI: + return 0; + case RTE_INTR_HANDLE_VFIO_LEGACY: + if (vfio_ack_intx(intr_handle)) + return -1; + break; +#ifdef HAVE_VFIO_DEV_REQ_INTERFACE + case RTE_INTR_HANDLE_VFIO_REQ: + return -1; +#endif +#endif + /* not used at this moment */ + case RTE_INTR_HANDLE_DEV_EVENT: + return -1; + /* unknown handle type */ + default: + RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n", + intr_handle->fd); + return -1; + } + + return 0; +} + +int +rte_intr_disable(const struct rte_intr_handle *intr_handle) +{ + if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV) + return 0; + + if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0) + return -1; + + switch (intr_handle->type){ + /* write to the uio fd to disable the interrupt */ + case RTE_INTR_HANDLE_UIO: + if (uio_intr_disable(intr_handle)) + return -1; + break; + case RTE_INTR_HANDLE_UIO_INTX: + if (uio_intx_intr_disable(intr_handle)) + return -1; + break; + /* not used at this moment */ + case RTE_INTR_HANDLE_ALARM: + return -1; +#ifdef VFIO_PRESENT + case RTE_INTR_HANDLE_VFIO_MSIX: + if (vfio_disable_msix(intr_handle)) + return -1; + break; + case RTE_INTR_HANDLE_VFIO_MSI: + if (vfio_disable_msi(intr_handle)) + return -1; + break; + case RTE_INTR_HANDLE_VFIO_LEGACY: + if (vfio_disable_intx(intr_handle)) + return -1; + break; +#ifdef HAVE_VFIO_DEV_REQ_INTERFACE + case RTE_INTR_HANDLE_VFIO_REQ: + if (vfio_disable_req(intr_handle)) + return -1; + break; +#endif +#endif + /* not used at this moment */ + case RTE_INTR_HANDLE_DEV_EVENT: + return -1; + /* unknown handle type */ + default: + RTE_LOG(ERR, EAL, + "Unknown handle type of fd %d\n", + intr_handle->fd); + return -1; + } + + return 0; +} + +static int +eal_intr_process_interrupts(struct epoll_event *events, int nfds) +{ + bool call = false; + int n, bytes_read, rv; + struct rte_intr_source *src; + struct rte_intr_callback *cb, *next; + union rte_intr_read_buffer buf; + struct rte_intr_callback active_cb; + + for (n = 0; n < nfds; n++) { + + /** + * if the pipe fd is ready to read, return out to + * rebuild the wait list. + */ + if (events[n].data.fd == intr_pipe.readfd){ + int r = read(intr_pipe.readfd, buf.charbuf, + sizeof(buf.charbuf)); + RTE_SET_USED(r); + return -1; + } + rte_spinlock_lock(&intr_lock); + TAILQ_FOREACH(src, &intr_sources, next) + if (src->intr_handle.fd == + events[n].data.fd) + break; + if (src == NULL){ + rte_spinlock_unlock(&intr_lock); + continue; + } + + /* mark this interrupt source as active and release the lock. 
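+		 * While active is set, rte_intr_callback_unregister() refuses
+		 * to free this source's callbacks (it returns -EAGAIN instead);
+		 * deferred removal is still possible through
+		 * rte_intr_callback_unregister_pending().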
*/ + src->active = 1; + rte_spinlock_unlock(&intr_lock); + + /* set the length to be read dor different handle type */ + switch (src->intr_handle.type) { + case RTE_INTR_HANDLE_UIO: + case RTE_INTR_HANDLE_UIO_INTX: + bytes_read = sizeof(buf.uio_intr_count); + break; + case RTE_INTR_HANDLE_ALARM: + bytes_read = sizeof(buf.timerfd_num); + break; +#ifdef VFIO_PRESENT + case RTE_INTR_HANDLE_VFIO_MSIX: + case RTE_INTR_HANDLE_VFIO_MSI: + case RTE_INTR_HANDLE_VFIO_LEGACY: + bytes_read = sizeof(buf.vfio_intr_count); + break; +#ifdef HAVE_VFIO_DEV_REQ_INTERFACE + case RTE_INTR_HANDLE_VFIO_REQ: + bytes_read = 0; + call = true; + break; +#endif +#endif + case RTE_INTR_HANDLE_VDEV: + case RTE_INTR_HANDLE_EXT: + bytes_read = 0; + call = true; + break; + case RTE_INTR_HANDLE_DEV_EVENT: + bytes_read = 0; + call = true; + break; + default: + bytes_read = 1; + break; + } + + if (bytes_read > 0) { + /** + * read out to clear the ready-to-be-read flag + * for epoll_wait. + */ + bytes_read = read(events[n].data.fd, &buf, bytes_read); + if (bytes_read < 0) { + if (errno == EINTR || errno == EWOULDBLOCK) + continue; + + RTE_LOG(ERR, EAL, "Error reading from file " + "descriptor %d: %s\n", + events[n].data.fd, + strerror(errno)); + /* + * The device is unplugged or buggy, remove + * it as an interrupt source and return to + * force the wait list to be rebuilt. + */ + rte_spinlock_lock(&intr_lock); + TAILQ_REMOVE(&intr_sources, src, next); + rte_spinlock_unlock(&intr_lock); + + for (cb = TAILQ_FIRST(&src->callbacks); cb; + cb = next) { + next = TAILQ_NEXT(cb, next); + TAILQ_REMOVE(&src->callbacks, cb, next); + free(cb); + } + free(src); + return -1; + } else if (bytes_read == 0) + RTE_LOG(ERR, EAL, "Read nothing from file " + "descriptor %d\n", events[n].data.fd); + else + call = true; + } + + /* grab a lock, again to call callbacks and update status. */ + rte_spinlock_lock(&intr_lock); + + if (call) { + + /* Finally, call all callbacks. */ + TAILQ_FOREACH(cb, &src->callbacks, next) { + + /* make a copy and unlock. */ + active_cb = *cb; + rte_spinlock_unlock(&intr_lock); + + /* call the actual callback */ + active_cb.cb_fn(active_cb.cb_arg); + + /*get the lock back. */ + rte_spinlock_lock(&intr_lock); + } + } + /* we done with that interrupt source, release it. */ + src->active = 0; + + rv = 0; + + /* check if any callback are supposed to be removed */ + for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) { + next = TAILQ_NEXT(cb, next); + if (cb->pending_delete) { + TAILQ_REMOVE(&src->callbacks, cb, next); + if (cb->ucb_fn) + cb->ucb_fn(&src->intr_handle, cb->cb_arg); + free(cb); + rv++; + } + } + + /* all callbacks for that source are removed. */ + if (TAILQ_EMPTY(&src->callbacks)) { + TAILQ_REMOVE(&intr_sources, src, next); + free(src); + } + + /* notify the pipe fd waited by epoll_wait to rebuild the wait list */ + if (rv >= 0 && write(intr_pipe.writefd, "1", 1) < 0) { + rte_spinlock_unlock(&intr_lock); + return -EPIPE; + } + + rte_spinlock_unlock(&intr_lock); + } + + return 0; +} + +/** + * It handles all the interrupts. + * + * @param pfd + * epoll file descriptor. + * @param totalfds + * The number of file descriptors added in epoll. 
+ * + * @return + * void + */ +static void +eal_intr_handle_interrupts(int pfd, unsigned totalfds) +{ + struct epoll_event events[totalfds]; + int nfds = 0; + + for(;;) { + nfds = epoll_wait(pfd, events, totalfds, + EAL_INTR_EPOLL_WAIT_FOREVER); + /* epoll_wait fail */ + if (nfds < 0) { + if (errno == EINTR) + continue; + RTE_LOG(ERR, EAL, + "epoll_wait returns with fail\n"); + return; + } + /* epoll_wait timeout, will never happens here */ + else if (nfds == 0) + continue; + /* epoll_wait has at least one fd ready to read */ + if (eal_intr_process_interrupts(events, nfds) < 0) + return; + } +} + +/** + * It builds/rebuilds up the epoll file descriptor with all the + * file descriptors being waited on. Then handles the interrupts. + * + * @param arg + * pointer. (unused) + * + * @return + * never return; + */ +static __attribute__((noreturn)) void * +eal_intr_thread_main(__rte_unused void *arg) +{ + /* host thread, never break out */ + for (;;) { + /* build up the epoll fd with all descriptors we are to + * wait on then pass it to the handle_interrupts function + */ + static struct epoll_event pipe_event = { + .events = EPOLLIN | EPOLLPRI, + }; + struct rte_intr_source *src; + unsigned numfds = 0; + + /* create epoll fd */ + int pfd = epoll_create(1); + if (pfd < 0) + rte_panic("Cannot create epoll instance\n"); + + pipe_event.data.fd = intr_pipe.readfd; + /** + * add pipe fd into wait list, this pipe is used to + * rebuild the wait list. + */ + if (epoll_ctl(pfd, EPOLL_CTL_ADD, intr_pipe.readfd, + &pipe_event) < 0) { + rte_panic("Error adding fd to %d epoll_ctl, %s\n", + intr_pipe.readfd, strerror(errno)); + } + numfds++; + + rte_spinlock_lock(&intr_lock); + + TAILQ_FOREACH(src, &intr_sources, next) { + struct epoll_event ev; + + if (src->callbacks.tqh_first == NULL) + continue; /* skip those with no callbacks */ + memset(&ev, 0, sizeof(ev)); + ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP; + ev.data.fd = src->intr_handle.fd; + + /** + * add all the uio device file descriptor + * into wait list. + */ + if (epoll_ctl(pfd, EPOLL_CTL_ADD, + src->intr_handle.fd, &ev) < 0){ + rte_panic("Error adding fd %d epoll_ctl, %s\n", + src->intr_handle.fd, strerror(errno)); + } + else + numfds++; + } + rte_spinlock_unlock(&intr_lock); + /* serve the interrupt */ + eal_intr_handle_interrupts(pfd, numfds); + + /** + * when we return, we need to rebuild the + * list of fds to monitor. + */ + close(pfd); + } +} + +int +rte_eal_intr_init(void) +{ + int ret = 0; + + /* init the global interrupt source head */ + TAILQ_INIT(&intr_sources); + + /** + * create a pipe which will be waited by epoll and notified to + * rebuild the wait list of epoll. 
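+	 * (This is the classic self-pipe pattern: whenever the source list
+	 * changes, one byte is written to intr_pipe.writefd, epoll_wait()
+	 * wakes up on intr_pipe.readfd and the wait list is rebuilt.)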
+ */ + if (pipe(intr_pipe.pipefd) < 0) { + rte_errno = errno; + return -1; + } + + /* create the host thread to wait/handle the interrupt */ + ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL, + eal_intr_thread_main, NULL); + if (ret != 0) { + rte_errno = -ret; + RTE_LOG(ERR, EAL, + "Failed to create thread for interrupt handling\n"); + } + + return ret; +} + +static void +eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle) +{ + union rte_intr_read_buffer buf; + int bytes_read = 0; + int nbytes; + + switch (intr_handle->type) { + case RTE_INTR_HANDLE_UIO: + case RTE_INTR_HANDLE_UIO_INTX: + bytes_read = sizeof(buf.uio_intr_count); + break; +#ifdef VFIO_PRESENT + case RTE_INTR_HANDLE_VFIO_MSIX: + case RTE_INTR_HANDLE_VFIO_MSI: + case RTE_INTR_HANDLE_VFIO_LEGACY: + bytes_read = sizeof(buf.vfio_intr_count); + break; +#endif + case RTE_INTR_HANDLE_VDEV: + bytes_read = intr_handle->efd_counter_size; + /* For vdev, number of bytes to read is set by driver */ + break; + case RTE_INTR_HANDLE_EXT: + return; + default: + bytes_read = 1; + RTE_LOG(INFO, EAL, "unexpected intr type\n"); + break; + } + + /** + * read out to clear the ready-to-be-read flag + * for epoll_wait. + */ + if (bytes_read == 0) + return; + do { + nbytes = read(fd, &buf, bytes_read); + if (nbytes < 0) { + if (errno == EINTR || errno == EWOULDBLOCK || + errno == EAGAIN) + continue; + RTE_LOG(ERR, EAL, + "Error reading from fd %d: %s\n", + fd, strerror(errno)); + } else if (nbytes == 0) + RTE_LOG(ERR, EAL, "Read nothing from fd %d\n", fd); + return; + } while (1); +} + +static int +eal_epoll_process_event(struct epoll_event *evs, unsigned int n, + struct rte_epoll_event *events) +{ + unsigned int i, count = 0; + struct rte_epoll_event *rev; + + for (i = 0; i < n; i++) { + rev = evs[i].data.ptr; + if (!rev || !rte_atomic32_cmpset(&rev->status, RTE_EPOLL_VALID, + RTE_EPOLL_EXEC)) + continue; + + events[count].status = RTE_EPOLL_VALID; + events[count].fd = rev->fd; + events[count].epfd = rev->epfd; + events[count].epdata.event = rev->epdata.event; + events[count].epdata.data = rev->epdata.data; + if (rev->epdata.cb_fun) + rev->epdata.cb_fun(rev->fd, + rev->epdata.cb_arg); + + rte_compiler_barrier(); + rev->status = RTE_EPOLL_VALID; + count++; + } + return count; +} + +static inline int +eal_init_tls_epfd(void) +{ + int pfd = epoll_create(255); + + if (pfd < 0) { + RTE_LOG(ERR, EAL, + "Cannot create epoll instance\n"); + return -1; + } + return pfd; +} + +int +rte_intr_tls_epfd(void) +{ + if (RTE_PER_LCORE(_epfd) == -1) + RTE_PER_LCORE(_epfd) = eal_init_tls_epfd(); + + return RTE_PER_LCORE(_epfd); +} + +int +rte_epoll_wait(int epfd, struct rte_epoll_event *events, + int maxevents, int timeout) +{ + struct epoll_event evs[maxevents]; + int rc; + + if (!events) { + RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n"); + return -1; + } + + /* using per thread epoll fd */ + if (epfd == RTE_EPOLL_PER_THREAD) + epfd = rte_intr_tls_epfd(); + + while (1) { + rc = epoll_wait(epfd, evs, maxevents, timeout); + if (likely(rc > 0)) { + /* epoll_wait has at least one fd ready to read */ + rc = eal_epoll_process_event(evs, rc, events); + break; + } else if (rc < 0) { + if (errno == EINTR) + continue; + /* epoll_wait fail */ + RTE_LOG(ERR, EAL, "epoll_wait returns with fail %s\n", + strerror(errno)); + rc = -1; + break; + } else { + /* rc == 0, epoll_wait timed out */ + break; + } + } + + return rc; +} + +static inline void +eal_epoll_data_safe_free(struct rte_epoll_event *ev) +{ + while 
(!rte_atomic32_cmpset(&ev->status, RTE_EPOLL_VALID, + RTE_EPOLL_INVALID)) + while (ev->status != RTE_EPOLL_VALID) + rte_pause(); + memset(&ev->epdata, 0, sizeof(ev->epdata)); + ev->fd = -1; + ev->epfd = -1; +} + +int +rte_epoll_ctl(int epfd, int op, int fd, + struct rte_epoll_event *event) +{ + struct epoll_event ev; + + if (!event) { + RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n"); + return -1; + } + + /* using per thread epoll fd */ + if (epfd == RTE_EPOLL_PER_THREAD) + epfd = rte_intr_tls_epfd(); + + if (op == EPOLL_CTL_ADD) { + event->status = RTE_EPOLL_VALID; + event->fd = fd; /* ignore fd in event */ + event->epfd = epfd; + ev.data.ptr = (void *)event; + } + + ev.events = event->epdata.event; + if (epoll_ctl(epfd, op, fd, &ev) < 0) { + RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n", + op, fd, strerror(errno)); + if (op == EPOLL_CTL_ADD) + /* rollback status when CTL_ADD fail */ + event->status = RTE_EPOLL_INVALID; + return -1; + } + + if (op == EPOLL_CTL_DEL && event->status != RTE_EPOLL_INVALID) + eal_epoll_data_safe_free(event); + + return 0; +} + +int +rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd, + int op, unsigned int vec, void *data) +{ + struct rte_epoll_event *rev; + struct rte_epoll_data *epdata; + int epfd_op; + unsigned int efd_idx; + int rc = 0; + + efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ? + (vec - RTE_INTR_VEC_RXTX_OFFSET) : vec; + + if (!intr_handle || intr_handle->nb_efd == 0 || + efd_idx >= intr_handle->nb_efd) { + RTE_LOG(ERR, EAL, "Wrong intr vector number.\n"); + return -EPERM; + } + + switch (op) { + case RTE_INTR_EVENT_ADD: + epfd_op = EPOLL_CTL_ADD; + rev = &intr_handle->elist[efd_idx]; + if (rev->status != RTE_EPOLL_INVALID) { + RTE_LOG(INFO, EAL, "Event already been added.\n"); + return -EEXIST; + } + + /* attach to intr vector fd */ + epdata = &rev->epdata; + epdata->event = EPOLLIN | EPOLLPRI | EPOLLET; + epdata->data = data; + epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr; + epdata->cb_arg = (void *)intr_handle; + rc = rte_epoll_ctl(epfd, epfd_op, + intr_handle->efds[efd_idx], rev); + if (!rc) + RTE_LOG(DEBUG, EAL, + "efd %d associated with vec %d added on epfd %d" + "\n", rev->fd, vec, epfd); + else + rc = -EPERM; + break; + case RTE_INTR_EVENT_DEL: + epfd_op = EPOLL_CTL_DEL; + rev = &intr_handle->elist[efd_idx]; + if (rev->status == RTE_EPOLL_INVALID) { + RTE_LOG(INFO, EAL, "Event does not exist.\n"); + return -EPERM; + } + + rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev); + if (rc) + rc = -EPERM; + break; + default: + RTE_LOG(ERR, EAL, "event op type mismatch\n"); + rc = -EPERM; + } + + return rc; +} + +void +rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle) +{ + uint32_t i; + struct rte_epoll_event *rev; + + for (i = 0; i < intr_handle->nb_efd; i++) { + rev = &intr_handle->elist[i]; + if (rev->status == RTE_EPOLL_INVALID) + continue; + if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) { + /* force free if the entry valid */ + eal_epoll_data_safe_free(rev); + rev->status = RTE_EPOLL_INVALID; + } + } +} + +int +rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd) +{ + uint32_t i; + int fd; + uint32_t n = RTE_MIN(nb_efd, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID); + + assert(nb_efd != 0); + + if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX) { + for (i = 0; i < n; i++) { + fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); + if (fd < 0) { + RTE_LOG(ERR, EAL, + "can't setup eventfd, error %i (%s)\n", + errno, strerror(errno)); + return -errno; + } + intr_handle->efds[i] = 
fd; + } + intr_handle->nb_efd = n; + intr_handle->max_intr = NB_OTHER_INTR + n; + } else if (intr_handle->type == RTE_INTR_HANDLE_VDEV) { + /* only check, initialization would be done in vdev driver.*/ + if (intr_handle->efd_counter_size > + sizeof(union rte_intr_read_buffer)) { + RTE_LOG(ERR, EAL, "the efd_counter_size is oversized"); + return -EINVAL; + } + } else { + intr_handle->efds[0] = intr_handle->fd; + intr_handle->nb_efd = RTE_MIN(nb_efd, 1U); + intr_handle->max_intr = NB_OTHER_INTR; + } + + return 0; +} + +void +rte_intr_efd_disable(struct rte_intr_handle *intr_handle) +{ + uint32_t i; + + rte_intr_free_epoll_fd(intr_handle); + if (intr_handle->max_intr > intr_handle->nb_efd) { + for (i = 0; i < intr_handle->nb_efd; i++) + close(intr_handle->efds[i]); + } + intr_handle->nb_efd = 0; + intr_handle->max_intr = 0; +} + +int +rte_intr_dp_is_en(struct rte_intr_handle *intr_handle) +{ + return !(!intr_handle->nb_efd); +} + +int +rte_intr_allow_others(struct rte_intr_handle *intr_handle) +{ + if (!rte_intr_dp_is_en(intr_handle)) + return 1; + else + return !!(intr_handle->max_intr - intr_handle->nb_efd); +} + +int +rte_intr_cap_multiple(struct rte_intr_handle *intr_handle) +{ + if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX) + return 1; + + if (intr_handle->type == RTE_INTR_HANDLE_VDEV) + return 1; + + return 0; +} + +int rte_thread_is_intr(void) +{ + return pthread_equal(intr_thread, pthread_self()); +} diff --git a/lib/librte_eal/linux/eal_lcore.c b/lib/librte_eal/linux/eal_lcore.c new file mode 100644 index 0000000000..bc8965844c --- /dev/null +++ b/lib/librte_eal/linux/eal_lcore.c @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_filesystem.h" +#include "eal_thread.h" + +#define SYS_CPU_DIR "/sys/devices/system/cpu/cpu%u" +#define CORE_ID_FILE "topology/core_id" +#define NUMA_NODE_PATH "/sys/devices/system/node" + +/* Check if a cpu is present by the presence of the cpu information for it */ +int +eal_cpu_detected(unsigned lcore_id) +{ + char path[PATH_MAX]; + int len = snprintf(path, sizeof(path), SYS_CPU_DIR + "/"CORE_ID_FILE, lcore_id); + if (len <= 0 || (unsigned)len >= sizeof(path)) + return 0; + if (access(path, F_OK) != 0) + return 0; + + return 1; +} + +/* + * Get CPU socket id (NUMA node) for a logical core. + * + * This searches each nodeX directories in /sys for the symlink for the given + * lcore_id and returns the numa node where the lcore is found. If lcore is not + * found on any numa node, returns zero. 
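+ * For example, lcore 5 on socket 1 is detected by the presence of
+ * /sys/devices/system/node/node1/cpu5.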
+ */ +unsigned +eal_cpu_socket_id(unsigned lcore_id) +{ + unsigned socket; + + for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) { + char path[PATH_MAX]; + + snprintf(path, sizeof(path), "%s/node%u/cpu%u", NUMA_NODE_PATH, + socket, lcore_id); + if (access(path, F_OK) == 0) + return socket; + } + return 0; +} + +/* Get the cpu core id value from the /sys/.../cpuX core_id value */ +unsigned +eal_cpu_core_id(unsigned lcore_id) +{ + char path[PATH_MAX]; + unsigned long id; + + int len = snprintf(path, sizeof(path), SYS_CPU_DIR "/%s", lcore_id, CORE_ID_FILE); + if (len <= 0 || (unsigned)len >= sizeof(path)) + goto err; + if (eal_parse_sysfs_value(path, &id) != 0) + goto err; + return (unsigned)id; + +err: + RTE_LOG(ERR, EAL, "Error reading core id value from %s " + "for lcore %u - assuming core 0\n", SYS_CPU_DIR, lcore_id); + return 0; +} diff --git a/lib/librte_eal/linux/eal_log.c b/lib/librte_eal/linux/eal_log.c new file mode 100644 index 0000000000..9d02dddbed --- /dev/null +++ b/lib/librte_eal/linux/eal_log.c @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" + +/* + * default log function + */ +static ssize_t +console_log_write(__attribute__((unused)) void *c, const char *buf, size_t size) +{ + ssize_t ret; + + /* write on stdout */ + ret = fwrite(buf, 1, size, stdout); + fflush(stdout); + + /* Syslog error levels are from 0 to 7, so subtract 1 to convert */ + syslog(rte_log_cur_msg_loglevel() - 1, "%.*s", (int)size, buf); + + return ret; +} + +static cookie_io_functions_t console_log_func = { + .write = console_log_write, +}; + +/* + * set the log to default function, called during eal init process, + * once memzones are available. + */ +int +rte_eal_log_init(const char *id, int facility) +{ + FILE *log_stream; + + log_stream = fopencookie(NULL, "w+", console_log_func); + if (log_stream == NULL) + return -1; + + openlog(id, LOG_NDELAY | LOG_PID, facility); + + eal_log_set_default(log_stream); + + return 0; +} diff --git a/lib/librte_eal/linux/eal_memalloc.c b/lib/librte_eal/linux/eal_memalloc.c new file mode 100644 index 0000000000..af6d0d023a --- /dev/null +++ b/lib/librte_eal/linux/eal_memalloc.c @@ -0,0 +1,1604 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2017-2018 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */ +#include +#define MEMFD_SUPPORTED +#endif +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES +#include +#include +#endif +#include +#include /* for hugetlb-related mmap flags */ + +#include +#include +#include +#include +#include +#include + +#include "eal_filesystem.h" +#include "eal_internal_cfg.h" +#include "eal_memalloc.h" +#include "eal_memcfg.h" +#include "eal_private.h" + +const int anonymous_hugepages_supported = +#ifdef MAP_HUGE_SHIFT + 1; +#define RTE_MAP_HUGE_SHIFT MAP_HUGE_SHIFT +#else + 0; +#define RTE_MAP_HUGE_SHIFT 26 +#endif + +/* + * we've already checked memfd support at compile-time, but we also need to + * check if we can create hugepage files with memfd. 
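+ * (A runtime probe, not shown here, can detect this by attempting a
+ * memfd_create() call with RTE_MFD_HUGETLB and treating EINVAL as
+ * "not supported".)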
+ * + * also, this is not a constant, because while we may be *compiled* with memfd + * hugetlbfs support, we might not be *running* on a system that supports memfd + * and/or memfd with hugetlbfs, so we need to be able to adjust this flag at + * runtime, and fall back to anonymous memory. + */ +static int memfd_create_supported = +#ifdef MFD_HUGETLB + 1; +#define RTE_MFD_HUGETLB MFD_HUGETLB +#else + 0; +#define RTE_MFD_HUGETLB 4U +#endif + +/* + * not all kernel version support fallocate on hugetlbfs, so fall back to + * ftruncate and disallow deallocation if fallocate is not supported. + */ +static int fallocate_supported = -1; /* unknown */ + +/* + * we have two modes - single file segments, and file-per-page mode. + * + * for single-file segments, we use memseg_list_fd to store the segment fd, + * while the fds[] will not be allocated, and len will be set to 0. + * + * for file-per-page mode, each page will have its own fd, so 'memseg_list_fd' + * will be invalid (set to -1), and we'll use 'fds' to keep track of page fd's. + * + * we cannot know how many pages a system will have in advance, but we do know + * that they come in lists, and we know lengths of these lists. so, simply store + * a malloc'd array of fd's indexed by list and segment index. + * + * they will be initialized at startup, and filled as we allocate/deallocate + * segments. + */ +static struct { + int *fds; /**< dynamically allocated array of segment lock fd's */ + int memseg_list_fd; /**< memseg list fd */ + int len; /**< total length of the array */ + int count; /**< entries used in an array */ +} fd_list[RTE_MAX_MEMSEG_LISTS]; + +/** local copy of a memory map, used to synchronize memory hotplug in MP */ +static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS]; + +static sigjmp_buf huge_jmpenv; + +static void __rte_unused huge_sigbus_handler(int signo __rte_unused) +{ + siglongjmp(huge_jmpenv, 1); +} + +/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile, + * non-static local variable in the stack frame calling sigsetjmp might be + * clobbered by a call to longjmp. + */ +static int __rte_unused huge_wrap_sigsetjmp(void) +{ + return sigsetjmp(huge_jmpenv, 1); +} + +static struct sigaction huge_action_old; +static int huge_need_recover; + +static void __rte_unused +huge_register_sigbus(void) +{ + sigset_t mask; + struct sigaction action; + + sigemptyset(&mask); + sigaddset(&mask, SIGBUS); + action.sa_flags = 0; + action.sa_mask = mask; + action.sa_handler = huge_sigbus_handler; + + huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old); +} + +static void __rte_unused +huge_recover_sigbus(void) +{ + if (huge_need_recover) { + sigaction(SIGBUS, &huge_action_old, NULL); + huge_need_recover = 0; + } +} + +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES +static bool +check_numa(void) +{ + bool ret = true; + /* Check if kernel supports NUMA. */ + if (numa_available() != 0) { + RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n"); + ret = false; + } + return ret; +} + +static void +prepare_numa(int *oldpolicy, struct bitmask *oldmask, int socket_id) +{ + RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n"); + if (get_mempolicy(oldpolicy, oldmask->maskp, + oldmask->size + 1, 0, 0) < 0) { + RTE_LOG(ERR, EAL, + "Failed to get current mempolicy: %s. 
" + "Assuming MPOL_DEFAULT.\n", strerror(errno)); + *oldpolicy = MPOL_DEFAULT; + } + RTE_LOG(DEBUG, EAL, + "Setting policy MPOL_PREFERRED for socket %d\n", + socket_id); + numa_set_preferred(socket_id); +} + +static void +restore_numa(int *oldpolicy, struct bitmask *oldmask) +{ + RTE_LOG(DEBUG, EAL, + "Restoring previous memory policy: %d\n", *oldpolicy); + if (*oldpolicy == MPOL_DEFAULT) { + numa_set_localalloc(); + } else if (set_mempolicy(*oldpolicy, oldmask->maskp, + oldmask->size + 1) < 0) { + RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n", + strerror(errno)); + numa_set_localalloc(); + } + numa_free_cpumask(oldmask); +} +#endif + +/* + * uses fstat to report the size of a file on disk + */ +static off_t +get_file_size(int fd) +{ + struct stat st; + if (fstat(fd, &st) < 0) + return 0; + return st.st_size; +} + +static int +pagesz_flags(uint64_t page_sz) +{ + /* as per mmap() manpage, all page sizes are log2 of page size + * shifted by MAP_HUGE_SHIFT + */ + int log2 = rte_log2_u64(page_sz); + return log2 << RTE_MAP_HUGE_SHIFT; +} + +/* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */ +static int lock(int fd, int type) +{ + int ret; + + /* flock may be interrupted */ + do { + ret = flock(fd, type | LOCK_NB); + } while (ret && errno == EINTR); + + if (ret && errno == EWOULDBLOCK) { + /* couldn't lock */ + return 0; + } else if (ret) { + RTE_LOG(ERR, EAL, "%s(): error calling flock(): %s\n", + __func__, strerror(errno)); + return -1; + } + /* lock was successful */ + return 1; +} + +static int +get_seg_memfd(struct hugepage_info *hi __rte_unused, + unsigned int list_idx __rte_unused, + unsigned int seg_idx __rte_unused) +{ +#ifdef MEMFD_SUPPORTED + int fd; + char segname[250]; /* as per manpage, limit is 249 bytes plus null */ + + int flags = RTE_MFD_HUGETLB | pagesz_flags(hi->hugepage_sz); + + if (internal_config.single_file_segments) { + fd = fd_list[list_idx].memseg_list_fd; + + if (fd < 0) { + snprintf(segname, sizeof(segname), "seg_%i", list_idx); + fd = memfd_create(segname, flags); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n", + __func__, strerror(errno)); + return -1; + } + fd_list[list_idx].memseg_list_fd = fd; + } + } else { + fd = fd_list[list_idx].fds[seg_idx]; + + if (fd < 0) { + snprintf(segname, sizeof(segname), "seg_%i-%i", + list_idx, seg_idx); + fd = memfd_create(segname, flags); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n", + __func__, strerror(errno)); + return -1; + } + fd_list[list_idx].fds[seg_idx] = fd; + } + } + return fd; +#endif + return -1; +} + +static int +get_seg_fd(char *path, int buflen, struct hugepage_info *hi, + unsigned int list_idx, unsigned int seg_idx) +{ + int fd; + + /* for in-memory mode, we only make it here when we're sure we support + * memfd, and this is a special case. 
+ */ + if (internal_config.in_memory) + return get_seg_memfd(hi, list_idx, seg_idx); + + if (internal_config.single_file_segments) { + /* create a hugepage file path */ + eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx); + + fd = fd_list[list_idx].memseg_list_fd; + + if (fd < 0) { + fd = open(path, O_CREAT | O_RDWR, 0600); + if (fd < 0) { + RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", + __func__, strerror(errno)); + return -1; + } + /* take out a read lock and keep it indefinitely */ + if (lock(fd, LOCK_SH) < 0) { + RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n", + __func__, strerror(errno)); + close(fd); + return -1; + } + fd_list[list_idx].memseg_list_fd = fd; + } + } else { + /* create a hugepage file path */ + eal_get_hugefile_path(path, buflen, hi->hugedir, + list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx); + + fd = fd_list[list_idx].fds[seg_idx]; + + if (fd < 0) { + fd = open(path, O_CREAT | O_RDWR, 0600); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", + __func__, strerror(errno)); + return -1; + } + /* take out a read lock */ + if (lock(fd, LOCK_SH) < 0) { + RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n", + __func__, strerror(errno)); + close(fd); + return -1; + } + fd_list[list_idx].fds[seg_idx] = fd; + } + } + return fd; +} + +static int +resize_hugefile_in_memory(int fd, uint64_t fa_offset, + uint64_t page_sz, bool grow) +{ + int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_KEEP_SIZE; + int ret; + + /* grow or shrink the file */ + ret = fallocate(fd, flags, fa_offset, page_sz); + + if (ret < 0) { + RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n", + __func__, + strerror(errno)); + return -1; + } + return 0; +} + +static int +resize_hugefile_in_filesystem(int fd, uint64_t fa_offset, uint64_t page_sz, + bool grow) +{ + bool again = false; + + do { + if (fallocate_supported == 0) { + /* we cannot deallocate memory if fallocate() is not + * supported, and hugepage file is already locked at + * creation, so no further synchronization needed. + */ + + if (!grow) { + RTE_LOG(DEBUG, EAL, "%s(): fallocate not supported, not freeing page back to the system\n", + __func__); + return -1; + } + uint64_t new_size = fa_offset + page_sz; + uint64_t cur_size = get_file_size(fd); + + /* fallocate isn't supported, fall back to ftruncate */ + if (new_size > cur_size && + ftruncate(fd, new_size) < 0) { + RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n", + __func__, strerror(errno)); + return -1; + } + } else { + int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_KEEP_SIZE; + int ret; + + /* + * technically, it is perfectly safe for both primary + * and secondary to grow and shrink the page files: + * growing the file repeatedly has no effect because + * a page can only be allocated once, while mmap ensures + * that secondaries hold on to the page even after the + * page itself is removed from the filesystem. + * + * however, leaving growing/shrinking to the primary + * tends to expose bugs in fdlist page count handling, + * so leave this here just in case. 
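+ * (in practice this means a secondary process returns success right away + * below, and only the primary ever grows or shrinks the backing file with + * fallocate().)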
+ */ + if (rte_eal_process_type() != RTE_PROC_PRIMARY) + return 0; + + /* grow or shrink the file */ + ret = fallocate(fd, flags, fa_offset, page_sz); + + if (ret < 0) { + if (fallocate_supported == -1 && + errno == ENOTSUP) { + RTE_LOG(ERR, EAL, "%s(): fallocate() not supported, hugepage deallocation will be disabled\n", + __func__); + again = true; + fallocate_supported = 0; + } else { + RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n", + __func__, + strerror(errno)); + return -1; + } + } else + fallocate_supported = 1; + } + } while (again); + + return 0; +} + +static void +close_hugefile(int fd, char *path, int list_idx) +{ + /* + * primary process must unlink the file, but only when not in in-memory + * mode (as in that case there is no file to unlink). + */ + if (!internal_config.in_memory && + rte_eal_process_type() == RTE_PROC_PRIMARY && + unlink(path)) + RTE_LOG(ERR, EAL, "%s(): unlinking '%s' failed: %s\n", + __func__, path, strerror(errno)); + + close(fd); + fd_list[list_idx].memseg_list_fd = -1; +} + +static int +resize_hugefile(int fd, uint64_t fa_offset, uint64_t page_sz, bool grow) +{ + /* in-memory mode is a special case, because we can be sure that + * fallocate() is supported. + */ + if (internal_config.in_memory) + return resize_hugefile_in_memory(fd, fa_offset, + page_sz, grow); + + return resize_hugefile_in_filesystem(fd, fa_offset, page_sz, + grow); +} + +static int +alloc_seg(struct rte_memseg *ms, void *addr, int socket_id, + struct hugepage_info *hi, unsigned int list_idx, + unsigned int seg_idx) +{ +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + int cur_socket_id = 0; +#endif + uint64_t map_offset; + rte_iova_t iova; + void *va; + char path[PATH_MAX]; + int ret = 0; + int fd; + size_t alloc_sz; + int flags; + void *new_addr; + + alloc_sz = hi->hugepage_sz; + + /* these are checked at init, but code analyzers don't know that */ + if (internal_config.in_memory && !anonymous_hugepages_supported) { + RTE_LOG(ERR, EAL, "Anonymous hugepages not supported, in-memory mode cannot allocate memory\n"); + return -1; + } + if (internal_config.in_memory && !memfd_create_supported && + internal_config.single_file_segments) { + RTE_LOG(ERR, EAL, "Single-file segments are not supported without memfd support\n"); + return -1; + } + + /* in-memory without memfd is a special case */ + int mmap_flags; + + if (internal_config.in_memory && !memfd_create_supported) { + const int in_memory_flags = MAP_HUGETLB | MAP_FIXED | + MAP_PRIVATE | MAP_ANONYMOUS; + int pagesz_flag; + + pagesz_flag = pagesz_flags(alloc_sz); + fd = -1; + mmap_flags = in_memory_flags | pagesz_flag; + + /* single-file segments codepath will never be active + * here because in-memory mode is incompatible with the + * fallback path, and it's stopped at EAL initialization + * stage. 
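+ * since the mapping is anonymous there is no backing file at all, which is + * why fd stays at -1 above and map_offset is forced to 0 below.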
+ */ + map_offset = 0; + } else { + /* takes out a read lock on segment or segment list */ + fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx); + if (fd < 0) { + RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n"); + return -1; + } + + if (internal_config.single_file_segments) { + map_offset = seg_idx * alloc_sz; + ret = resize_hugefile(fd, map_offset, alloc_sz, true); + if (ret < 0) + goto resized; + + fd_list[list_idx].count++; + } else { + map_offset = 0; + if (ftruncate(fd, alloc_sz) < 0) { + RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n", + __func__, strerror(errno)); + goto resized; + } + if (internal_config.hugepage_unlink && + !internal_config.in_memory) { + if (unlink(path)) { + RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n", + __func__, strerror(errno)); + goto resized; + } + } + } + mmap_flags = MAP_SHARED | MAP_POPULATE | MAP_FIXED; + } + + /* + * map the segment, and populate page tables, the kernel fills + * this segment with zeros if it's a new page. + */ + va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, mmap_flags, fd, + map_offset); + + if (va == MAP_FAILED) { + RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__, + strerror(errno)); + /* mmap failed, but the previous region might have been + * unmapped anyway. try to remap it + */ + goto unmapped; + } + if (va != addr) { + RTE_LOG(DEBUG, EAL, "%s(): wrong mmap() address\n", __func__); + munmap(va, alloc_sz); + goto resized; + } + + /* In linux, hugetlb limitations, like cgroup, are + * enforced at fault time instead of mmap(), even + * with the option of MAP_POPULATE. Kernel will send + * a SIGBUS signal. To avoid to be killed, save stack + * environment here, if SIGBUS happens, we can jump + * back here. + */ + if (huge_wrap_sigsetjmp()) { + RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more hugepages of size %uMB\n", + (unsigned int)(alloc_sz >> 20)); + goto mapped; + } + + /* we need to trigger a write to the page to enforce page fault and + * ensure that page is accessible to us, but we can't overwrite value + * that is already there, so read the old value, and write itback. + * kernel populates the page with zeroes initially. + */ + *(volatile int *)addr = *(volatile int *)addr; + + iova = rte_mem_virt2iova(addr); + if (iova == RTE_BAD_PHYS_ADDR) { + RTE_LOG(DEBUG, EAL, "%s(): can't get IOVA addr\n", + __func__); + goto mapped; + } + +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + ret = get_mempolicy(&cur_socket_id, NULL, 0, addr, + MPOL_F_NODE | MPOL_F_ADDR); + if (ret < 0) { + RTE_LOG(DEBUG, EAL, "%s(): get_mempolicy: %s\n", + __func__, strerror(errno)); + goto mapped; + } else if (cur_socket_id != socket_id) { + RTE_LOG(DEBUG, EAL, + "%s(): allocation happened on wrong socket (wanted %d, got %d)\n", + __func__, socket_id, cur_socket_id); + goto mapped; + } +#else + if (rte_socket_count() > 1) + RTE_LOG(DEBUG, EAL, "%s(): not checking hugepage NUMA node.\n", + __func__); +#endif + + ms->addr = addr; + ms->hugepage_sz = alloc_sz; + ms->len = alloc_sz; + ms->nchannel = rte_memory_get_nchannel(); + ms->nrank = rte_memory_get_nrank(); + ms->iova = iova; + ms->socket_id = socket_id; + + return 0; + +mapped: + munmap(addr, alloc_sz); +unmapped: + flags = MAP_FIXED; + new_addr = eal_get_virtual_area(addr, &alloc_sz, alloc_sz, 0, flags); + if (new_addr != addr) { + if (new_addr != NULL) + munmap(new_addr, alloc_sz); + /* we're leaving a hole in our virtual address space. if + * somebody else maps this hole now, we could accidentally + * override it in the future. 
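+ * there is no clean way to recover from that, hence the CRIT-level log + * below rather than a silent failure.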
+ */ + RTE_LOG(CRIT, EAL, "Can't mmap holes in our virtual address space\n"); + } + /* roll back the ref count */ + if (internal_config.single_file_segments) + fd_list[list_idx].count--; +resized: + /* some codepaths will return negative fd, so exit early */ + if (fd < 0) + return -1; + + if (internal_config.single_file_segments) { + resize_hugefile(fd, map_offset, alloc_sz, false); + /* ignore failure, can't make it any worse */ + + /* if refcount is at zero, close the file */ + if (fd_list[list_idx].count == 0) + close_hugefile(fd, path, list_idx); + } else { + /* only remove file if we can take out a write lock */ + if (internal_config.hugepage_unlink == 0 && + internal_config.in_memory == 0 && + lock(fd, LOCK_EX) == 1) + unlink(path); + close(fd); + fd_list[list_idx].fds[seg_idx] = -1; + } + return -1; +} + +static int +free_seg(struct rte_memseg *ms, struct hugepage_info *hi, + unsigned int list_idx, unsigned int seg_idx) +{ + uint64_t map_offset; + char path[PATH_MAX]; + int fd, ret = 0; + bool exit_early; + + /* erase page data */ + memset(ms->addr, 0, ms->len); + + if (mmap(ms->addr, ms->len, PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) == + MAP_FAILED) { + RTE_LOG(DEBUG, EAL, "couldn't unmap page\n"); + return -1; + } + + exit_early = false; + + /* if we're using anonymous hugepages, nothing to be done */ + if (internal_config.in_memory && !memfd_create_supported) + exit_early = true; + + /* if we've already unlinked the page, nothing needs to be done */ + if (!internal_config.in_memory && internal_config.hugepage_unlink) + exit_early = true; + + if (exit_early) { + memset(ms, 0, sizeof(*ms)); + return 0; + } + + /* if we are not in single file segments mode, we're going to unmap the + * segment and thus drop the lock on original fd, but hugepage dir is + * now locked so we can take out another one without races. + */ + fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx); + if (fd < 0) + return -1; + + if (internal_config.single_file_segments) { + map_offset = seg_idx * ms->len; + if (resize_hugefile(fd, map_offset, ms->len, false)) + return -1; + + if (--(fd_list[list_idx].count) == 0) + close_hugefile(fd, path, list_idx); + + ret = 0; + } else { + /* if we're able to take out a write lock, we're the last one + * holding onto this page. + */ + if (!internal_config.in_memory) { + ret = lock(fd, LOCK_EX); + if (ret >= 0) { + /* no one else is using this page */ + if (ret == 1) + unlink(path); + } + } + /* closing fd will drop the lock */ + close(fd); + fd_list[list_idx].fds[seg_idx] = -1; + } + + memset(ms, 0, sizeof(*ms)); + + return ret < 0 ? 
-1 : 0; +} + +struct alloc_walk_param { + struct hugepage_info *hi; + struct rte_memseg **ms; + size_t page_sz; + unsigned int segs_allocated; + unsigned int n_segs; + int socket; + bool exact; +}; +static int +alloc_seg_walk(const struct rte_memseg_list *msl, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct alloc_walk_param *wa = arg; + struct rte_memseg_list *cur_msl; + size_t page_sz; + int cur_idx, start_idx, j, dir_fd = -1; + unsigned int msl_idx, need, i; + + if (msl->page_sz != wa->page_sz) + return 0; + if (msl->socket_id != wa->socket) + return 0; + + page_sz = (size_t)msl->page_sz; + + msl_idx = msl - mcfg->memsegs; + cur_msl = &mcfg->memsegs[msl_idx]; + + need = wa->n_segs; + + /* try finding space in memseg list */ + if (wa->exact) { + /* if we require exact number of pages in a list, find them */ + cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0, + need); + if (cur_idx < 0) + return 0; + start_idx = cur_idx; + } else { + int cur_len; + + /* we don't require exact number of pages, so we're going to go + * for best-effort allocation. that means finding the biggest + * unused block, and going with that. + */ + cur_idx = rte_fbarray_find_biggest_free(&cur_msl->memseg_arr, + 0); + if (cur_idx < 0) + return 0; + start_idx = cur_idx; + /* adjust the size to possibly be smaller than original + * request, but do not allow it to be bigger. + */ + cur_len = rte_fbarray_find_contig_free(&cur_msl->memseg_arr, + cur_idx); + need = RTE_MIN(need, (unsigned int)cur_len); + } + + /* do not allow any page allocations during the time we're allocating, + * because file creation and locking operations are not atomic, + * and we might be the first or the last ones to use a particular page, + * so we need to ensure atomicity of every operation. + * + * during init, we already hold a write lock, so don't try to take out + * another one. + */ + if (wa->hi->lock_descriptor == -1 && !internal_config.in_memory) { + dir_fd = open(wa->hi->hugedir, O_RDONLY); + if (dir_fd < 0) { + RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", + __func__, wa->hi->hugedir, strerror(errno)); + return -1; + } + /* blocking writelock */ + if (flock(dir_fd, LOCK_EX)) { + RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", + __func__, wa->hi->hugedir, strerror(errno)); + close(dir_fd); + return -1; + } + } + + for (i = 0; i < need; i++, cur_idx++) { + struct rte_memseg *cur; + void *map_addr; + + cur = rte_fbarray_get(&cur_msl->memseg_arr, cur_idx); + map_addr = RTE_PTR_ADD(cur_msl->base_va, + cur_idx * page_sz); + + if (alloc_seg(cur, map_addr, wa->socket, wa->hi, + msl_idx, cur_idx)) { + RTE_LOG(DEBUG, EAL, "attempted to allocate %i segments, but only %i were allocated\n", + need, i); + + /* if exact number wasn't requested, stop */ + if (!wa->exact) + goto out; + + /* clean up */ + for (j = start_idx; j < cur_idx; j++) { + struct rte_memseg *tmp; + struct rte_fbarray *arr = + &cur_msl->memseg_arr; + + tmp = rte_fbarray_get(arr, j); + rte_fbarray_set_free(arr, j); + + /* free_seg may attempt to create a file, which + * may fail. 
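+ * such a failure is only reported at DEBUG level and the rollback keeps + * going, so that as many of the partially allocated pages as possible are + * returned to the free list.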
+ */ + if (free_seg(tmp, wa->hi, msl_idx, j)) + RTE_LOG(DEBUG, EAL, "Cannot free page\n"); + } + /* clear the list */ + if (wa->ms) + memset(wa->ms, 0, sizeof(*wa->ms) * wa->n_segs); + + if (dir_fd >= 0) + close(dir_fd); + return -1; + } + if (wa->ms) + wa->ms[i] = cur; + + rte_fbarray_set_used(&cur_msl->memseg_arr, cur_idx); + } +out: + wa->segs_allocated = i; + if (i > 0) + cur_msl->version++; + if (dir_fd >= 0) + close(dir_fd); + /* if we didn't allocate any segments, move on to the next list */ + return i > 0; +} + +struct free_walk_param { + struct hugepage_info *hi; + struct rte_memseg *ms; +}; +static int +free_seg_walk(const struct rte_memseg_list *msl, void *arg) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *found_msl; + struct free_walk_param *wa = arg; + uintptr_t start_addr, end_addr; + int msl_idx, seg_idx, ret, dir_fd = -1; + + start_addr = (uintptr_t) msl->base_va; + end_addr = start_addr + msl->len; + + if ((uintptr_t)wa->ms->addr < start_addr || + (uintptr_t)wa->ms->addr >= end_addr) + return 0; + + msl_idx = msl - mcfg->memsegs; + seg_idx = RTE_PTR_DIFF(wa->ms->addr, start_addr) / msl->page_sz; + + /* msl is const */ + found_msl = &mcfg->memsegs[msl_idx]; + + /* do not allow any page allocations during the time we're freeing, + * because file creation and locking operations are not atomic, + * and we might be the first or the last ones to use a particular page, + * so we need to ensure atomicity of every operation. + * + * during init, we already hold a write lock, so don't try to take out + * another one. + */ + if (wa->hi->lock_descriptor == -1 && !internal_config.in_memory) { + dir_fd = open(wa->hi->hugedir, O_RDONLY); + if (dir_fd < 0) { + RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", + __func__, wa->hi->hugedir, strerror(errno)); + return -1; + } + /* blocking writelock */ + if (flock(dir_fd, LOCK_EX)) { + RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", + __func__, wa->hi->hugedir, strerror(errno)); + close(dir_fd); + return -1; + } + } + + found_msl->version++; + + rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx); + + ret = free_seg(wa->ms, wa->hi, msl_idx, seg_idx); + + if (dir_fd >= 0) + close(dir_fd); + + if (ret < 0) + return -1; + + return 1; +} + +int +eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz, + int socket, bool exact) +{ + int i, ret = -1; +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + bool have_numa = false; + int oldpolicy; + struct bitmask *oldmask; +#endif + struct alloc_walk_param wa; + struct hugepage_info *hi = NULL; + + memset(&wa, 0, sizeof(wa)); + + /* dynamic allocation not supported in legacy mode */ + if (internal_config.legacy_mem) + return -1; + + for (i = 0; i < (int) RTE_DIM(internal_config.hugepage_info); i++) { + if (page_sz == + internal_config.hugepage_info[i].hugepage_sz) { + hi = &internal_config.hugepage_info[i]; + break; + } + } + if (!hi) { + RTE_LOG(ERR, EAL, "%s(): can't find relevant hugepage_info entry\n", + __func__); + return -1; + } + +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + if (check_numa()) { + oldmask = numa_allocate_nodemask(); + prepare_numa(&oldpolicy, oldmask, socket); + have_numa = true; + } +#endif + + wa.exact = exact; + wa.hi = hi; + wa.ms = ms; + wa.n_segs = n_segs; + wa.page_sz = page_sz; + wa.socket = socket; + wa.segs_allocated = 0; + + /* memalloc is locked, so it's safe to use thread-unsafe version */ + ret = rte_memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa); + if (ret == 0) { + RTE_LOG(ERR, EAL, "%s(): 
couldn't find suitable memseg_list\n", + __func__); + ret = -1; + } else if (ret > 0) { + ret = (int)wa.segs_allocated; + } + +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + if (have_numa) + restore_numa(&oldpolicy, oldmask); +#endif + return ret; +} + +struct rte_memseg * +eal_memalloc_alloc_seg(size_t page_sz, int socket) +{ + struct rte_memseg *ms; + if (eal_memalloc_alloc_seg_bulk(&ms, 1, page_sz, socket, true) < 0) + return NULL; + /* return pointer to newly allocated memseg */ + return ms; +} + +int +eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs) +{ + int seg, ret = 0; + + /* dynamic free not supported in legacy mode */ + if (internal_config.legacy_mem) + return -1; + + for (seg = 0; seg < n_segs; seg++) { + struct rte_memseg *cur = ms[seg]; + struct hugepage_info *hi = NULL; + struct free_walk_param wa; + int i, walk_res; + + /* if this page is marked as unfreeable, fail */ + if (cur->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) { + RTE_LOG(DEBUG, EAL, "Page is not allowed to be freed\n"); + ret = -1; + continue; + } + + memset(&wa, 0, sizeof(wa)); + + for (i = 0; i < (int)RTE_DIM(internal_config.hugepage_info); + i++) { + hi = &internal_config.hugepage_info[i]; + if (cur->hugepage_sz == hi->hugepage_sz) + break; + } + if (i == (int)RTE_DIM(internal_config.hugepage_info)) { + RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n"); + ret = -1; + continue; + } + + wa.ms = cur; + wa.hi = hi; + + /* memalloc is locked, so it's safe to use thread-unsafe version + */ + walk_res = rte_memseg_list_walk_thread_unsafe(free_seg_walk, + &wa); + if (walk_res == 1) + continue; + if (walk_res == 0) + RTE_LOG(ERR, EAL, "Couldn't find memseg list\n"); + ret = -1; + } + return ret; +} + +int +eal_memalloc_free_seg(struct rte_memseg *ms) +{ + /* dynamic free not supported in legacy mode */ + if (internal_config.legacy_mem) + return -1; + + return eal_memalloc_free_seg_bulk(&ms, 1); +} + +static int +sync_chunk(struct rte_memseg_list *primary_msl, + struct rte_memseg_list *local_msl, struct hugepage_info *hi, + unsigned int msl_idx, bool used, int start, int end) +{ + struct rte_fbarray *l_arr, *p_arr; + int i, ret, chunk_len, diff_len; + + l_arr = &local_msl->memseg_arr; + p_arr = &primary_msl->memseg_arr; + + /* we need to aggregate allocations/deallocations into bigger chunks, + * as we don't want to spam the user with per-page callbacks. + * + * to avoid any potential issues, we also want to trigger + * deallocation callbacks *before* we actually deallocate + * memory, so that the user application could wrap up its use + * before it goes away. + */ + + chunk_len = end - start; + + /* find how many contiguous pages we can map/unmap for this chunk */ + diff_len = used ? 
+ rte_fbarray_find_contig_free(l_arr, start) : + rte_fbarray_find_contig_used(l_arr, start); + + /* has to be at least one page */ + if (diff_len < 1) + return -1; + + diff_len = RTE_MIN(chunk_len, diff_len); + + /* if we are freeing memory, notify the application */ + if (!used) { + struct rte_memseg *ms; + void *start_va; + size_t len, page_sz; + + ms = rte_fbarray_get(l_arr, start); + start_va = ms->addr; + page_sz = (size_t)primary_msl->page_sz; + len = page_sz * diff_len; + + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE, + start_va, len); + } + + for (i = 0; i < diff_len; i++) { + struct rte_memseg *p_ms, *l_ms; + int seg_idx = start + i; + + l_ms = rte_fbarray_get(l_arr, seg_idx); + p_ms = rte_fbarray_get(p_arr, seg_idx); + + if (l_ms == NULL || p_ms == NULL) + return -1; + + if (used) { + ret = alloc_seg(l_ms, p_ms->addr, + p_ms->socket_id, hi, + msl_idx, seg_idx); + if (ret < 0) + return -1; + rte_fbarray_set_used(l_arr, seg_idx); + } else { + ret = free_seg(l_ms, hi, msl_idx, seg_idx); + rte_fbarray_set_free(l_arr, seg_idx); + if (ret < 0) + return -1; + } + } + + /* if we just allocated memory, notify the application */ + if (used) { + struct rte_memseg *ms; + void *start_va; + size_t len, page_sz; + + ms = rte_fbarray_get(l_arr, start); + start_va = ms->addr; + page_sz = (size_t)primary_msl->page_sz; + len = page_sz * diff_len; + + eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC, + start_va, len); + } + + /* calculate how much we can advance until next chunk */ + diff_len = used ? + rte_fbarray_find_contig_used(l_arr, start) : + rte_fbarray_find_contig_free(l_arr, start); + ret = RTE_MIN(chunk_len, diff_len); + + return ret; +} + +static int +sync_status(struct rte_memseg_list *primary_msl, + struct rte_memseg_list *local_msl, struct hugepage_info *hi, + unsigned int msl_idx, bool used) +{ + struct rte_fbarray *l_arr, *p_arr; + int p_idx, l_chunk_len, p_chunk_len, ret; + int start, end; + + /* this is a little bit tricky, but the basic idea is - walk both lists + * and spot any places where there are discrepancies. walking both lists + * and noting discrepancies in a single go is a hard problem, so we do + * it in two passes - first we spot any places where allocated segments + * mismatch (i.e. ensure that everything that's allocated in the primary + * is also allocated in the secondary), and then we do it by looking at + * free segments instead. + * + * we also need to aggregate changes into chunks, as we have to call + * callbacks per allocation, not per page. + */ + l_arr = &local_msl->memseg_arr; + p_arr = &primary_msl->memseg_arr; + + if (used) + p_idx = rte_fbarray_find_next_used(p_arr, 0); + else + p_idx = rte_fbarray_find_next_free(p_arr, 0); + + while (p_idx >= 0) { + int next_chunk_search_idx; + + if (used) { + p_chunk_len = rte_fbarray_find_contig_used(p_arr, + p_idx); + l_chunk_len = rte_fbarray_find_contig_used(l_arr, + p_idx); + } else { + p_chunk_len = rte_fbarray_find_contig_free(p_arr, + p_idx); + l_chunk_len = rte_fbarray_find_contig_free(l_arr, + p_idx); + } + /* best case scenario - no differences (or bigger, which will be + * fixed during next iteration), look for next chunk + */ + if (l_chunk_len >= p_chunk_len) { + next_chunk_search_idx = p_idx + p_chunk_len; + goto next_chunk; + } + + /* if both chunks start at the same point, skip parts we know + * are identical, and sync the rest. each call to sync_chunk + * will only sync contiguous segments, so we need to call this + * until we are sure there are no more differences in this + * chunk. 
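+ * the do/while loop below does exactly that: each call returns how far it + * advanced, 'start' is bumped by that amount, and we stop once 'start' + * reaches 'end' or a negative (error) value comes back.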
+ */ + start = p_idx + l_chunk_len; + end = p_idx + p_chunk_len; + do { + ret = sync_chunk(primary_msl, local_msl, hi, msl_idx, + used, start, end); + start += ret; + } while (start < end && ret >= 0); + /* if ret is negative, something went wrong */ + if (ret < 0) + return -1; + + next_chunk_search_idx = p_idx + p_chunk_len; +next_chunk: + /* skip to end of this chunk */ + if (used) { + p_idx = rte_fbarray_find_next_used(p_arr, + next_chunk_search_idx); + } else { + p_idx = rte_fbarray_find_next_free(p_arr, + next_chunk_search_idx); + } + } + return 0; +} + +static int +sync_existing(struct rte_memseg_list *primary_msl, + struct rte_memseg_list *local_msl, struct hugepage_info *hi, + unsigned int msl_idx) +{ + int ret, dir_fd; + + /* do not allow any page allocations during the time we're allocating, + * because file creation and locking operations are not atomic, + * and we might be the first or the last ones to use a particular page, + * so we need to ensure atomicity of every operation. + */ + dir_fd = open(hi->hugedir, O_RDONLY); + if (dir_fd < 0) { + RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", __func__, + hi->hugedir, strerror(errno)); + return -1; + } + /* blocking writelock */ + if (flock(dir_fd, LOCK_EX)) { + RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", __func__, + hi->hugedir, strerror(errno)); + close(dir_fd); + return -1; + } + + /* ensure all allocated space is the same in both lists */ + ret = sync_status(primary_msl, local_msl, hi, msl_idx, true); + if (ret < 0) + goto fail; + + /* ensure all unallocated space is the same in both lists */ + ret = sync_status(primary_msl, local_msl, hi, msl_idx, false); + if (ret < 0) + goto fail; + + /* update version number */ + local_msl->version = primary_msl->version; + + close(dir_fd); + + return 0; +fail: + close(dir_fd); + return -1; +} + +static int +sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *primary_msl, *local_msl; + struct hugepage_info *hi = NULL; + unsigned int i; + int msl_idx; + + if (msl->external) + return 0; + + msl_idx = msl - mcfg->memsegs; + primary_msl = &mcfg->memsegs[msl_idx]; + local_msl = &local_memsegs[msl_idx]; + + for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) { + uint64_t cur_sz = + internal_config.hugepage_info[i].hugepage_sz; + uint64_t msl_sz = primary_msl->page_sz; + if (msl_sz == cur_sz) { + hi = &internal_config.hugepage_info[i]; + break; + } + } + if (!hi) { + RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n"); + return -1; + } + + /* if versions don't match, synchronize everything */ + if (local_msl->version != primary_msl->version && + sync_existing(primary_msl, local_msl, hi, msl_idx)) + return -1; + return 0; +} + + +int +eal_memalloc_sync_with_primary(void) +{ + /* nothing to be done in primary */ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) + return 0; + + /* memalloc is locked, so it's safe to call thread-unsafe version */ + if (rte_memseg_list_walk_thread_unsafe(sync_walk, NULL)) + return -1; + return 0; +} + +static int +secondary_msl_create_walk(const struct rte_memseg_list *msl, + void *arg __rte_unused) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *primary_msl, *local_msl; + char name[PATH_MAX]; + int msl_idx, ret; + + if (msl->external) + return 0; + + msl_idx = msl - mcfg->memsegs; + primary_msl = &mcfg->memsegs[msl_idx]; + local_msl = &local_memsegs[msl_idx]; + + /* 
create distinct fbarrays for each secondary */ + snprintf(name, RTE_FBARRAY_NAME_LEN, "%s_%i", + primary_msl->memseg_arr.name, getpid()); + + ret = rte_fbarray_init(&local_msl->memseg_arr, name, + primary_msl->memseg_arr.len, + primary_msl->memseg_arr.elt_sz); + if (ret < 0) { + RTE_LOG(ERR, EAL, "Cannot initialize local memory map\n"); + return -1; + } + local_msl->base_va = primary_msl->base_va; + local_msl->len = primary_msl->len; + + return 0; +} + +static int +alloc_list(int list_idx, int len) +{ + int *data; + int i; + + /* single-file segments mode does not need fd list */ + if (!internal_config.single_file_segments) { + /* ensure we have space to store fd per each possible segment */ + data = malloc(sizeof(int) * len); + if (data == NULL) { + RTE_LOG(ERR, EAL, "Unable to allocate space for file descriptors\n"); + return -1; + } + /* set all fd's as invalid */ + for (i = 0; i < len; i++) + data[i] = -1; + fd_list[list_idx].fds = data; + fd_list[list_idx].len = len; + } else { + fd_list[list_idx].fds = NULL; + fd_list[list_idx].len = 0; + } + + fd_list[list_idx].count = 0; + fd_list[list_idx].memseg_list_fd = -1; + + return 0; +} + +static int +fd_list_create_walk(const struct rte_memseg_list *msl, + void *arg __rte_unused) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + unsigned int len; + int msl_idx; + + if (msl->external) + return 0; + + msl_idx = msl - mcfg->memsegs; + len = msl->memseg_arr.len; + + return alloc_list(msl_idx, len); +} + +int +eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + + /* single file segments mode doesn't support individual segment fd's */ + if (internal_config.single_file_segments) + return -ENOTSUP; + + /* if list is not allocated, allocate it */ + if (fd_list[list_idx].len == 0) { + int len = mcfg->memsegs[list_idx].memseg_arr.len; + + if (alloc_list(list_idx, len) < 0) + return -ENOMEM; + } + fd_list[list_idx].fds[seg_idx] = fd; + + return 0; +} + +int +eal_memalloc_set_seg_list_fd(int list_idx, int fd) +{ + /* non-single file segment mode doesn't support segment list fd's */ + if (!internal_config.single_file_segments) + return -ENOTSUP; + + fd_list[list_idx].memseg_list_fd = fd; + + return 0; +} + +int +eal_memalloc_get_seg_fd(int list_idx, int seg_idx) +{ + int fd; + + if (internal_config.in_memory || internal_config.no_hugetlbfs) { +#ifndef MEMFD_SUPPORTED + /* in in-memory or no-huge mode, we rely on memfd support */ + return -ENOTSUP; +#endif + /* memfd supported, but hugetlbfs memfd may not be */ + if (!internal_config.no_hugetlbfs && !memfd_create_supported) + return -ENOTSUP; + } + + if (internal_config.single_file_segments) { + fd = fd_list[list_idx].memseg_list_fd; + } else if (fd_list[list_idx].len == 0) { + /* list not initialized */ + fd = -1; + } else { + fd = fd_list[list_idx].fds[seg_idx]; + } + if (fd < 0) + return -ENODEV; + return fd; +} + +static int +test_memfd_create(void) +{ +#ifdef MEMFD_SUPPORTED + unsigned int i; + for (i = 0; i < internal_config.num_hugepage_sizes; i++) { + uint64_t pagesz = internal_config.hugepage_info[i].hugepage_sz; + int pagesz_flag = pagesz_flags(pagesz); + int flags; + + flags = pagesz_flag | RTE_MFD_HUGETLB; + int fd = memfd_create("test", flags); + if (fd < 0) { + /* we failed - let memalloc know this isn't working */ + if (errno == EINVAL) { + memfd_create_supported = 0; + return 0; /* not supported */ + } + + /* we got other error - something's wrong */ + return -1; /* error */ + 
} + close(fd); + return 1; /* supported */ + } +#endif + return 0; /* not supported */ +} + +int +eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + + if (internal_config.in_memory || internal_config.no_hugetlbfs) { +#ifndef MEMFD_SUPPORTED + /* in in-memory or no-huge mode, we rely on memfd support */ + return -ENOTSUP; +#endif + /* memfd supported, but hugetlbfs memfd may not be */ + if (!internal_config.no_hugetlbfs && !memfd_create_supported) + return -ENOTSUP; + } + + if (internal_config.single_file_segments) { + size_t pgsz = mcfg->memsegs[list_idx].page_sz; + + /* segment not active? */ + if (fd_list[list_idx].memseg_list_fd < 0) + return -ENOENT; + *offset = pgsz * seg_idx; + } else { + /* fd_list not initialized? */ + if (fd_list[list_idx].len == 0) + return -ENODEV; + + /* segment not active? */ + if (fd_list[list_idx].fds[seg_idx] < 0) + return -ENOENT; + *offset = 0; + } + return 0; +} + +int +eal_memalloc_init(void) +{ + if (rte_eal_process_type() == RTE_PROC_SECONDARY) + if (rte_memseg_list_walk(secondary_msl_create_walk, NULL) < 0) + return -1; + if (rte_eal_process_type() == RTE_PROC_PRIMARY && + internal_config.in_memory) { + int mfd_res = test_memfd_create(); + + if (mfd_res < 0) { + RTE_LOG(ERR, EAL, "Unable to check if memfd is supported\n"); + return -1; + } + if (mfd_res == 1) + RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n"); + else + RTE_LOG(INFO, EAL, "Using memfd is not supported, falling back to anonymous hugepages\n"); + + /* we only support single-file segments mode with in-memory mode + * if we support hugetlbfs with memfd_create. this code will + * test if we do. + */ + if (internal_config.single_file_segments && + mfd_res != 1) { + RTE_LOG(ERR, EAL, "Single-file segments mode cannot be used without memfd support\n"); + return -1; + } + /* this cannot ever happen but better safe than sorry */ + if (!anonymous_hugepages_supported) { + RTE_LOG(ERR, EAL, "Using anonymous memory is not supported\n"); + return -1; + } + } + + /* initialize all of the fd lists */ + if (rte_memseg_list_walk(fd_list_create_walk, NULL)) + return -1; + return 0; +} diff --git a/lib/librte_eal/linux/eal_memory.c b/lib/librte_eal/linux/eal_memory.c new file mode 100644 index 0000000000..7a9c97ff88 --- /dev/null +++ b/lib/librte_eal/linux/eal_memory.c @@ -0,0 +1,2481 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright(c) 2013 6WIND S.A. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */ +#include +#define MEMFD_SUPPORTED +#endif +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES +#include +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_memalloc.h" +#include "eal_memcfg.h" +#include "eal_internal_cfg.h" +#include "eal_filesystem.h" +#include "eal_hugepages.h" +#include "eal_options.h" + +#define PFN_MASK_SIZE 8 + +/** + * @file + * Huge page mapping under linux + * + * To reserve a big contiguous amount of memory, we use the hugepage + * feature of linux. For that, we need to have hugetlbfs mounted. This + * code will create many files in this directory (one per page) and + * map them in virtual memory. 
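+ * (the directory in question is the hugetlbfs mount point discovered at + * startup, typically something like /dev/hugepages or a user-provided mount.)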
For each page, we will retrieve its + * physical address and remap it in order to have a virtual contiguous + * zone as well as a physical contiguous zone. + */ + +static int phys_addrs_available = -1; + +#define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space" + +uint64_t eal_get_baseaddr(void) +{ + /* + * Linux kernel uses a really high address as starting address for + * serving mmaps calls. If there exists addressing limitations and IOVA + * mode is VA, this starting address is likely too high for those + * devices. However, it is possible to use a lower address in the + * process virtual address space as with 64 bits there is a lot of + * available space. + * + * Current known limitations are 39 or 40 bits. Setting the starting + * address at 4GB implies there are 508GB or 1020GB for mapping the + * available hugepages. This is likely enough for most systems, although + * a device with addressing limitations should call + * rte_mem_check_dma_mask for ensuring all memory is within supported + * range. + */ + return 0x100000000ULL; +} + +/* + * Get physical address of any mapped virtual address in the current process. + */ +phys_addr_t +rte_mem_virt2phy(const void *virtaddr) +{ + int fd, retval; + uint64_t page, physaddr; + unsigned long virt_pfn; + int page_size; + off_t offset; + + if (phys_addrs_available == 0) + return RTE_BAD_IOVA; + + /* standard page size */ + page_size = getpagesize(); + + fd = open("/proc/self/pagemap", O_RDONLY); + if (fd < 0) { + RTE_LOG(INFO, EAL, "%s(): cannot open /proc/self/pagemap: %s\n", + __func__, strerror(errno)); + return RTE_BAD_IOVA; + } + + virt_pfn = (unsigned long)virtaddr / page_size; + offset = sizeof(uint64_t) * virt_pfn; + if (lseek(fd, offset, SEEK_SET) == (off_t) -1) { + RTE_LOG(INFO, EAL, "%s(): seek error in /proc/self/pagemap: %s\n", + __func__, strerror(errno)); + close(fd); + return RTE_BAD_IOVA; + } + + retval = read(fd, &page, PFN_MASK_SIZE); + close(fd); + if (retval < 0) { + RTE_LOG(INFO, EAL, "%s(): cannot read /proc/self/pagemap: %s\n", + __func__, strerror(errno)); + return RTE_BAD_IOVA; + } else if (retval != PFN_MASK_SIZE) { + RTE_LOG(INFO, EAL, "%s(): read %d bytes from /proc/self/pagemap " + "but expected %d:\n", + __func__, retval, PFN_MASK_SIZE); + return RTE_BAD_IOVA; + } + + /* + * the pfn (page frame number) are bits 0-54 (see + * pagemap.txt in linux Documentation) + */ + if ((page & 0x7fffffffffffffULL) == 0) + return RTE_BAD_IOVA; + + physaddr = ((page & 0x7fffffffffffffULL) * page_size) + + ((unsigned long)virtaddr % page_size); + + return physaddr; +} + +rte_iova_t +rte_mem_virt2iova(const void *virtaddr) +{ + if (rte_eal_iova_mode() == RTE_IOVA_VA) + return (uintptr_t)virtaddr; + return rte_mem_virt2phy(virtaddr); +} + +/* + * For each hugepage in hugepg_tbl, fill the physaddr value. We find + * it by browsing the /proc/self/pagemap special file. + */ +static int +find_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) +{ + unsigned int i; + phys_addr_t addr; + + for (i = 0; i < hpi->num_pages[0]; i++) { + addr = rte_mem_virt2phy(hugepg_tbl[i].orig_va); + if (addr == RTE_BAD_PHYS_ADDR) + return -1; + hugepg_tbl[i].physaddr = addr; + } + return 0; +} + +/* + * For each hugepage in hugepg_tbl, fill the physaddr value sequentially. 
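+ * (this is the fallback used when real physical addresses cannot be read, + * e.g. when running without the privileges needed for /proc/self/pagemap; + * the value simply grows by the page size from entry to entry, so + * consecutive entries look physically contiguous.)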
+ */ +static int +set_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) +{ + unsigned int i; + static phys_addr_t addr; + + for (i = 0; i < hpi->num_pages[0]; i++) { + hugepg_tbl[i].physaddr = addr; + addr += hugepg_tbl[i].size; + } + return 0; +} + +/* + * Check whether address-space layout randomization is enabled in + * the kernel. This is important for multi-process as it can prevent + * two processes mapping data to the same virtual address + * Returns: + * 0 - address space randomization disabled + * 1/2 - address space randomization enabled + * negative error code on error + */ +static int +aslr_enabled(void) +{ + char c; + int retval, fd = open(RANDOMIZE_VA_SPACE_FILE, O_RDONLY); + if (fd < 0) + return -errno; + retval = read(fd, &c, 1); + close(fd); + if (retval < 0) + return -errno; + if (retval == 0) + return -EIO; + switch (c) { + case '0' : return 0; + case '1' : return 1; + case '2' : return 2; + default: return -EINVAL; + } +} + +static sigjmp_buf huge_jmpenv; + +static void huge_sigbus_handler(int signo __rte_unused) +{ + siglongjmp(huge_jmpenv, 1); +} + +/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile, + * non-static local variable in the stack frame calling sigsetjmp might be + * clobbered by a call to longjmp. + */ +static int huge_wrap_sigsetjmp(void) +{ + return sigsetjmp(huge_jmpenv, 1); +} + +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES +/* Callback for numa library. */ +void numa_error(char *where) +{ + RTE_LOG(ERR, EAL, "%s failed: %s\n", where, strerror(errno)); +} +#endif + +/* + * Mmap all hugepages of hugepage table: it first open a file in + * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the + * virtual address is stored in hugepg_tbl[i].orig_va, else it is stored + * in hugepg_tbl[i].final_va. The second mapping (when orig is 0) tries to + * map contiguous physical blocks in contiguous virtual blocks. + */ +static unsigned +map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi, + uint64_t *essential_memory __rte_unused) +{ + int fd; + unsigned i; + void *virtaddr; +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + int node_id = -1; + int essential_prev = 0; + int oldpolicy; + struct bitmask *oldmask = NULL; + bool have_numa = true; + unsigned long maxnode = 0; + + /* Check if kernel supports NUMA. */ + if (numa_available() != 0) { + RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n"); + have_numa = false; + } + + if (have_numa) { + RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n"); + oldmask = numa_allocate_nodemask(); + if (get_mempolicy(&oldpolicy, oldmask->maskp, + oldmask->size + 1, 0, 0) < 0) { + RTE_LOG(ERR, EAL, + "Failed to get current mempolicy: %s. 
" + "Assuming MPOL_DEFAULT.\n", strerror(errno)); + oldpolicy = MPOL_DEFAULT; + } + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) + if (internal_config.socket_mem[i]) + maxnode = i + 1; + } +#endif + + for (i = 0; i < hpi->num_pages[0]; i++) { + struct hugepage_file *hf = &hugepg_tbl[i]; + uint64_t hugepage_sz = hpi->hugepage_sz; + +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + if (maxnode) { + unsigned int j; + + for (j = 0; j < maxnode; j++) + if (essential_memory[j]) + break; + + if (j == maxnode) { + node_id = (node_id + 1) % maxnode; + while (!internal_config.socket_mem[node_id]) { + node_id++; + node_id %= maxnode; + } + essential_prev = 0; + } else { + node_id = j; + essential_prev = essential_memory[j]; + + if (essential_memory[j] < hugepage_sz) + essential_memory[j] = 0; + else + essential_memory[j] -= hugepage_sz; + } + + RTE_LOG(DEBUG, EAL, + "Setting policy MPOL_PREFERRED for socket %d\n", + node_id); + numa_set_preferred(node_id); + } +#endif + + hf->file_id = i; + hf->size = hugepage_sz; + eal_get_hugefile_path(hf->filepath, sizeof(hf->filepath), + hpi->hugedir, hf->file_id); + hf->filepath[sizeof(hf->filepath) - 1] = '\0'; + + /* try to create hugepage file */ + fd = open(hf->filepath, O_CREAT | O_RDWR, 0600); + if (fd < 0) { + RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__, + strerror(errno)); + goto out; + } + + /* map the segment, and populate page tables, + * the kernel fills this segment with zeros. we don't care where + * this gets mapped - we already have contiguous memory areas + * ready for us to map into. + */ + virtaddr = mmap(NULL, hugepage_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, 0); + if (virtaddr == MAP_FAILED) { + RTE_LOG(DEBUG, EAL, "%s(): mmap failed: %s\n", __func__, + strerror(errno)); + close(fd); + goto out; + } + + hf->orig_va = virtaddr; + + /* In linux, hugetlb limitations, like cgroup, are + * enforced at fault time instead of mmap(), even + * with the option of MAP_POPULATE. Kernel will send + * a SIGBUS signal. To avoid to be killed, save stack + * environment here, if SIGBUS happens, we can jump + * back here. + */ + if (huge_wrap_sigsetjmp()) { + RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more " + "hugepages of size %u MB\n", + (unsigned int)(hugepage_sz / 0x100000)); + munmap(virtaddr, hugepage_sz); + close(fd); + unlink(hugepg_tbl[i].filepath); +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + if (maxnode) + essential_memory[node_id] = + essential_prev; +#endif + goto out; + } + *(int *)virtaddr = 0; + + /* set shared lock on the file. */ + if (flock(fd, LOCK_SH) < 0) { + RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n", + __func__, strerror(errno)); + close(fd); + goto out; + } + + close(fd); + } + +out: +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + if (maxnode) { + RTE_LOG(DEBUG, EAL, + "Restoring previous memory policy: %d\n", oldpolicy); + if (oldpolicy == MPOL_DEFAULT) { + numa_set_localalloc(); + } else if (set_mempolicy(oldpolicy, oldmask->maskp, + oldmask->size + 1) < 0) { + RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n", + strerror(errno)); + numa_set_localalloc(); + } + } + if (oldmask != NULL) + numa_free_cpumask(oldmask); +#endif + return i; +} + +/* + * Parse /proc/self/numa_maps to get the NUMA socket ID for each huge + * page. 
+ */ +static int +find_numasocket(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) +{ + int socket_id; + char *end, *nodestr; + unsigned i, hp_count = 0; + uint64_t virt_addr; + char buf[BUFSIZ]; + char hugedir_str[PATH_MAX]; + FILE *f; + + f = fopen("/proc/self/numa_maps", "r"); + if (f == NULL) { + RTE_LOG(NOTICE, EAL, "NUMA support not available" + " consider that all memory is in socket_id 0\n"); + return 0; + } + + snprintf(hugedir_str, sizeof(hugedir_str), + "%s/%s", hpi->hugedir, eal_get_hugefile_prefix()); + + /* parse numa map */ + while (fgets(buf, sizeof(buf), f) != NULL) { + + /* ignore non huge page */ + if (strstr(buf, " huge ") == NULL && + strstr(buf, hugedir_str) == NULL) + continue; + + /* get zone addr */ + virt_addr = strtoull(buf, &end, 16); + if (virt_addr == 0 || end == buf) { + RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__); + goto error; + } + + /* get node id (socket id) */ + nodestr = strstr(buf, " N"); + if (nodestr == NULL) { + RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__); + goto error; + } + nodestr += 2; + end = strstr(nodestr, "="); + if (end == NULL) { + RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__); + goto error; + } + end[0] = '\0'; + end = NULL; + + socket_id = strtoul(nodestr, &end, 0); + if ((nodestr[0] == '\0') || (end == NULL) || (*end != '\0')) { + RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__); + goto error; + } + + /* if we find this page in our mappings, set socket_id */ + for (i = 0; i < hpi->num_pages[0]; i++) { + void *va = (void *)(unsigned long)virt_addr; + if (hugepg_tbl[i].orig_va == va) { + hugepg_tbl[i].socket_id = socket_id; + hp_count++; +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES + RTE_LOG(DEBUG, EAL, + "Hugepage %s is on socket %d\n", + hugepg_tbl[i].filepath, socket_id); +#endif + } + } + } + + if (hp_count < hpi->num_pages[0]) + goto error; + + fclose(f); + return 0; + +error: + fclose(f); + return -1; +} + +static int +cmp_physaddr(const void *a, const void *b) +{ +#ifndef RTE_ARCH_PPC_64 + const struct hugepage_file *p1 = a; + const struct hugepage_file *p2 = b; +#else + /* PowerPC needs memory sorted in reverse order from x86 */ + const struct hugepage_file *p1 = b; + const struct hugepage_file *p2 = a; +#endif + if (p1->physaddr < p2->physaddr) + return -1; + else if (p1->physaddr > p2->physaddr) + return 1; + else + return 0; +} + +/* + * Uses mmap to create a shared memory area for storage of data + * Used in this file to store the hugepage file map on disk + */ +static void * +create_shared_memory(const char *filename, const size_t mem_size) +{ + void *retval; + int fd; + + /* if no shared files mode is used, create anonymous memory instead */ + if (internal_config.no_shconf) { + retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (retval == MAP_FAILED) + return NULL; + return retval; + } + + fd = open(filename, O_CREAT | O_RDWR, 0600); + if (fd < 0) + return NULL; + if (ftruncate(fd, mem_size) < 0) { + close(fd); + return NULL; + } + retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + close(fd); + if (retval == MAP_FAILED) + return NULL; + return retval; +} + +/* + * this copies *active* hugepages from one hugepage table to another. + * destination is typically the shared memory. 
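+ * entries whose orig_va is NULL (pages already unmapped as unneeded) are + * skipped, and -1 is returned if the destination table turns out to be too + * small for the remaining pages.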
+ */ +static int +copy_hugepages_to_shared_mem(struct hugepage_file * dst, int dest_size, + const struct hugepage_file * src, int src_size) +{ + int src_pos, dst_pos = 0; + + for (src_pos = 0; src_pos < src_size; src_pos++) { + if (src[src_pos].orig_va != NULL) { + /* error on overflow attempt */ + if (dst_pos == dest_size) + return -1; + memcpy(&dst[dst_pos], &src[src_pos], sizeof(struct hugepage_file)); + dst_pos++; + } + } + return 0; +} + +static int +unlink_hugepage_files(struct hugepage_file *hugepg_tbl, + unsigned num_hp_info) +{ + unsigned socket, size; + int page, nrpages = 0; + + /* get total number of hugepages */ + for (size = 0; size < num_hp_info; size++) + for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) + nrpages += + internal_config.hugepage_info[size].num_pages[socket]; + + for (page = 0; page < nrpages; page++) { + struct hugepage_file *hp = &hugepg_tbl[page]; + + if (hp->orig_va != NULL && unlink(hp->filepath)) { + RTE_LOG(WARNING, EAL, "%s(): Removing %s failed: %s\n", + __func__, hp->filepath, strerror(errno)); + } + } + return 0; +} + +/* + * unmaps hugepages that are not going to be used. since we originally allocate + * ALL hugepages (not just those we need), additional unmapping needs to be done. + */ +static int +unmap_unneeded_hugepages(struct hugepage_file *hugepg_tbl, + struct hugepage_info *hpi, + unsigned num_hp_info) +{ + unsigned socket, size; + int page, nrpages = 0; + + /* get total number of hugepages */ + for (size = 0; size < num_hp_info; size++) + for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) + nrpages += internal_config.hugepage_info[size].num_pages[socket]; + + for (size = 0; size < num_hp_info; size++) { + for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) { + unsigned pages_found = 0; + + /* traverse until we have unmapped all the unused pages */ + for (page = 0; page < nrpages; page++) { + struct hugepage_file *hp = &hugepg_tbl[page]; + + /* find a page that matches the criteria */ + if ((hp->size == hpi[size].hugepage_sz) && + (hp->socket_id == (int) socket)) { + + /* if we skipped enough pages, unmap the rest */ + if (pages_found == hpi[size].num_pages[socket]) { + uint64_t unmap_len; + + unmap_len = hp->size; + + /* get start addr and len of the remaining segment */ + munmap(hp->orig_va, + (size_t)unmap_len); + + hp->orig_va = NULL; + if (unlink(hp->filepath) == -1) { + RTE_LOG(ERR, EAL, "%s(): Removing %s failed: %s\n", + __func__, hp->filepath, strerror(errno)); + return -1; + } + } else { + /* lock the page and skip */ + pages_found++; + } + + } /* match page */ + } /* foreach page */ + } /* foreach socket */ + } /* foreach pagesize */ + + return 0; +} + +static int +remap_segment(struct hugepage_file *hugepages, int seg_start, int seg_end) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct rte_memseg_list *msl; + struct rte_fbarray *arr; + int cur_page, seg_len; + unsigned int msl_idx; + int ms_idx; + uint64_t page_sz; + size_t memseg_len; + int socket_id; + + page_sz = hugepages[seg_start].size; + socket_id = hugepages[seg_start].socket_id; + seg_len = seg_end - seg_start; + + RTE_LOG(DEBUG, EAL, "Attempting to map %" PRIu64 "M on socket %i\n", + (seg_len * page_sz) >> 20ULL, socket_id); + + /* find free space in memseg lists */ + for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) { + bool empty; + msl = &mcfg->memsegs[msl_idx]; + arr = &msl->memseg_arr; + + if (msl->page_sz != page_sz) + continue; + if (msl->socket_id != socket_id) + continue; + + /* leave space for a 
hole if array is not empty */ + empty = arr->count == 0; + ms_idx = rte_fbarray_find_next_n_free(arr, 0, + seg_len + (empty ? 0 : 1)); + + /* memseg list is full? */ + if (ms_idx < 0) + continue; + + /* leave some space between memsegs, they are not IOVA + * contiguous, so they shouldn't be VA contiguous either. + */ + if (!empty) + ms_idx++; + break; + } + if (msl_idx == RTE_MAX_MEMSEG_LISTS) { + RTE_LOG(ERR, EAL, "Could not find space for memseg. Please increase %s and/or %s in configuration.\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_PER_TYPE), + RTE_STR(CONFIG_RTE_MAX_MEM_PER_TYPE)); + return -1; + } + +#ifdef RTE_ARCH_PPC_64 + /* for PPC64 we go through the list backwards */ + for (cur_page = seg_end - 1; cur_page >= seg_start; + cur_page--, ms_idx++) { +#else + for (cur_page = seg_start; cur_page < seg_end; cur_page++, ms_idx++) { +#endif + struct hugepage_file *hfile = &hugepages[cur_page]; + struct rte_memseg *ms = rte_fbarray_get(arr, ms_idx); + void *addr; + int fd; + + fd = open(hfile->filepath, O_RDWR); + if (fd < 0) { + RTE_LOG(ERR, EAL, "Could not open '%s': %s\n", + hfile->filepath, strerror(errno)); + return -1; + } + /* set shared lock on the file. */ + if (flock(fd, LOCK_SH) < 0) { + RTE_LOG(DEBUG, EAL, "Could not lock '%s': %s\n", + hfile->filepath, strerror(errno)); + close(fd); + return -1; + } + memseg_len = (size_t)page_sz; + addr = RTE_PTR_ADD(msl->base_va, ms_idx * memseg_len); + + /* we know this address is already mmapped by memseg list, so + * using MAP_FIXED here is safe + */ + addr = mmap(addr, page_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, 0); + if (addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "Couldn't remap '%s': %s\n", + hfile->filepath, strerror(errno)); + close(fd); + return -1; + } + + /* we have a new address, so unmap previous one */ +#ifndef RTE_ARCH_64 + /* in 32-bit legacy mode, we have already unmapped the page */ + if (!internal_config.legacy_mem) + munmap(hfile->orig_va, page_sz); +#else + munmap(hfile->orig_va, page_sz); +#endif + + hfile->orig_va = NULL; + hfile->final_va = addr; + + /* rewrite physical addresses in IOVA as VA mode */ + if (rte_eal_iova_mode() == RTE_IOVA_VA) + hfile->physaddr = (uintptr_t)addr; + + /* set up memseg data */ + ms->addr = addr; + ms->hugepage_sz = page_sz; + ms->len = memseg_len; + ms->iova = hfile->physaddr; + ms->socket_id = hfile->socket_id; + ms->nchannel = rte_memory_get_nchannel(); + ms->nrank = rte_memory_get_nrank(); + + rte_fbarray_set_used(arr, ms_idx); + + /* store segment fd internally */ + if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0) + RTE_LOG(ERR, EAL, "Could not store segment fd: %s\n", + rte_strerror(rte_errno)); + } + RTE_LOG(DEBUG, EAL, "Allocated %" PRIu64 "M on socket %i\n", + (seg_len * page_sz) >> 20, socket_id); + return 0; +} + +static uint64_t +get_mem_amount(uint64_t page_sz, uint64_t max_mem) +{ + uint64_t area_sz, max_pages; + + /* limit to RTE_MAX_MEMSEG_PER_LIST pages or RTE_MAX_MEM_MB_PER_LIST */ + max_pages = RTE_MAX_MEMSEG_PER_LIST; + max_mem = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20, max_mem); + + area_sz = RTE_MIN(page_sz * max_pages, max_mem); + + /* make sure the list isn't smaller than the page size */ + area_sz = RTE_MAX(area_sz, page_sz); + + return RTE_ALIGN(area_sz, page_sz); +} + +static int +free_memseg_list(struct rte_memseg_list *msl) +{ + if (rte_fbarray_destroy(&msl->memseg_arr)) { + RTE_LOG(ERR, EAL, "Cannot destroy memseg list\n"); + return -1; + } + memset(msl, 0, sizeof(*msl)); + return 0; +} + +#define MEMSEG_LIST_FMT 
"memseg-%" PRIu64 "k-%i-%i" +static int +alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz, + int n_segs, int socket_id, int type_msl_idx) +{ + char name[RTE_FBARRAY_NAME_LEN]; + + snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id, + type_msl_idx); + if (rte_fbarray_init(&msl->memseg_arr, name, n_segs, + sizeof(struct rte_memseg))) { + RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n", + rte_strerror(rte_errno)); + return -1; + } + + msl->page_sz = page_sz; + msl->socket_id = socket_id; + msl->base_va = NULL; + msl->heap = 1; /* mark it as a heap segment */ + + RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n", + (size_t)page_sz >> 10, socket_id); + + return 0; +} + +static int +alloc_va_space(struct rte_memseg_list *msl) +{ + uint64_t page_sz; + size_t mem_sz; + void *addr; + int flags = 0; + + page_sz = msl->page_sz; + mem_sz = page_sz * msl->memseg_arr.len; + + addr = eal_get_virtual_area(msl->base_va, &mem_sz, page_sz, 0, flags); + if (addr == NULL) { + if (rte_errno == EADDRNOTAVAIL) + RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - " + "please use '--" OPT_BASE_VIRTADDR "' option\n", + (unsigned long long)mem_sz, msl->base_va); + else + RTE_LOG(ERR, EAL, "Cannot reserve memory\n"); + return -1; + } + msl->base_va = addr; + msl->len = mem_sz; + + return 0; +} + +/* + * Our VA space is not preallocated yet, so preallocate it here. We need to know + * how many segments there are in order to map all pages into one address space, + * and leave appropriate holes between segments so that rte_malloc does not + * concatenate them into one big segment. + * + * we also need to unmap original pages to free up address space. + */ +static int __rte_unused +prealloc_segments(struct hugepage_file *hugepages, int n_pages) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int cur_page, seg_start_page, end_seg, new_memseg; + unsigned int hpi_idx, socket, i; + int n_contig_segs, n_segs; + int msl_idx; + + /* before we preallocate segments, we need to free up our VA space. + * we're not removing files, and we already have information about + * PA-contiguousness, so it is safe to unmap everything. + */ + for (cur_page = 0; cur_page < n_pages; cur_page++) { + struct hugepage_file *hpi = &hugepages[cur_page]; + munmap(hpi->orig_va, hpi->size); + hpi->orig_va = NULL; + } + + /* we cannot know how many page sizes and sockets we have discovered, so + * loop over all of them + */ + for (hpi_idx = 0; hpi_idx < internal_config.num_hugepage_sizes; + hpi_idx++) { + uint64_t page_sz = + internal_config.hugepage_info[hpi_idx].hugepage_sz; + + for (i = 0; i < rte_socket_count(); i++) { + struct rte_memseg_list *msl; + + socket = rte_socket_id_by_idx(i); + n_contig_segs = 0; + n_segs = 0; + seg_start_page = -1; + + for (cur_page = 0; cur_page < n_pages; cur_page++) { + struct hugepage_file *prev, *cur; + int prev_seg_start_page = -1; + + cur = &hugepages[cur_page]; + prev = cur_page == 0 ? NULL : + &hugepages[cur_page - 1]; + + new_memseg = 0; + end_seg = 0; + + if (cur->size == 0) + end_seg = 1; + else if (cur->socket_id != (int) socket) + end_seg = 1; + else if (cur->size != page_sz) + end_seg = 1; + else if (cur_page == 0) + new_memseg = 1; +#ifdef RTE_ARCH_PPC_64 + /* On PPC64 architecture, the mmap always start + * from higher address to lower address. Here, + * physical addresses are in descending order. 
+ */ + else if ((prev->physaddr - cur->physaddr) != + cur->size) + new_memseg = 1; +#else + else if ((cur->physaddr - prev->physaddr) != + cur->size) + new_memseg = 1; +#endif + if (new_memseg) { + /* if we're already inside a segment, + * new segment means end of current one + */ + if (seg_start_page != -1) { + end_seg = 1; + prev_seg_start_page = + seg_start_page; + } + seg_start_page = cur_page; + } + + if (end_seg) { + if (prev_seg_start_page != -1) { + /* we've found a new segment */ + n_contig_segs++; + n_segs += cur_page - + prev_seg_start_page; + } else if (seg_start_page != -1) { + /* we didn't find new segment, + * but did end current one + */ + n_contig_segs++; + n_segs += cur_page - + seg_start_page; + seg_start_page = -1; + continue; + } else { + /* we're skipping this page */ + continue; + } + } + /* segment continues */ + } + /* check if we missed last segment */ + if (seg_start_page != -1) { + n_contig_segs++; + n_segs += cur_page - seg_start_page; + } + + /* if no segments were found, do not preallocate */ + if (n_segs == 0) + continue; + + /* we now have total number of pages that we will + * allocate for this segment list. add separator pages + * to the total count, and preallocate VA space. + */ + n_segs += n_contig_segs - 1; + + /* now, preallocate VA space for these segments */ + + /* first, find suitable memseg list for this */ + for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; + msl_idx++) { + msl = &mcfg->memsegs[msl_idx]; + + if (msl->base_va != NULL) + continue; + break; + } + if (msl_idx == RTE_MAX_MEMSEG_LISTS) { + RTE_LOG(ERR, EAL, "Not enough space in memseg lists, please increase %s\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); + return -1; + } + + /* now, allocate fbarray itself */ + if (alloc_memseg_list(msl, page_sz, n_segs, socket, + msl_idx) < 0) + return -1; + + /* finally, allocate VA space */ + if (alloc_va_space(msl) < 0) + return -1; + } + } + return 0; +} + +/* + * We cannot reallocate memseg lists on the fly because PPC64 stores pages + * backwards, therefore we have to process the entire memseg first before + * remapping it into memseg list VA space. + */ +static int +remap_needed_hugepages(struct hugepage_file *hugepages, int n_pages) +{ + int cur_page, seg_start_page, new_memseg, ret; + + seg_start_page = 0; + for (cur_page = 0; cur_page < n_pages; cur_page++) { + struct hugepage_file *prev, *cur; + + new_memseg = 0; + + cur = &hugepages[cur_page]; + prev = cur_page == 0 ? NULL : &hugepages[cur_page - 1]; + + /* if size is zero, no more pages left */ + if (cur->size == 0) + break; + + if (cur_page == 0) + new_memseg = 1; + else if (cur->socket_id != prev->socket_id) + new_memseg = 1; + else if (cur->size != prev->size) + new_memseg = 1; +#ifdef RTE_ARCH_PPC_64 + /* On PPC64 architecture, the mmap always start from higher + * address to lower address. Here, physical addresses are in + * descending order. 
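 *
 * [Editor's note, not part of the original patch: a worked illustration of
 * the contiguity check below, using hypothetical 2 MB pages. On non-PPC64,
 * pages sorted by physical address at 0x200000, 0x400000 and 0x800000 give
 * cur->physaddr - prev->physaddr == 0x200000 == cur->size for the second
 * page (same segment), but 0x400000 != 0x200000 for the third page, so the
 * third page starts a new memseg (assuming socket ID and page size also
 * match, as checked above). On PPC64 the same test is applied with prev and
 * cur swapped, because the sorted physical addresses run downwards.]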
+ */ + else if ((prev->physaddr - cur->physaddr) != cur->size) + new_memseg = 1; +#else + else if ((cur->physaddr - prev->physaddr) != cur->size) + new_memseg = 1; +#endif + + if (new_memseg) { + /* if this isn't the first time, remap segment */ + if (cur_page != 0) { + ret = remap_segment(hugepages, seg_start_page, + cur_page); + if (ret != 0) + return -1; + } + /* remember where we started */ + seg_start_page = cur_page; + } + /* continuation of previous memseg */ + } + /* we were stopped, but we didn't remap the last segment, do it now */ + if (cur_page != 0) { + ret = remap_segment(hugepages, seg_start_page, + cur_page); + if (ret != 0) + return -1; + } + return 0; +} + +__rte_unused /* function is unused on 32-bit builds */ +static inline uint64_t +get_socket_mem_size(int socket) +{ + uint64_t size = 0; + unsigned i; + + for (i = 0; i < internal_config.num_hugepage_sizes; i++){ + struct hugepage_info *hpi = &internal_config.hugepage_info[i]; + size += hpi->hugepage_sz * hpi->num_pages[socket]; + } + + return size; +} + +/* + * This function is a NUMA-aware equivalent of calc_num_pages. + * It takes in the list of hugepage sizes and the + * number of pages thereof, and calculates the best number of + * pages of each size to fulfill the request for ram + */ +static int +calc_num_pages_per_socket(uint64_t * memory, + struct hugepage_info *hp_info, + struct hugepage_info *hp_used, + unsigned num_hp_info) +{ + unsigned socket, j, i = 0; + unsigned requested, available; + int total_num_pages = 0; + uint64_t remaining_mem, cur_mem; + uint64_t total_mem = internal_config.memory; + + if (num_hp_info == 0) + return -1; + + /* if specific memory amounts per socket weren't requested */ + if (internal_config.force_sockets == 0) { + size_t total_size; +#ifdef RTE_ARCH_64 + int cpu_per_socket[RTE_MAX_NUMA_NODES]; + size_t default_size; + unsigned lcore_id; + + /* Compute number of cores per socket */ + memset(cpu_per_socket, 0, sizeof(cpu_per_socket)); + RTE_LCORE_FOREACH(lcore_id) { + cpu_per_socket[rte_lcore_to_socket_id(lcore_id)]++; + } + + /* + * Automatically spread requested memory amongst detected sockets according + * to number of cores from cpu mask present on each socket + */ + total_size = internal_config.memory; + for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) { + + /* Set memory amount per socket */ + default_size = (internal_config.memory * cpu_per_socket[socket]) + / rte_lcore_count(); + + /* Limit to maximum available memory on socket */ + default_size = RTE_MIN(default_size, get_socket_mem_size(socket)); + + /* Update sizes */ + memory[socket] = default_size; + total_size -= default_size; + } + + /* + * If some memory is remaining, try to allocate it by getting all + * available memory from sockets, one after the other + */ + for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) { + /* take whatever is available */ + default_size = RTE_MIN(get_socket_mem_size(socket) - memory[socket], + total_size); + + /* Update sizes */ + memory[socket] += default_size; + total_size -= default_size; + } +#else + /* in 32-bit mode, allocate all of the memory only on master + * lcore socket + */ + total_size = internal_config.memory; + for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; + socket++) { + struct rte_config *cfg = rte_eal_get_configuration(); + unsigned int master_lcore_socket; + + master_lcore_socket = + rte_lcore_to_socket_id(cfg->master_lcore); + + if (master_lcore_socket != socket) + continue; + + /* Update sizes */ + 
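/*
 * [Editor's illustration, not part of the original patch: the 64-bit branch
 * above splits the requested memory in proportion to the lcores detected on
 * each socket. With hypothetical numbers, a request of 1024 MB and 4 lcores
 * of which 3 sit on socket 0 and 1 on socket 1, the first pass asks for
 * 1024 * 3 / 4 = 768 MB on socket 0 and 1024 * 1 / 4 = 256 MB on socket 1,
 * each capped by get_socket_mem_size(); any remainder is then taken from
 * whichever socket still has free hugepage memory. The 32-bit branch here
 * simply places the whole amount on the master lcore's socket.]
 */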
memory[socket] = total_size; + break; + } +#endif + } + + for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) { + /* skips if the memory on specific socket wasn't requested */ + for (i = 0; i < num_hp_info && memory[socket] != 0; i++){ + strlcpy(hp_used[i].hugedir, hp_info[i].hugedir, + sizeof(hp_used[i].hugedir)); + hp_used[i].num_pages[socket] = RTE_MIN( + memory[socket] / hp_info[i].hugepage_sz, + hp_info[i].num_pages[socket]); + + cur_mem = hp_used[i].num_pages[socket] * + hp_used[i].hugepage_sz; + + memory[socket] -= cur_mem; + total_mem -= cur_mem; + + total_num_pages += hp_used[i].num_pages[socket]; + + /* check if we have met all memory requests */ + if (memory[socket] == 0) + break; + + /* check if we have any more pages left at this size, if so + * move on to next size */ + if (hp_used[i].num_pages[socket] == hp_info[i].num_pages[socket]) + continue; + /* At this point we know that there are more pages available that are + * bigger than the memory we want, so lets see if we can get enough + * from other page sizes. + */ + remaining_mem = 0; + for (j = i+1; j < num_hp_info; j++) + remaining_mem += hp_info[j].hugepage_sz * + hp_info[j].num_pages[socket]; + + /* is there enough other memory, if not allocate another page and quit */ + if (remaining_mem < memory[socket]){ + cur_mem = RTE_MIN(memory[socket], + hp_info[i].hugepage_sz); + memory[socket] -= cur_mem; + total_mem -= cur_mem; + hp_used[i].num_pages[socket]++; + total_num_pages++; + break; /* we are done with this socket*/ + } + } + /* if we didn't satisfy all memory requirements per socket */ + if (memory[socket] > 0 && + internal_config.socket_mem[socket] != 0) { + /* to prevent icc errors */ + requested = (unsigned) (internal_config.socket_mem[socket] / + 0x100000); + available = requested - + ((unsigned) (memory[socket] / 0x100000)); + RTE_LOG(ERR, EAL, "Not enough memory available on socket %u! " + "Requested: %uMB, available: %uMB\n", socket, + requested, available); + return -1; + } + } + + /* if we didn't satisfy total memory requirements */ + if (total_mem > 0) { + requested = (unsigned) (internal_config.memory / 0x100000); + available = requested - (unsigned) (total_mem / 0x100000); + RTE_LOG(ERR, EAL, "Not enough memory available! Requested: %uMB," + " available: %uMB\n", requested, available); + return -1; + } + return total_num_pages; +} + +static inline size_t +eal_get_hugepage_mem_size(void) +{ + uint64_t size = 0; + unsigned i, j; + + for (i = 0; i < internal_config.num_hugepage_sizes; i++) { + struct hugepage_info *hpi = &internal_config.hugepage_info[i]; + if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0) { + for (j = 0; j < RTE_MAX_NUMA_NODES; j++) { + size += hpi->hugepage_sz * hpi->num_pages[j]; + } + } + } + + return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX; +} + +static struct sigaction huge_action_old; +static int huge_need_recover; + +static void +huge_register_sigbus(void) +{ + sigset_t mask; + struct sigaction action; + + sigemptyset(&mask); + sigaddset(&mask, SIGBUS); + action.sa_flags = 0; + action.sa_mask = mask; + action.sa_handler = huge_sigbus_handler; + + huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old); +} + +static void +huge_recover_sigbus(void) +{ + if (huge_need_recover) { + sigaction(SIGBUS, &huge_action_old, NULL); + huge_need_recover = 0; + } +} + +/* + * Prepare physical memory mapping: fill configuration structure with + * these infos, return 0 on success. + * 1. map N huge pages in separate files in hugetlbfs + * 2. 
find associated physical addr + * 3. find associated NUMA socket ID + * 4. sort all huge pages by physical address + * 5. remap these N huge pages in the correct order + * 6. unmap the first mapping + * 7. fill memsegs in configuration with contiguous zones + */ +static int +eal_legacy_hugepage_init(void) +{ + struct rte_mem_config *mcfg; + struct hugepage_file *hugepage = NULL, *tmp_hp = NULL; + struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES]; + struct rte_fbarray *arr; + struct rte_memseg *ms; + + uint64_t memory[RTE_MAX_NUMA_NODES]; + + unsigned hp_offset; + int i, j; + int nr_hugefiles, nr_hugepages = 0; + void *addr; + + memset(used_hp, 0, sizeof(used_hp)); + + /* get pointer to global configuration */ + mcfg = rte_eal_get_configuration()->mem_config; + + /* hugetlbfs can be disabled */ + if (internal_config.no_hugetlbfs) { + void *prealloc_addr; + size_t mem_sz; + struct rte_memseg_list *msl; + int n_segs, cur_seg, fd, flags; +#ifdef MEMFD_SUPPORTED + int memfd; +#endif + uint64_t page_sz; + + /* nohuge mode is legacy mode */ + internal_config.legacy_mem = 1; + + /* nohuge mode is single-file segments mode */ + internal_config.single_file_segments = 1; + + /* create a memseg list */ + msl = &mcfg->memsegs[0]; + + page_sz = RTE_PGSIZE_4K; + n_segs = internal_config.memory / page_sz; + + if (rte_fbarray_init(&msl->memseg_arr, "nohugemem", n_segs, + sizeof(struct rte_memseg))) { + RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n"); + return -1; + } + + /* set up parameters for anonymous mmap */ + fd = -1; + flags = MAP_PRIVATE | MAP_ANONYMOUS; + +#ifdef MEMFD_SUPPORTED + /* create a memfd and store it in the segment fd table */ + memfd = memfd_create("nohuge", 0); + if (memfd < 0) { + RTE_LOG(DEBUG, EAL, "Cannot create memfd: %s\n", + strerror(errno)); + RTE_LOG(DEBUG, EAL, "Falling back to anonymous map\n"); + } else { + /* we got an fd - now resize it */ + if (ftruncate(memfd, internal_config.memory) < 0) { + RTE_LOG(ERR, EAL, "Cannot resize memfd: %s\n", + strerror(errno)); + RTE_LOG(ERR, EAL, "Falling back to anonymous map\n"); + close(memfd); + } else { + /* creating memfd-backed file was successful. + * we want changes to memfd to be visible to + * other processes (such as vhost backend), so + * map it as shared memory. + */ + RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n"); + fd = memfd; + flags = MAP_SHARED; + } + } +#endif + /* preallocate address space for the memory, so that it can be + * fit into the DMA mask. + */ + mem_sz = internal_config.memory; + prealloc_addr = eal_get_virtual_area( + NULL, &mem_sz, page_sz, 0, 0); + if (prealloc_addr == NULL) { + RTE_LOG(ERR, EAL, + "%s: reserving memory area failed: " + "%s\n", + __func__, strerror(errno)); + return -1; + } + addr = mmap(prealloc_addr, mem_sz, PROT_READ | PROT_WRITE, + flags | MAP_FIXED, fd, 0); + if (addr == MAP_FAILED || addr != prealloc_addr) { + RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__, + strerror(errno)); + munmap(prealloc_addr, mem_sz); + return -1; + } + msl->base_va = addr; + msl->page_sz = page_sz; + msl->socket_id = 0; + msl->len = mem_sz; + msl->heap = 1; + + /* we're in single-file segments mode, so only the segment list + * fd needs to be set up. + */ + if (fd != -1) { + if (eal_memalloc_set_seg_list_fd(0, fd) < 0) { + RTE_LOG(ERR, EAL, "Cannot set up segment list fd\n"); + /* not a serious error, proceed */ + } + } + + /* populate memsegs. 
each memseg is one page long */ + for (cur_seg = 0; cur_seg < n_segs; cur_seg++) { + arr = &msl->memseg_arr; + + ms = rte_fbarray_get(arr, cur_seg); + if (rte_eal_iova_mode() == RTE_IOVA_VA) + ms->iova = (uintptr_t)addr; + else + ms->iova = RTE_BAD_IOVA; + ms->addr = addr; + ms->hugepage_sz = page_sz; + ms->socket_id = 0; + ms->len = page_sz; + + rte_fbarray_set_used(arr, cur_seg); + + addr = RTE_PTR_ADD(addr, (size_t)page_sz); + } + if (mcfg->dma_maskbits && + rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) { + RTE_LOG(ERR, EAL, + "%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask.\n", + __func__); + if (rte_eal_iova_mode() == RTE_IOVA_VA && + rte_eal_using_phys_addrs()) + RTE_LOG(ERR, EAL, + "%s(): Please try initializing EAL with --iova-mode=pa parameter.\n", + __func__); + goto fail; + } + return 0; + } + + /* calculate total number of hugepages available. at this point we haven't + * yet started sorting them so they all are on socket 0 */ + for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) { + /* meanwhile, also initialize used_hp hugepage sizes in used_hp */ + used_hp[i].hugepage_sz = internal_config.hugepage_info[i].hugepage_sz; + + nr_hugepages += internal_config.hugepage_info[i].num_pages[0]; + } + + /* + * allocate a memory area for hugepage table. + * this isn't shared memory yet. due to the fact that we need some + * processing done on these pages, shared memory will be created + * at a later stage. + */ + tmp_hp = malloc(nr_hugepages * sizeof(struct hugepage_file)); + if (tmp_hp == NULL) + goto fail; + + memset(tmp_hp, 0, nr_hugepages * sizeof(struct hugepage_file)); + + hp_offset = 0; /* where we start the current page size entries */ + + huge_register_sigbus(); + + /* make a copy of socket_mem, needed for balanced allocation. 
*/ + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) + memory[i] = internal_config.socket_mem[i]; + + /* map all hugepages and sort them */ + for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){ + unsigned pages_old, pages_new; + struct hugepage_info *hpi; + + /* + * we don't yet mark hugepages as used at this stage, so + * we just map all hugepages available to the system + * all hugepages are still located on socket 0 + */ + hpi = &internal_config.hugepage_info[i]; + + if (hpi->num_pages[0] == 0) + continue; + + /* map all hugepages available */ + pages_old = hpi->num_pages[0]; + pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, memory); + if (pages_new < pages_old) { + RTE_LOG(DEBUG, EAL, + "%d not %d hugepages of size %u MB allocated\n", + pages_new, pages_old, + (unsigned)(hpi->hugepage_sz / 0x100000)); + + int pages = pages_old - pages_new; + + nr_hugepages -= pages; + hpi->num_pages[0] = pages_new; + if (pages_new == 0) + continue; + } + + if (rte_eal_using_phys_addrs() && + rte_eal_iova_mode() != RTE_IOVA_VA) { + /* find physical addresses for each hugepage */ + if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0) { + RTE_LOG(DEBUG, EAL, "Failed to find phys addr " + "for %u MB pages\n", + (unsigned int)(hpi->hugepage_sz / 0x100000)); + goto fail; + } + } else { + /* set physical addresses for each hugepage */ + if (set_physaddrs(&tmp_hp[hp_offset], hpi) < 0) { + RTE_LOG(DEBUG, EAL, "Failed to set phys addr " + "for %u MB pages\n", + (unsigned int)(hpi->hugepage_sz / 0x100000)); + goto fail; + } + } + + if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0){ + RTE_LOG(DEBUG, EAL, "Failed to find NUMA socket for %u MB pages\n", + (unsigned)(hpi->hugepage_sz / 0x100000)); + goto fail; + } + + qsort(&tmp_hp[hp_offset], hpi->num_pages[0], + sizeof(struct hugepage_file), cmp_physaddr); + + /* we have processed a num of hugepages of this size, so inc offset */ + hp_offset += hpi->num_pages[0]; + } + + huge_recover_sigbus(); + + if (internal_config.memory == 0 && internal_config.force_sockets == 0) + internal_config.memory = eal_get_hugepage_mem_size(); + + nr_hugefiles = nr_hugepages; + + + /* clean out the numbers of pages */ + for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) + for (j = 0; j < RTE_MAX_NUMA_NODES; j++) + internal_config.hugepage_info[i].num_pages[j] = 0; + + /* get hugepages for each socket */ + for (i = 0; i < nr_hugefiles; i++) { + int socket = tmp_hp[i].socket_id; + + /* find a hugepage info with right size and increment num_pages */ + const int nb_hpsizes = RTE_MIN(MAX_HUGEPAGE_SIZES, + (int)internal_config.num_hugepage_sizes); + for (j = 0; j < nb_hpsizes; j++) { + if (tmp_hp[i].size == + internal_config.hugepage_info[j].hugepage_sz) { + internal_config.hugepage_info[j].num_pages[socket]++; + } + } + } + + /* make a copy of socket_mem, needed for number of pages calculation */ + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) + memory[i] = internal_config.socket_mem[i]; + + /* calculate final number of pages */ + nr_hugepages = calc_num_pages_per_socket(memory, + internal_config.hugepage_info, used_hp, + internal_config.num_hugepage_sizes); + + /* error if not enough memory available */ + if (nr_hugepages < 0) + goto fail; + + /* reporting in! 
*/ + for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) { + for (j = 0; j < RTE_MAX_NUMA_NODES; j++) { + if (used_hp[i].num_pages[j] > 0) { + RTE_LOG(DEBUG, EAL, + "Requesting %u pages of size %uMB" + " from socket %i\n", + used_hp[i].num_pages[j], + (unsigned) + (used_hp[i].hugepage_sz / 0x100000), + j); + } + } + } + + /* create shared memory */ + hugepage = create_shared_memory(eal_hugepage_data_path(), + nr_hugefiles * sizeof(struct hugepage_file)); + + if (hugepage == NULL) { + RTE_LOG(ERR, EAL, "Failed to create shared memory!\n"); + goto fail; + } + memset(hugepage, 0, nr_hugefiles * sizeof(struct hugepage_file)); + + /* + * unmap pages that we won't need (looks at used_hp). + * also, sets final_va to NULL on pages that were unmapped. + */ + if (unmap_unneeded_hugepages(tmp_hp, used_hp, + internal_config.num_hugepage_sizes) < 0) { + RTE_LOG(ERR, EAL, "Unmapping and locking hugepages failed!\n"); + goto fail; + } + + /* + * copy stuff from malloc'd hugepage* to the actual shared memory. + * this procedure only copies those hugepages that have orig_va + * not NULL. has overflow protection. + */ + if (copy_hugepages_to_shared_mem(hugepage, nr_hugefiles, + tmp_hp, nr_hugefiles) < 0) { + RTE_LOG(ERR, EAL, "Copying tables to shared memory failed!\n"); + goto fail; + } + +#ifndef RTE_ARCH_64 + /* for legacy 32-bit mode, we did not preallocate VA space, so do it */ + if (internal_config.legacy_mem && + prealloc_segments(hugepage, nr_hugefiles)) { + RTE_LOG(ERR, EAL, "Could not preallocate VA space for hugepages\n"); + goto fail; + } +#endif + + /* remap all pages we do need into memseg list VA space, so that those + * pages become first-class citizens in DPDK memory subsystem + */ + if (remap_needed_hugepages(hugepage, nr_hugefiles)) { + RTE_LOG(ERR, EAL, "Couldn't remap hugepage files into memseg lists\n"); + goto fail; + } + + /* free the hugepage backing files */ + if (internal_config.hugepage_unlink && + unlink_hugepage_files(tmp_hp, internal_config.num_hugepage_sizes) < 0) { + RTE_LOG(ERR, EAL, "Unlinking hugepage files failed!\n"); + goto fail; + } + + /* free the temporary hugepage table */ + free(tmp_hp); + tmp_hp = NULL; + + munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file)); + hugepage = NULL; + + /* we're not going to allocate more pages, so release VA space for + * unused memseg lists + */ + for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) { + struct rte_memseg_list *msl = &mcfg->memsegs[i]; + size_t mem_sz; + + /* skip inactive lists */ + if (msl->base_va == NULL) + continue; + /* skip lists where there is at least one page allocated */ + if (msl->memseg_arr.count > 0) + continue; + /* this is an unused list, deallocate it */ + mem_sz = msl->len; + munmap(msl->base_va, mem_sz); + msl->base_va = NULL; + msl->heap = 0; + + /* destroy backing fbarray */ + rte_fbarray_destroy(&msl->memseg_arr); + } + + if (mcfg->dma_maskbits && + rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) { + RTE_LOG(ERR, EAL, + "%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask.\n", + __func__); + goto fail; + } + + return 0; + +fail: + huge_recover_sigbus(); + free(tmp_hp); + if (hugepage != NULL) + munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file)); + + return -1; +} + +static int __rte_unused +hugepage_count_walk(const struct rte_memseg_list *msl, void *arg) +{ + struct hugepage_info *hpi = arg; + + if (msl->page_sz != hpi->hugepage_sz) + return 0; + + hpi->num_pages[msl->socket_id] += msl->memseg_arr.len; + return 0; +} + +static int 
+limits_callback(int socket_id, size_t cur_limit, size_t new_len) +{ + RTE_SET_USED(socket_id); + RTE_SET_USED(cur_limit); + RTE_SET_USED(new_len); + return -1; +} + +static int +eal_hugepage_init(void) +{ + struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES]; + uint64_t memory[RTE_MAX_NUMA_NODES]; + int hp_sz_idx, socket_id; + + memset(used_hp, 0, sizeof(used_hp)); + + for (hp_sz_idx = 0; + hp_sz_idx < (int) internal_config.num_hugepage_sizes; + hp_sz_idx++) { +#ifndef RTE_ARCH_64 + struct hugepage_info dummy; + unsigned int i; +#endif + /* also initialize used_hp hugepage sizes in used_hp */ + struct hugepage_info *hpi; + hpi = &internal_config.hugepage_info[hp_sz_idx]; + used_hp[hp_sz_idx].hugepage_sz = hpi->hugepage_sz; + +#ifndef RTE_ARCH_64 + /* for 32-bit, limit number of pages on socket to whatever we've + * preallocated, as we cannot allocate more. + */ + memset(&dummy, 0, sizeof(dummy)); + dummy.hugepage_sz = hpi->hugepage_sz; + if (rte_memseg_list_walk(hugepage_count_walk, &dummy) < 0) + return -1; + + for (i = 0; i < RTE_DIM(dummy.num_pages); i++) { + hpi->num_pages[i] = RTE_MIN(hpi->num_pages[i], + dummy.num_pages[i]); + } +#endif + } + + /* make a copy of socket_mem, needed for balanced allocation. */ + for (hp_sz_idx = 0; hp_sz_idx < RTE_MAX_NUMA_NODES; hp_sz_idx++) + memory[hp_sz_idx] = internal_config.socket_mem[hp_sz_idx]; + + /* calculate final number of pages */ + if (calc_num_pages_per_socket(memory, + internal_config.hugepage_info, used_hp, + internal_config.num_hugepage_sizes) < 0) + return -1; + + for (hp_sz_idx = 0; + hp_sz_idx < (int)internal_config.num_hugepage_sizes; + hp_sz_idx++) { + for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES; + socket_id++) { + struct rte_memseg **pages; + struct hugepage_info *hpi = &used_hp[hp_sz_idx]; + unsigned int num_pages = hpi->num_pages[socket_id]; + unsigned int num_pages_alloc; + + if (num_pages == 0) + continue; + + RTE_LOG(DEBUG, EAL, "Allocating %u pages of size %" PRIu64 "M on socket %i\n", + num_pages, hpi->hugepage_sz >> 20, socket_id); + + /* we may not be able to allocate all pages in one go, + * because we break up our memory map into multiple + * memseg lists. therefore, try allocating multiple + * times and see if we can get the desired number of + * pages from multiple allocations. 
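 *
 * [Editor's note, not part of the original patch: as a hypothetical example
 * of the loop below, a request for 600 pages may be satisfied in two passes,
 * e.g. eal_memalloc_alloc_seg_bulk() returning 512 pages when the first
 * memseg list fills up, then 88 pages from the next list on the second
 * iteration, at which point num_pages_alloc reaches num_pages and the
 * do/while exits.]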
+ */ + + num_pages_alloc = 0; + do { + int i, cur_pages, needed; + + needed = num_pages - num_pages_alloc; + + pages = malloc(sizeof(*pages) * needed); + + /* do not request exact number of pages */ + cur_pages = eal_memalloc_alloc_seg_bulk(pages, + needed, hpi->hugepage_sz, + socket_id, false); + if (cur_pages <= 0) { + free(pages); + return -1; + } + + /* mark preallocated pages as unfreeable */ + for (i = 0; i < cur_pages; i++) { + struct rte_memseg *ms = pages[i]; + ms->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE; + } + free(pages); + + num_pages_alloc += cur_pages; + } while (num_pages_alloc != num_pages); + } + } + /* if socket limits were specified, set them */ + if (internal_config.force_socket_limits) { + unsigned int i; + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) { + uint64_t limit = internal_config.socket_limit[i]; + if (limit == 0) + continue; + if (rte_mem_alloc_validator_register("socket-limit", + limits_callback, i, limit)) + RTE_LOG(ERR, EAL, "Failed to register socket limits validator callback\n"); + } + } + return 0; +} + +/* + * uses fstat to report the size of a file on disk + */ +static off_t +getFileSize(int fd) +{ + struct stat st; + if (fstat(fd, &st) < 0) + return 0; + return st.st_size; +} + +/* + * This creates the memory mappings in the secondary process to match that of + * the server process. It goes through each memory segment in the DPDK runtime + * configuration and finds the hugepages which form that segment, mapping them + * in order to form a contiguous block in the virtual memory space + */ +static int +eal_legacy_hugepage_attach(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct hugepage_file *hp = NULL; + unsigned int num_hp = 0; + unsigned int i = 0; + unsigned int cur_seg; + off_t size = 0; + int fd, fd_hugepage = -1; + + if (aslr_enabled() > 0) { + RTE_LOG(WARNING, EAL, "WARNING: Address Space Layout Randomization " + "(ASLR) is enabled in the kernel.\n"); + RTE_LOG(WARNING, EAL, " This may cause issues with mapping memory " + "into secondary processes\n"); + } + + fd_hugepage = open(eal_hugepage_data_path(), O_RDONLY); + if (fd_hugepage < 0) { + RTE_LOG(ERR, EAL, "Could not open %s\n", + eal_hugepage_data_path()); + goto error; + } + + size = getFileSize(fd_hugepage); + hp = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd_hugepage, 0); + if (hp == MAP_FAILED) { + RTE_LOG(ERR, EAL, "Could not mmap %s\n", + eal_hugepage_data_path()); + goto error; + } + + num_hp = size / sizeof(struct hugepage_file); + RTE_LOG(DEBUG, EAL, "Analysing %u files\n", num_hp); + + /* map all segments into memory to make sure we get the addrs. the + * segments themselves are already in memseg list (which is shared and + * has its VA space already preallocated), so we just need to map + * everything into correct addresses. + */ + for (i = 0; i < num_hp; i++) { + struct hugepage_file *hf = &hp[i]; + size_t map_sz = hf->size; + void *map_addr = hf->final_va; + int msl_idx, ms_idx; + struct rte_memseg_list *msl; + struct rte_memseg *ms; + + /* if size is zero, no more pages left */ + if (map_sz == 0) + break; + + fd = open(hf->filepath, O_RDWR); + if (fd < 0) { + RTE_LOG(ERR, EAL, "Could not open %s: %s\n", + hf->filepath, strerror(errno)); + goto error; + } + + map_addr = mmap(map_addr, map_sz, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED, fd, 0); + if (map_addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "Could not map %s: %s\n", + hf->filepath, strerror(errno)); + goto fd_error; + } + + /* set shared lock on the file. 
*/ + if (flock(fd, LOCK_SH) < 0) { + RTE_LOG(DEBUG, EAL, "%s(): Locking file failed: %s\n", + __func__, strerror(errno)); + goto mmap_error; + } + + /* find segment data */ + msl = rte_mem_virt2memseg_list(map_addr); + if (msl == NULL) { + RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg list\n", + __func__); + goto mmap_error; + } + ms = rte_mem_virt2memseg(map_addr, msl); + if (ms == NULL) { + RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg\n", + __func__); + goto mmap_error; + } + + msl_idx = msl - mcfg->memsegs; + ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); + if (ms_idx < 0) { + RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg idx\n", + __func__); + goto mmap_error; + } + + /* store segment fd internally */ + if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0) + RTE_LOG(ERR, EAL, "Could not store segment fd: %s\n", + rte_strerror(rte_errno)); + } + /* unmap the hugepage config file, since we are done using it */ + munmap(hp, size); + close(fd_hugepage); + return 0; + +mmap_error: + munmap(hp[i].final_va, hp[i].size); +fd_error: + close(fd); +error: + /* unwind mmap's done so far */ + for (cur_seg = 0; cur_seg < i; cur_seg++) + munmap(hp[cur_seg].final_va, hp[cur_seg].size); + + if (hp != NULL && hp != MAP_FAILED) + munmap(hp, size); + if (fd_hugepage >= 0) + close(fd_hugepage); + return -1; +} + +static int +eal_hugepage_attach(void) +{ + if (eal_memalloc_sync_with_primary()) { + RTE_LOG(ERR, EAL, "Could not map memory from primary process\n"); + if (aslr_enabled() > 0) + RTE_LOG(ERR, EAL, "It is recommended to disable ASLR in the kernel and retry running both primary and secondary processes\n"); + return -1; + } + return 0; +} + +int +rte_eal_hugepage_init(void) +{ + return internal_config.legacy_mem ? + eal_legacy_hugepage_init() : + eal_hugepage_init(); +} + +int +rte_eal_hugepage_attach(void) +{ + return internal_config.legacy_mem ? + eal_legacy_hugepage_attach() : + eal_hugepage_attach(); +} + +int +rte_eal_using_phys_addrs(void) +{ + if (phys_addrs_available == -1) { + uint64_t tmp = 0; + + if (rte_eal_has_hugepages() != 0 && + rte_mem_virt2phy(&tmp) != RTE_BAD_PHYS_ADDR) + phys_addrs_available = 1; + else + phys_addrs_available = 0; + } + return phys_addrs_available; +} + +static int __rte_unused +memseg_primary_init_32(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int active_sockets, hpi_idx, msl_idx = 0; + unsigned int socket_id, i; + struct rte_memseg_list *msl; + uint64_t extra_mem_per_socket, total_extra_mem, total_requested_mem; + uint64_t max_mem; + + /* no-huge does not need this at all */ + if (internal_config.no_hugetlbfs) + return 0; + + /* this is a giant hack, but desperate times call for desperate + * measures. in legacy 32-bit mode, we cannot preallocate VA space, + * because having upwards of 2 gigabytes of VA space already mapped will + * interfere with our ability to map and sort hugepages. + * + * therefore, in legacy 32-bit mode, we will be initializing memseg + * lists much later - in eal_memory.c, right after we unmap all the + * unneeded pages. this will not affect secondary processes, as those + * should be able to mmap the space without (too many) problems. + */ + if (internal_config.legacy_mem) + return 0; + + /* 32-bit mode is a very special case. we cannot know in advance where + * the user will want to allocate their memory, so we have to do some + * heuristics. 
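 *
 * [Editor's illustration, not part of the original patch, with hypothetical
 * values: RTE_MAX_MEM_MB = 2048 and --socket-mem=512,256 give
 * total_requested_mem = 768 MB across two active sockets, so
 * extra_mem_per_socket = (2048 - 768) / 2 = 640 MB, and the loop below may
 * preallocate up to 512 + 640 = 1152 MB of VA space for socket 0 and
 * 256 + 640 = 896 MB for socket 1.]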
+ */ + active_sockets = 0; + total_requested_mem = 0; + if (internal_config.force_sockets) + for (i = 0; i < rte_socket_count(); i++) { + uint64_t mem; + + socket_id = rte_socket_id_by_idx(i); + mem = internal_config.socket_mem[socket_id]; + + if (mem == 0) + continue; + + active_sockets++; + total_requested_mem += mem; + } + else + total_requested_mem = internal_config.memory; + + max_mem = (uint64_t)RTE_MAX_MEM_MB << 20; + if (total_requested_mem > max_mem) { + RTE_LOG(ERR, EAL, "Invalid parameters: 32-bit process can at most use %uM of memory\n", + (unsigned int)(max_mem >> 20)); + return -1; + } + total_extra_mem = max_mem - total_requested_mem; + extra_mem_per_socket = active_sockets == 0 ? total_extra_mem : + total_extra_mem / active_sockets; + + /* the allocation logic is a little bit convoluted, but here's how it + * works, in a nutshell: + * - if user hasn't specified on which sockets to allocate memory via + * --socket-mem, we allocate all of our memory on master core socket. + * - if user has specified sockets to allocate memory on, there may be + * some "unused" memory left (e.g. if user has specified --socket-mem + * such that not all memory adds up to 2 gigabytes), so add it to all + * sockets that are in use equally. + * + * page sizes are sorted by size in descending order, so we can safely + * assume that we dispense with bigger page sizes first. + */ + + /* create memseg lists */ + for (i = 0; i < rte_socket_count(); i++) { + int hp_sizes = (int) internal_config.num_hugepage_sizes; + uint64_t max_socket_mem, cur_socket_mem; + unsigned int master_lcore_socket; + struct rte_config *cfg = rte_eal_get_configuration(); + bool skip; + + socket_id = rte_socket_id_by_idx(i); + +#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES + /* we can still sort pages by socket in legacy mode */ + if (!internal_config.legacy_mem && socket_id > 0) + break; +#endif + + /* if we didn't specifically request memory on this socket */ + skip = active_sockets != 0 && + internal_config.socket_mem[socket_id] == 0; + /* ...or if we didn't specifically request memory on *any* + * socket, and this is not master lcore + */ + master_lcore_socket = rte_lcore_to_socket_id(cfg->master_lcore); + skip |= active_sockets == 0 && socket_id != master_lcore_socket; + + if (skip) { + RTE_LOG(DEBUG, EAL, "Will not preallocate memory on socket %u\n", + socket_id); + continue; + } + + /* max amount of memory on this socket */ + max_socket_mem = (active_sockets != 0 ? 
+ internal_config.socket_mem[socket_id] : + internal_config.memory) + + extra_mem_per_socket; + cur_socket_mem = 0; + + for (hpi_idx = 0; hpi_idx < hp_sizes; hpi_idx++) { + uint64_t max_pagesz_mem, cur_pagesz_mem = 0; + uint64_t hugepage_sz; + struct hugepage_info *hpi; + int type_msl_idx, max_segs, total_segs = 0; + + hpi = &internal_config.hugepage_info[hpi_idx]; + hugepage_sz = hpi->hugepage_sz; + + /* check if pages are actually available */ + if (hpi->num_pages[socket_id] == 0) + continue; + + max_segs = RTE_MAX_MEMSEG_PER_TYPE; + max_pagesz_mem = max_socket_mem - cur_socket_mem; + + /* make it multiple of page size */ + max_pagesz_mem = RTE_ALIGN_FLOOR(max_pagesz_mem, + hugepage_sz); + + RTE_LOG(DEBUG, EAL, "Attempting to preallocate " + "%" PRIu64 "M on socket %i\n", + max_pagesz_mem >> 20, socket_id); + + type_msl_idx = 0; + while (cur_pagesz_mem < max_pagesz_mem && + total_segs < max_segs) { + uint64_t cur_mem; + unsigned int n_segs; + + if (msl_idx >= RTE_MAX_MEMSEG_LISTS) { + RTE_LOG(ERR, EAL, + "No more space in memseg lists, please increase %s\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); + return -1; + } + + msl = &mcfg->memsegs[msl_idx]; + + cur_mem = get_mem_amount(hugepage_sz, + max_pagesz_mem); + n_segs = cur_mem / hugepage_sz; + + if (alloc_memseg_list(msl, hugepage_sz, n_segs, + socket_id, type_msl_idx)) { + /* failing to allocate a memseg list is + * a serious error. + */ + RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n"); + return -1; + } + + if (alloc_va_space(msl)) { + /* if we couldn't allocate VA space, we + * can try with smaller page sizes. + */ + RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list, retrying with different page size\n"); + /* deallocate memseg list */ + if (free_memseg_list(msl)) + return -1; + break; + } + + total_segs += msl->memseg_arr.len; + cur_pagesz_mem = total_segs * hugepage_sz; + type_msl_idx++; + msl_idx++; + } + cur_socket_mem += cur_pagesz_mem; + } + if (cur_socket_mem == 0) { + RTE_LOG(ERR, EAL, "Cannot allocate VA space on socket %u\n", + socket_id); + return -1; + } + } + + return 0; +} + +static int __rte_unused +memseg_primary_init(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + struct memtype { + uint64_t page_sz; + int socket_id; + } *memtypes = NULL; + int i, hpi_idx, msl_idx, ret = -1; /* fail unless told to succeed */ + struct rte_memseg_list *msl; + uint64_t max_mem, max_mem_per_type; + unsigned int max_seglists_per_type; + unsigned int n_memtypes, cur_type; + + /* no-huge does not need this at all */ + if (internal_config.no_hugetlbfs) + return 0; + + /* + * figuring out amount of memory we're going to have is a long and very + * involved process. the basic element we're operating with is a memory + * type, defined as a combination of NUMA node ID and page size (so that + * e.g. 2 sockets with 2 page sizes yield 4 memory types in total). + * + * deciding amount of memory going towards each memory type is a + * balancing act between maximum segments per type, maximum memory per + * type, and number of detected NUMA nodes. the goal is to make sure + * each memory type gets at least one memseg list. + * + * the total amount of memory is limited by RTE_MAX_MEM_MB value. + * + * the total amount of memory per type is limited by either + * RTE_MAX_MEM_MB_PER_TYPE, or by RTE_MAX_MEM_MB divided by the number + * of detected NUMA nodes. additionally, maximum number of segments per + * type is also limited by RTE_MAX_MEMSEG_PER_TYPE. 
this is because for + * smaller page sizes, it can take hundreds of thousands of segments to + * reach the above specified per-type memory limits. + * + * additionally, each type may have multiple memseg lists associated + * with it, each limited by either RTE_MAX_MEM_MB_PER_LIST for bigger + * page sizes, or RTE_MAX_MEMSEG_PER_LIST segments for smaller ones. + * + * the number of memseg lists per type is decided based on the above + * limits, and also taking number of detected NUMA nodes, to make sure + * that we don't run out of memseg lists before we populate all NUMA + * nodes with memory. + * + * we do this in three stages. first, we collect the number of types. + * then, we figure out memory constraints and populate the list of + * would-be memseg lists. then, we go ahead and allocate the memseg + * lists. + */ + + /* create space for mem types */ + n_memtypes = internal_config.num_hugepage_sizes * rte_socket_count(); + memtypes = calloc(n_memtypes, sizeof(*memtypes)); + if (memtypes == NULL) { + RTE_LOG(ERR, EAL, "Cannot allocate space for memory types\n"); + return -1; + } + + /* populate mem types */ + cur_type = 0; + for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes; + hpi_idx++) { + struct hugepage_info *hpi; + uint64_t hugepage_sz; + + hpi = &internal_config.hugepage_info[hpi_idx]; + hugepage_sz = hpi->hugepage_sz; + + for (i = 0; i < (int) rte_socket_count(); i++, cur_type++) { + int socket_id = rte_socket_id_by_idx(i); + +#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES + /* we can still sort pages by socket in legacy mode */ + if (!internal_config.legacy_mem && socket_id > 0) + break; +#endif + memtypes[cur_type].page_sz = hugepage_sz; + memtypes[cur_type].socket_id = socket_id; + + RTE_LOG(DEBUG, EAL, "Detected memory type: " + "socket_id:%u hugepage_sz:%" PRIu64 "\n", + socket_id, hugepage_sz); + } + } + /* number of memtypes could have been lower due to no NUMA support */ + n_memtypes = cur_type; + + /* set up limits for types */ + max_mem = (uint64_t)RTE_MAX_MEM_MB << 20; + max_mem_per_type = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20, + max_mem / n_memtypes); + /* + * limit maximum number of segment lists per type to ensure there's + * space for memseg lists for all NUMA nodes with all page sizes + */ + max_seglists_per_type = RTE_MAX_MEMSEG_LISTS / n_memtypes; + + if (max_seglists_per_type == 0) { + RTE_LOG(ERR, EAL, "Cannot accommodate all memory types, please increase %s\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); + goto out; + } + + /* go through all mem types and create segment lists */ + msl_idx = 0; + for (cur_type = 0; cur_type < n_memtypes; cur_type++) { + unsigned int cur_seglist, n_seglists, n_segs; + unsigned int max_segs_per_type, max_segs_per_list; + struct memtype *type = &memtypes[cur_type]; + uint64_t max_mem_per_list, pagesz; + int socket_id; + + pagesz = type->page_sz; + socket_id = type->socket_id; + + /* + * we need to create segment lists for this type. we must take + * into account the following things: + * + * 1. total amount of memory we can use for this memory type + * 2. total amount of memory per memseg list allowed + * 3. number of segments needed to fit the amount of memory + * 4. number of segments allowed per type + * 5. number of segments allowed per memseg list + * 6. 
number of memseg lists we are allowed to take up + */ + + /* calculate how much segments we will need in total */ + max_segs_per_type = max_mem_per_type / pagesz; + /* limit number of segments to maximum allowed per type */ + max_segs_per_type = RTE_MIN(max_segs_per_type, + (unsigned int)RTE_MAX_MEMSEG_PER_TYPE); + /* limit number of segments to maximum allowed per list */ + max_segs_per_list = RTE_MIN(max_segs_per_type, + (unsigned int)RTE_MAX_MEMSEG_PER_LIST); + + /* calculate how much memory we can have per segment list */ + max_mem_per_list = RTE_MIN(max_segs_per_list * pagesz, + (uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20); + + /* calculate how many segments each segment list will have */ + n_segs = RTE_MIN(max_segs_per_list, max_mem_per_list / pagesz); + + /* calculate how many segment lists we can have */ + n_seglists = RTE_MIN(max_segs_per_type / n_segs, + max_mem_per_type / max_mem_per_list); + + /* limit number of segment lists according to our maximum */ + n_seglists = RTE_MIN(n_seglists, max_seglists_per_type); + + RTE_LOG(DEBUG, EAL, "Creating %i segment lists: " + "n_segs:%i socket_id:%i hugepage_sz:%" PRIu64 "\n", + n_seglists, n_segs, socket_id, pagesz); + + /* create all segment lists */ + for (cur_seglist = 0; cur_seglist < n_seglists; cur_seglist++) { + if (msl_idx >= RTE_MAX_MEMSEG_LISTS) { + RTE_LOG(ERR, EAL, + "No more space in memseg lists, please increase %s\n", + RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS)); + goto out; + } + msl = &mcfg->memsegs[msl_idx++]; + + if (alloc_memseg_list(msl, pagesz, n_segs, + socket_id, cur_seglist)) + goto out; + + if (alloc_va_space(msl)) { + RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n"); + goto out; + } + } + } + /* we're successful */ + ret = 0; +out: + free(memtypes); + return ret; +} + +static int +memseg_secondary_init(void) +{ + struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; + int msl_idx = 0; + struct rte_memseg_list *msl; + + for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) { + + msl = &mcfg->memsegs[msl_idx]; + + /* skip empty memseg lists */ + if (msl->memseg_arr.len == 0) + continue; + + if (rte_fbarray_attach(&msl->memseg_arr)) { + RTE_LOG(ERR, EAL, "Cannot attach to primary process memseg lists\n"); + return -1; + } + + /* preallocate VA space */ + if (alloc_va_space(msl)) { + RTE_LOG(ERR, EAL, "Cannot preallocate VA space for hugepage memory\n"); + return -1; + } + } + + return 0; +} + +int +rte_eal_memseg_init(void) +{ + /* increase rlimit to maximum */ + struct rlimit lim; + + if (getrlimit(RLIMIT_NOFILE, &lim) == 0) { + /* set limit to maximum */ + lim.rlim_cur = lim.rlim_max; + + if (setrlimit(RLIMIT_NOFILE, &lim) < 0) { + RTE_LOG(DEBUG, EAL, "Setting maximum number of open files failed: %s\n", + strerror(errno)); + } else { + RTE_LOG(DEBUG, EAL, "Setting maximum number of open files to %" + PRIu64 "\n", + (uint64_t)lim.rlim_cur); + } + } else { + RTE_LOG(ERR, EAL, "Cannot get current resource limits\n"); + } +#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES + if (!internal_config.legacy_mem && rte_socket_count() > 1) { + RTE_LOG(WARNING, EAL, "DPDK is running on a NUMA system, but is compiled without NUMA support.\n"); + RTE_LOG(WARNING, EAL, "This will have adverse consequences for performance and usability.\n"); + RTE_LOG(WARNING, EAL, "Please use --"OPT_LEGACY_MEM" option, or recompile with NUMA support.\n"); + } +#endif + + return rte_eal_process_type() == RTE_PROC_PRIMARY ? 
+#ifndef RTE_ARCH_64 + memseg_primary_init_32() : +#else + memseg_primary_init() : +#endif + memseg_secondary_init(); +} diff --git a/lib/librte_eal/linux/eal_thread.c b/lib/librte_eal/linux/eal_thread.c new file mode 100644 index 0000000000..379773b683 --- /dev/null +++ b/lib/librte_eal/linux/eal_thread.c @@ -0,0 +1,188 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_thread.h" + +RTE_DEFINE_PER_LCORE(unsigned, _lcore_id) = LCORE_ID_ANY; +RTE_DEFINE_PER_LCORE(unsigned, _socket_id) = (unsigned)SOCKET_ID_ANY; +RTE_DEFINE_PER_LCORE(rte_cpuset_t, _cpuset); + +/* + * Send a message to a slave lcore identified by slave_id to call a + * function f with argument arg. Once the execution is done, the + * remote lcore switch in FINISHED state. + */ +int +rte_eal_remote_launch(int (*f)(void *), void *arg, unsigned slave_id) +{ + int n; + char c = 0; + int m2s = lcore_config[slave_id].pipe_master2slave[1]; + int s2m = lcore_config[slave_id].pipe_slave2master[0]; + + if (lcore_config[slave_id].state != WAIT) + return -EBUSY; + + lcore_config[slave_id].f = f; + lcore_config[slave_id].arg = arg; + + /* send message */ + n = 0; + while (n == 0 || (n < 0 && errno == EINTR)) + n = write(m2s, &c, 1); + if (n < 0) + rte_panic("cannot write on configuration pipe\n"); + + /* wait ack */ + do { + n = read(s2m, &c, 1); + } while (n < 0 && errno == EINTR); + + if (n <= 0) + rte_panic("cannot read on configuration pipe\n"); + + return 0; +} + +/* set affinity for current EAL thread */ +static int +eal_thread_set_affinity(void) +{ + unsigned lcore_id = rte_lcore_id(); + + /* acquire system unique id */ + rte_gettid(); + + /* update EAL thread core affinity */ + return rte_thread_set_affinity(&lcore_config[lcore_id].cpuset); +} + +void eal_thread_init_master(unsigned lcore_id) +{ + /* set the lcore ID in per-lcore memory area */ + RTE_PER_LCORE(_lcore_id) = lcore_id; + + /* set CPU affinity */ + if (eal_thread_set_affinity() < 0) + rte_panic("cannot set affinity\n"); +} + +/* main loop of threads */ +__attribute__((noreturn)) void * +eal_thread_loop(__attribute__((unused)) void *arg) +{ + char c; + int n, ret; + unsigned lcore_id; + pthread_t thread_id; + int m2s, s2m; + char cpuset[RTE_CPU_AFFINITY_STR_LEN]; + + thread_id = pthread_self(); + + /* retrieve our lcore_id from the configuration structure */ + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + if (thread_id == lcore_config[lcore_id].thread_id) + break; + } + if (lcore_id == RTE_MAX_LCORE) + rte_panic("cannot retrieve lcore id\n"); + + m2s = lcore_config[lcore_id].pipe_master2slave[0]; + s2m = lcore_config[lcore_id].pipe_slave2master[1]; + + /* set the lcore ID in per-lcore memory area */ + RTE_PER_LCORE(_lcore_id) = lcore_id; + + /* set CPU affinity */ + if (eal_thread_set_affinity() < 0) + rte_panic("cannot set affinity\n"); + + ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset)); + + RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%zx;cpuset=[%s%s])\n", + lcore_id, (uintptr_t)thread_id, cpuset, ret == 0 ? 
"" : "..."); + + /* read on our pipe to get commands */ + while (1) { + void *fct_arg; + + /* wait command */ + do { + n = read(m2s, &c, 1); + } while (n < 0 && errno == EINTR); + + if (n <= 0) + rte_panic("cannot read on configuration pipe\n"); + + lcore_config[lcore_id].state = RUNNING; + + /* send ack */ + n = 0; + while (n == 0 || (n < 0 && errno == EINTR)) + n = write(s2m, &c, 1); + if (n < 0) + rte_panic("cannot write on configuration pipe\n"); + + if (lcore_config[lcore_id].f == NULL) + rte_panic("NULL function pointer\n"); + + /* call the function and store the return value */ + fct_arg = lcore_config[lcore_id].arg; + ret = lcore_config[lcore_id].f(fct_arg); + lcore_config[lcore_id].ret = ret; + rte_wmb(); + + /* when a service core returns, it should go directly to WAIT + * state, because the application will not lcore_wait() for it. + */ + if (lcore_config[lcore_id].core_role == ROLE_SERVICE) + lcore_config[lcore_id].state = WAIT; + else + lcore_config[lcore_id].state = FINISHED; + } + + /* never reached */ + /* pthread_exit(NULL); */ + /* return NULL; */ +} + +/* require calling thread tid by gettid() */ +int rte_sys_gettid(void) +{ + return (int)syscall(SYS_gettid); +} + +int rte_thread_setname(pthread_t id, const char *name) +{ + int ret = ENOSYS; +#if defined(__GLIBC__) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 12) + ret = pthread_setname_np(id, name); +#endif +#endif + RTE_SET_USED(id); + RTE_SET_USED(name); + return -ret; +} diff --git a/lib/librte_eal/linux/eal_timer.c b/lib/librte_eal/linux/eal_timer.c new file mode 100644 index 0000000000..a904a8297c --- /dev/null +++ b/lib/librte_eal/linux/eal_timer.c @@ -0,0 +1,232 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation. + * Copyright(c) 2012-2013 6WIND S.A. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" +#include "eal_internal_cfg.h" + +enum timer_source eal_timer_source = EAL_TIMER_HPET; + +#ifdef RTE_LIBEAL_USE_HPET + +#define DEV_HPET "/dev/hpet" + +/* Maximum number of counters. */ +#define HPET_TIMER_NUM 3 + +/* General capabilities register */ +#define CLK_PERIOD_SHIFT 32 /* Clock period shift. */ +#define CLK_PERIOD_MASK 0xffffffff00000000ULL /* Clock period mask. */ + +/** + * HPET timer registers. From the Intel IA-PC HPET (High Precision Event + * Timers) Specification. + */ +struct eal_hpet_regs { + /* Memory-mapped, software visible registers */ + uint64_t capabilities; /**< RO General Capabilities Register. */ + uint64_t reserved0; /**< Reserved for future use. */ + uint64_t config; /**< RW General Configuration Register. */ + uint64_t reserved1; /**< Reserved for future use. */ + uint64_t isr; /**< RW Clear General Interrupt Status. */ + uint64_t reserved2[25]; /**< Reserved for future use. */ + union { + uint64_t counter; /**< RW Main Counter Value Register. */ + struct { + uint32_t counter_l; /**< RW Main Counter Low. */ + uint32_t counter_h; /**< RW Main Counter High. */ + }; + }; + uint64_t reserved3; /**< Reserved for future use. */ + struct { + uint64_t config; /**< RW Timer Config and Capability Reg. */ + uint64_t comp; /**< RW Timer Comparator Value Register. */ + uint64_t fsb; /**< RW FSB Interrupt Route Register. */ + uint64_t reserved4; /**< Reserved for future use. */ + } timers[HPET_TIMER_NUM]; /**< Set of HPET timers. 
*/ +}; + +/* Mmap'd hpet registers */ +static volatile struct eal_hpet_regs *eal_hpet = NULL; + +/* Period at which the HPET counter increments in + * femtoseconds (10^-15 seconds). */ +static uint32_t eal_hpet_resolution_fs = 0; + +/* Frequency of the HPET counter in Hz */ +static uint64_t eal_hpet_resolution_hz = 0; + +/* Incremented 4 times during one 32bits hpet full count */ +static uint32_t eal_hpet_msb; + +static pthread_t msb_inc_thread_id; + +/* + * This function runs on a specific thread to update a global variable + * containing used to process MSB of the HPET (unfortunately, we need + * this because hpet is 32 bits by default under linux). + */ +static void * +hpet_msb_inc(__attribute__((unused)) void *arg) +{ + uint32_t t; + + while (1) { + t = (eal_hpet->counter_l >> 30); + if (t != (eal_hpet_msb & 3)) + eal_hpet_msb ++; + sleep(10); + } + return NULL; +} + +uint64_t +rte_get_hpet_hz(void) +{ + if(internal_config.no_hpet) + rte_panic("Error, HPET called, but no HPET present\n"); + + return eal_hpet_resolution_hz; +} + +uint64_t +rte_get_hpet_cycles(void) +{ + uint32_t t, msb; + uint64_t ret; + + if(internal_config.no_hpet) + rte_panic("Error, HPET called, but no HPET present\n"); + + t = eal_hpet->counter_l; + msb = eal_hpet_msb; + ret = (msb + 2 - (t >> 30)) / 4; + ret <<= 32; + ret += t; + return ret; +} + +#endif + +#ifdef RTE_LIBEAL_USE_HPET +/* + * Open and mmap /dev/hpet (high precision event timer) that will + * provide our time reference. + */ +int +rte_eal_hpet_init(int make_default) +{ + int fd, ret; + + if (internal_config.no_hpet) { + RTE_LOG(NOTICE, EAL, "HPET is disabled\n"); + return -1; + } + + fd = open(DEV_HPET, O_RDONLY); + if (fd < 0) { + RTE_LOG(ERR, EAL, "ERROR: Cannot open "DEV_HPET": %s!\n", + strerror(errno)); + internal_config.no_hpet = 1; + return -1; + } + eal_hpet = mmap(NULL, 1024, PROT_READ, MAP_SHARED, fd, 0); + if (eal_hpet == MAP_FAILED) { + RTE_LOG(ERR, EAL, "ERROR: Cannot mmap "DEV_HPET"!\n" + "Please enable CONFIG_HPET_MMAP in your kernel configuration " + "to allow HPET support.\n" + "To run without using HPET, set CONFIG_RTE_LIBEAL_USE_HPET=n " + "in your build configuration or use '--no-hpet' EAL flag.\n"); + close(fd); + internal_config.no_hpet = 1; + return -1; + } + close(fd); + + eal_hpet_resolution_fs = (uint32_t)((eal_hpet->capabilities & + CLK_PERIOD_MASK) >> + CLK_PERIOD_SHIFT); + + eal_hpet_resolution_hz = (1000ULL*1000ULL*1000ULL*1000ULL*1000ULL) / + (uint64_t)eal_hpet_resolution_fs; + + RTE_LOG(INFO, EAL, "HPET frequency is ~%"PRIu64" kHz\n", + eal_hpet_resolution_hz/1000); + + eal_hpet_msb = (eal_hpet->counter_l >> 30); + + /* create a thread that will increment a global variable for + * msb (hpet is 32 bits by default under linux) */ + ret = rte_ctrl_thread_create(&msb_inc_thread_id, "hpet-msb-inc", NULL, + hpet_msb_inc, NULL); + if (ret != 0) { + RTE_LOG(ERR, EAL, "ERROR: Cannot create HPET timer thread!\n"); + internal_config.no_hpet = 1; + return -1; + } + + if (make_default) + eal_timer_source = EAL_TIMER_HPET; + return 0; +} +#endif + +uint64_t +get_tsc_freq(void) +{ +#ifdef CLOCK_MONOTONIC_RAW +#define NS_PER_SEC 1E9 +#define CYC_PER_10MHZ 1E7 + + struct timespec sleeptime = {.tv_nsec = NS_PER_SEC / 10 }; /* 1/10 second */ + + struct timespec t_start, t_end; + uint64_t tsc_hz; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &t_start) == 0) { + uint64_t ns, end, start = rte_rdtsc(); + nanosleep(&sleeptime,NULL); + clock_gettime(CLOCK_MONOTONIC_RAW, &t_end); + end = rte_rdtsc(); + ns = ((t_end.tv_sec - t_start.tv_sec) * 
NS_PER_SEC); + ns += (t_end.tv_nsec - t_start.tv_nsec); + + double secs = (double)ns/NS_PER_SEC; + tsc_hz = (uint64_t)((end - start)/secs); + /* Round up to 10Mhz. 1E7 ~ 10Mhz */ + return RTE_ALIGN_MUL_NEAR(tsc_hz, CYC_PER_10MHZ); + } +#endif + return 0; +} + +int +rte_eal_timer_init(void) +{ + + eal_timer_source = EAL_TIMER_TSC; + + set_tsc_freq(); + return 0; +} diff --git a/lib/librte_eal/linux/eal_vfio.c b/lib/librte_eal/linux/eal_vfio.c new file mode 100644 index 0000000000..4502aefed3 --- /dev/null +++ b/lib/librte_eal/linux/eal_vfio.c @@ -0,0 +1,2184 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "eal_filesystem.h" +#include "eal_memcfg.h" +#include "eal_vfio.h" +#include "eal_private.h" + +#ifdef VFIO_PRESENT + +#define VFIO_MEM_EVENT_CLB_NAME "vfio_mem_event_clb" + +/* hot plug/unplug of VFIO groups may cause all DMA maps to be dropped. we can + * recreate the mappings for DPDK segments, but we cannot do so for memory that + * was registered by the user themselves, so we need to store the user mappings + * somewhere, to recreate them later. + */ +#define VFIO_MAX_USER_MEM_MAPS 256 +struct user_mem_map { + uint64_t addr; + uint64_t iova; + uint64_t len; +}; + +struct user_mem_maps { + rte_spinlock_recursive_t lock; + int n_maps; + struct user_mem_map maps[VFIO_MAX_USER_MEM_MAPS]; +}; + +struct vfio_config { + int vfio_enabled; + int vfio_container_fd; + int vfio_active_groups; + const struct vfio_iommu_type *vfio_iommu_type; + struct vfio_group vfio_groups[VFIO_MAX_GROUPS]; + struct user_mem_maps mem_maps; +}; + +/* per-process VFIO config */ +static struct vfio_config vfio_cfgs[VFIO_MAX_CONTAINERS]; +static struct vfio_config *default_vfio_cfg = &vfio_cfgs[0]; + +static int vfio_type1_dma_map(int); +static int vfio_type1_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int); +static int vfio_spapr_dma_map(int); +static int vfio_spapr_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int); +static int vfio_noiommu_dma_map(int); +static int vfio_noiommu_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int); +static int vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr, + uint64_t iova, uint64_t len, int do_map); + +/* IOMMU types we support */ +static const struct vfio_iommu_type iommu_types[] = { + /* x86 IOMMU, otherwise known as type 1 */ + { + .type_id = RTE_VFIO_TYPE1, + .name = "Type 1", + .dma_map_func = &vfio_type1_dma_map, + .dma_user_map_func = &vfio_type1_dma_mem_map + }, + /* ppc64 IOMMU, otherwise known as spapr */ + { + .type_id = RTE_VFIO_SPAPR, + .name = "sPAPR", + .dma_map_func = &vfio_spapr_dma_map, + .dma_user_map_func = &vfio_spapr_dma_mem_map + }, + /* IOMMU-less mode */ + { + .type_id = RTE_VFIO_NOIOMMU, + .name = "No-IOMMU", + .dma_map_func = &vfio_noiommu_dma_map, + .dma_user_map_func = &vfio_noiommu_dma_mem_map + }, +}; + +static int +is_null_map(const struct user_mem_map *map) +{ + return map->addr == 0 && map->iova == 0 && map->len == 0; +} + +/* we may need to merge user mem maps together in case of user mapping/unmapping + * chunks of memory, so we'll need a comparator function to sort segments. 
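get_tsc_freq() above estimates the TSC rate by bracketing a 100 ms nanosleep() between two rte_rdtsc() reads, dividing the elapsed ticks by the CLOCK_MONOTONIC_RAW elapsed time, and rounding to the nearest 10 MHz. Below is a standalone sketch of the same measurement, not part of this patch, assuming an x86 build where the compiler's __rdtsc() intrinsic can stand in for rte_rdtsc().

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <x86intrin.h>	/* __rdtsc(), x86 only */

static uint64_t
estimate_tsc_hz(void)
{
	struct timespec sleeptime = { .tv_nsec = 100000000 }; /* 100 ms */
	struct timespec t_start, t_end;
	uint64_t start, end, ns;

	if (clock_gettime(CLOCK_MONOTONIC_RAW, &t_start) != 0)
		return 0;
	start = __rdtsc();
	nanosleep(&sleeptime, NULL);
	clock_gettime(CLOCK_MONOTONIC_RAW, &t_end);
	end = __rdtsc();

	/* elapsed wall-clock time in nanoseconds */
	ns = (uint64_t)(t_end.tv_sec - t_start.tv_sec) * 1000000000ULL;
	ns += t_end.tv_nsec - t_start.tv_nsec;

	/* ticks per second; the EAL code additionally rounds this to 10 MHz */
	return (uint64_t)((double)(end - start) / ((double)ns / 1e9));
}

int
main(void)
{
	printf("estimated TSC: ~%llu Hz\n",
			(unsigned long long)estimate_tsc_hz());
	return 0;
}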
+ */ +static int +user_mem_map_cmp(const void *a, const void *b) +{ + const struct user_mem_map *umm_a = a; + const struct user_mem_map *umm_b = b; + + /* move null entries to end */ + if (is_null_map(umm_a)) + return 1; + if (is_null_map(umm_b)) + return -1; + + /* sort by iova first */ + if (umm_a->iova < umm_b->iova) + return -1; + if (umm_a->iova > umm_b->iova) + return 1; + + if (umm_a->addr < umm_b->addr) + return -1; + if (umm_a->addr > umm_b->addr) + return 1; + + if (umm_a->len < umm_b->len) + return -1; + if (umm_a->len > umm_b->len) + return 1; + + return 0; +} + +/* adjust user map entry. this may result in shortening of existing map, or in + * splitting existing map in two pieces. + */ +static void +adjust_map(struct user_mem_map *src, struct user_mem_map *end, + uint64_t remove_va_start, uint64_t remove_len) +{ + /* if va start is same as start address, we're simply moving start */ + if (remove_va_start == src->addr) { + src->addr += remove_len; + src->iova += remove_len; + src->len -= remove_len; + } else if (remove_va_start + remove_len == src->addr + src->len) { + /* we're shrinking mapping from the end */ + src->len -= remove_len; + } else { + /* we're blowing a hole in the middle */ + struct user_mem_map tmp; + uint64_t total_len = src->len; + + /* adjust source segment length */ + src->len = remove_va_start - src->addr; + + /* create temporary segment in the middle */ + tmp.addr = src->addr + src->len; + tmp.iova = src->iova + src->len; + tmp.len = remove_len; + + /* populate end segment - this one we will be keeping */ + end->addr = tmp.addr + tmp.len; + end->iova = tmp.iova + tmp.len; + end->len = total_len - src->len - tmp.len; + } +} + +/* try merging two maps into one, return 1 if succeeded */ +static int +merge_map(struct user_mem_map *left, struct user_mem_map *right) +{ + if (left->addr + left->len != right->addr) + return 0; + if (left->iova + left->len != right->iova) + return 0; + + left->len += right->len; + + memset(right, 0, sizeof(*right)); + + return 1; +} + +static struct user_mem_map * +find_user_mem_map(struct user_mem_maps *user_mem_maps, uint64_t addr, + uint64_t iova, uint64_t len) +{ + uint64_t va_end = addr + len; + uint64_t iova_end = iova + len; + int i; + + for (i = 0; i < user_mem_maps->n_maps; i++) { + struct user_mem_map *map = &user_mem_maps->maps[i]; + uint64_t map_va_end = map->addr + map->len; + uint64_t map_iova_end = map->iova + map->len; + + /* check start VA */ + if (addr < map->addr || addr >= map_va_end) + continue; + /* check if VA end is within boundaries */ + if (va_end <= map->addr || va_end > map_va_end) + continue; + + /* check start IOVA */ + if (iova < map->iova || iova >= map_iova_end) + continue; + /* check if IOVA end is within boundaries */ + if (iova_end <= map->iova || iova_end > map_iova_end) + continue; + + /* we've found our map */ + return map; + } + return NULL; +} + +/* this will sort all user maps, and merge/compact any adjacent maps */ +static void +compact_user_maps(struct user_mem_maps *user_mem_maps) +{ + int i, n_merged, cur_idx; + + qsort(user_mem_maps->maps, user_mem_maps->n_maps, + sizeof(user_mem_maps->maps[0]), user_mem_map_cmp); + + /* we'll go over the list backwards when merging */ + n_merged = 0; + for (i = user_mem_maps->n_maps - 2; i >= 0; i--) { + struct user_mem_map *l, *r; + + l = &user_mem_maps->maps[i]; + r = &user_mem_maps->maps[i + 1]; + + if (is_null_map(l) || is_null_map(r)) + continue; + + if (merge_map(l, r)) + n_merged++; + } + + /* the entries are still sorted, but now they have 
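adjust_map() above handles three unmap shapes: trimming from the front, trimming from the back, and punching a hole in the middle, which splits one mapping entry into two. Below is a tiny self-contained check of the middle-hole arithmetic with made-up addresses, not part of this patch; it writes out the same values the function would produce.

#include <assert.h>
#include <stdint.h>

struct mem_map { uint64_t addr, iova, len; };	/* stand-in for user_mem_map */

int
main(void)
{
	/* original mapping: VA/IOVA 0x1000..0x7000; unmap 0x3000..0x5000 */
	struct mem_map src = { .addr = 0x1000, .iova = 0x1000, .len = 0x6000 };
	struct mem_map end = { 0 };
	uint64_t remove_va_start = 0x3000, remove_len = 0x2000;
	uint64_t total_len = src.len;

	src.len = remove_va_start - src.addr;		/* left piece: 0x2000 */
	end.addr = remove_va_start + remove_len;	/* right piece at 0x5000 */
	end.iova = src.iova + (end.addr - src.addr);
	end.len = total_len - src.len - remove_len;	/* right piece: 0x2000 */

	assert(src.addr == 0x1000 && src.len == 0x2000);
	assert(end.addr == 0x5000 && end.iova == 0x5000 && end.len == 0x2000);
	return 0;
}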
holes in them, so + * walk through the list and remove the holes + */ + if (n_merged > 0) { + cur_idx = 0; + for (i = 0; i < user_mem_maps->n_maps; i++) { + if (!is_null_map(&user_mem_maps->maps[i])) { + struct user_mem_map *src, *dst; + + src = &user_mem_maps->maps[i]; + dst = &user_mem_maps->maps[cur_idx++]; + + if (src != dst) { + memcpy(dst, src, sizeof(*src)); + memset(src, 0, sizeof(*src)); + } + } + } + user_mem_maps->n_maps = cur_idx; + } +} + +static int +vfio_open_group_fd(int iommu_group_num) +{ + int vfio_group_fd; + char filename[PATH_MAX]; + struct rte_mp_msg mp_req, *mp_rep; + struct rte_mp_reply mp_reply = {0}; + struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; + struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; + + /* if primary, try to open the group */ + if (internal_config.process_type == RTE_PROC_PRIMARY) { + /* try regular group format */ + snprintf(filename, sizeof(filename), + VFIO_GROUP_FMT, iommu_group_num); + vfio_group_fd = open(filename, O_RDWR); + if (vfio_group_fd < 0) { + /* if file not found, it's not an error */ + if (errno != ENOENT) { + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename, + strerror(errno)); + return -1; + } + + /* special case: try no-IOMMU path as well */ + snprintf(filename, sizeof(filename), + VFIO_NOIOMMU_GROUP_FMT, + iommu_group_num); + vfio_group_fd = open(filename, O_RDWR); + if (vfio_group_fd < 0) { + if (errno != ENOENT) { + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename, + strerror(errno)); + return -1; + } + return 0; + } + /* noiommu group found */ + } + + return vfio_group_fd; + } + /* if we're in a secondary process, request group fd from the primary + * process via mp channel. + */ + p->req = SOCKET_REQ_GROUP; + p->group_num = iommu_group_num; + strcpy(mp_req.name, EAL_VFIO_MP); + mp_req.len_param = sizeof(*p); + mp_req.num_fds = 0; + + vfio_group_fd = -1; + if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && + mp_reply.nb_received == 1) { + mp_rep = &mp_reply.msgs[0]; + p = (struct vfio_mp_param *)mp_rep->param; + if (p->result == SOCKET_OK && mp_rep->num_fds == 1) { + vfio_group_fd = mp_rep->fds[0]; + } else if (p->result == SOCKET_NO_FD) { + RTE_LOG(ERR, EAL, " bad VFIO group fd\n"); + vfio_group_fd = 0; + } + } + + free(mp_reply.msgs); + if (vfio_group_fd < 0) + RTE_LOG(ERR, EAL, " cannot request group fd\n"); + return vfio_group_fd; +} + +static struct vfio_config * +get_vfio_cfg_by_group_num(int iommu_group_num) +{ + struct vfio_config *vfio_cfg; + int i, j; + + for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { + vfio_cfg = &vfio_cfgs[i]; + for (j = 0; j < VFIO_MAX_GROUPS; j++) { + if (vfio_cfg->vfio_groups[j].group_num == + iommu_group_num) + return vfio_cfg; + } + } + + return NULL; +} + +static int +vfio_get_group_fd(struct vfio_config *vfio_cfg, + int iommu_group_num) +{ + int i; + int vfio_group_fd; + struct vfio_group *cur_grp; + + /* check if we already have the group descriptor open */ + for (i = 0; i < VFIO_MAX_GROUPS; i++) + if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num) + return vfio_cfg->vfio_groups[i].fd; + + /* Lets see first if there is room for a new group */ + if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) { + RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n"); + return -1; + } + + /* Now lets get an index for the new group */ + for (i = 0; i < VFIO_MAX_GROUPS; i++) + if (vfio_cfg->vfio_groups[i].group_num == -1) { + cur_grp = &vfio_cfg->vfio_groups[i]; + break; + } + + /* This should not happen */ + if (i == VFIO_MAX_GROUPS) { + RTE_LOG(ERR, EAL, "No 
VFIO group free slot found\n"); + return -1; + } + + vfio_group_fd = vfio_open_group_fd(iommu_group_num); + if (vfio_group_fd < 0) { + RTE_LOG(ERR, EAL, "Failed to open group %d\n", iommu_group_num); + return -1; + } + + cur_grp->group_num = iommu_group_num; + cur_grp->fd = vfio_group_fd; + vfio_cfg->vfio_active_groups++; + + return vfio_group_fd; +} + +static struct vfio_config * +get_vfio_cfg_by_group_fd(int vfio_group_fd) +{ + struct vfio_config *vfio_cfg; + int i, j; + + for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { + vfio_cfg = &vfio_cfgs[i]; + for (j = 0; j < VFIO_MAX_GROUPS; j++) + if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd) + return vfio_cfg; + } + + return NULL; +} + +static struct vfio_config * +get_vfio_cfg_by_container_fd(int container_fd) +{ + int i; + + if (container_fd == RTE_VFIO_DEFAULT_CONTAINER_FD) + return default_vfio_cfg; + + for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { + if (vfio_cfgs[i].vfio_container_fd == container_fd) + return &vfio_cfgs[i]; + } + + return NULL; +} + +int +rte_vfio_get_group_fd(int iommu_group_num) +{ + struct vfio_config *vfio_cfg; + + /* get the vfio_config it belongs to */ + vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num); + vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg; + + return vfio_get_group_fd(vfio_cfg, iommu_group_num); +} + +static int +get_vfio_group_idx(int vfio_group_fd) +{ + struct vfio_config *vfio_cfg; + int i, j; + + for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { + vfio_cfg = &vfio_cfgs[i]; + for (j = 0; j < VFIO_MAX_GROUPS; j++) + if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd) + return j; + } + + return -1; +} + +static void +vfio_group_device_get(int vfio_group_fd) +{ + struct vfio_config *vfio_cfg; + int i; + + vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, " invalid group fd!\n"); + return; + } + + i = get_vfio_group_idx(vfio_group_fd); + if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) + RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i); + else + vfio_cfg->vfio_groups[i].devices++; +} + +static void +vfio_group_device_put(int vfio_group_fd) +{ + struct vfio_config *vfio_cfg; + int i; + + vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, " invalid group fd!\n"); + return; + } + + i = get_vfio_group_idx(vfio_group_fd); + if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) + RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i); + else + vfio_cfg->vfio_groups[i].devices--; +} + +static int +vfio_group_device_count(int vfio_group_fd) +{ + struct vfio_config *vfio_cfg; + int i; + + vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, " invalid group fd!\n"); + return -1; + } + + i = get_vfio_group_idx(vfio_group_fd); + if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) { + RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i); + return -1; + } + + return vfio_cfg->vfio_groups[i].devices; +} + +static void +vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len, + void *arg __rte_unused) +{ + rte_iova_t iova_start, iova_expected; + struct rte_memseg_list *msl; + struct rte_memseg *ms; + size_t cur_len = 0; + uint64_t va_start; + + msl = rte_mem_virt2memseg_list(addr); + + /* for IOVA as VA mode, no need to care for IOVA addresses */ + if (rte_eal_iova_mode() == RTE_IOVA_VA && msl->external == 0) { + uint64_t vfio_va = (uint64_t)(uintptr_t)addr; + if (type == RTE_MEM_EVENT_ALLOC) + vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va, + len, 1); + else + 
vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va, + len, 0); + return; + } + +#ifdef RTE_ARCH_PPC_64 + ms = rte_mem_virt2memseg(addr, msl); + while (cur_len < len) { + int idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); + + rte_fbarray_set_free(&msl->memseg_arr, idx); + cur_len += ms->len; + ++ms; + } + cur_len = 0; +#endif + /* memsegs are contiguous in memory */ + ms = rte_mem_virt2memseg(addr, msl); + + /* + * This memory is not guaranteed to be contiguous, but it still could + * be, or it could have some small contiguous chunks. Since the number + * of VFIO mappings is limited, and VFIO appears to not concatenate + * adjacent mappings, we have to do this ourselves. + * + * So, find contiguous chunks, then map them. + */ + va_start = ms->addr_64; + iova_start = iova_expected = ms->iova; + while (cur_len < len) { + bool new_contig_area = ms->iova != iova_expected; + bool last_seg = (len - cur_len) == ms->len; + bool skip_last = false; + + /* only do mappings when current contiguous area ends */ + if (new_contig_area) { + if (type == RTE_MEM_EVENT_ALLOC) + vfio_dma_mem_map(default_vfio_cfg, va_start, + iova_start, + iova_expected - iova_start, 1); + else + vfio_dma_mem_map(default_vfio_cfg, va_start, + iova_start, + iova_expected - iova_start, 0); + va_start = ms->addr_64; + iova_start = ms->iova; + } + /* some memory segments may have invalid IOVA */ + if (ms->iova == RTE_BAD_IOVA) { + RTE_LOG(DEBUG, EAL, "Memory segment at %p has bad IOVA, skipping\n", + ms->addr); + skip_last = true; + } + iova_expected = ms->iova + ms->len; + cur_len += ms->len; + ++ms; + + /* + * don't count previous segment, and don't attempt to + * dereference a potentially invalid pointer. + */ + if (skip_last && !last_seg) { + iova_expected = iova_start = ms->iova; + va_start = ms->addr_64; + } else if (!skip_last && last_seg) { + /* this is the last segment and we're not skipping */ + if (type == RTE_MEM_EVENT_ALLOC) + vfio_dma_mem_map(default_vfio_cfg, va_start, + iova_start, + iova_expected - iova_start, 1); + else + vfio_dma_mem_map(default_vfio_cfg, va_start, + iova_start, + iova_expected - iova_start, 0); + } + } +#ifdef RTE_ARCH_PPC_64 + cur_len = 0; + ms = rte_mem_virt2memseg(addr, msl); + while (cur_len < len) { + int idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); + + rte_fbarray_set_used(&msl->memseg_arr, idx); + cur_len += ms->len; + ++ms; + } +#endif +} + +static int +vfio_sync_default_container(void) +{ + struct rte_mp_msg mp_req, *mp_rep; + struct rte_mp_reply mp_reply = {0}; + struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; + struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; + int iommu_type_id; + unsigned int i; + + /* cannot be called from primary */ + if (rte_eal_process_type() != RTE_PROC_SECONDARY) + return -1; + + /* default container fd should have been opened in rte_vfio_enable() */ + if (!default_vfio_cfg->vfio_enabled || + default_vfio_cfg->vfio_container_fd < 0) { + RTE_LOG(ERR, EAL, "VFIO support is not initialized\n"); + return -1; + } + + /* find default container's IOMMU type */ + p->req = SOCKET_REQ_IOMMU_TYPE; + strcpy(mp_req.name, EAL_VFIO_MP); + mp_req.len_param = sizeof(*p); + mp_req.num_fds = 0; + + iommu_type_id = -1; + if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && + mp_reply.nb_received == 1) { + mp_rep = &mp_reply.msgs[0]; + p = (struct vfio_mp_param *)mp_rep->param; + if (p->result == SOCKET_OK) + iommu_type_id = p->iommu_type_id; + } + free(mp_reply.msgs); + if (iommu_type_id < 0) { + RTE_LOG(ERR, EAL, "Could not get IOMMU type for 
default container\n"); + return -1; + } + + /* we now have an fd for default container, as well as its IOMMU type. + * now, set up default VFIO container config to match. + */ + for (i = 0; i < RTE_DIM(iommu_types); i++) { + const struct vfio_iommu_type *t = &iommu_types[i]; + if (t->type_id != iommu_type_id) + continue; + + /* we found our IOMMU type */ + default_vfio_cfg->vfio_iommu_type = t; + + return 0; + } + RTE_LOG(ERR, EAL, "Could not find IOMMU type id (%i)\n", + iommu_type_id); + return -1; +} + +int +rte_vfio_clear_group(int vfio_group_fd) +{ + int i; + struct vfio_config *vfio_cfg; + + vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, " invalid group fd!\n"); + return -1; + } + + i = get_vfio_group_idx(vfio_group_fd); + if (i < 0) + return -1; + vfio_cfg->vfio_groups[i].group_num = -1; + vfio_cfg->vfio_groups[i].fd = -1; + vfio_cfg->vfio_groups[i].devices = 0; + vfio_cfg->vfio_active_groups--; + + return 0; +} + +int +rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr, + int *vfio_dev_fd, struct vfio_device_info *device_info) +{ + struct vfio_group_status group_status = { + .argsz = sizeof(group_status) + }; + struct vfio_config *vfio_cfg; + struct user_mem_maps *user_mem_maps; + int vfio_container_fd; + int vfio_group_fd; + int iommu_group_num; + int i, ret; + + /* get group number */ + ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num); + if (ret == 0) { + RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n", + dev_addr); + return 1; + } + + /* if negative, something failed */ + if (ret < 0) + return -1; + + /* get the actual group fd */ + vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num); + if (vfio_group_fd < 0) + return -1; + + /* if group_fd == 0, that means the device isn't managed by VFIO */ + if (vfio_group_fd == 0) { + RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n", + dev_addr); + return 1; + } + + /* + * at this point, we know that this group is viable (meaning, all devices + * are either bound to VFIO or not bound to anything) + */ + + /* check if the group is viable */ + ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status); + if (ret) { + RTE_LOG(ERR, EAL, " %s cannot get group status, " + "error %i (%s)\n", dev_addr, errno, strerror(errno)); + close(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); + return -1; + } else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) { + RTE_LOG(ERR, EAL, " %s VFIO group is not viable! " + "Not all devices in IOMMU group bound to VFIO or unbound\n", + dev_addr); + close(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); + return -1; + } + + /* get the vfio_config it belongs to */ + vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num); + vfio_cfg = vfio_cfg ? 
vfio_cfg : default_vfio_cfg; + vfio_container_fd = vfio_cfg->vfio_container_fd; + user_mem_maps = &vfio_cfg->mem_maps; + + /* check if group does not have a container yet */ + if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) { + + /* add group to a container */ + ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER, + &vfio_container_fd); + if (ret) { + RTE_LOG(ERR, EAL, " %s cannot add VFIO group to container, " + "error %i (%s)\n", dev_addr, errno, strerror(errno)); + close(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); + return -1; + } + + /* + * pick an IOMMU type and set up DMA mappings for container + * + * needs to be done only once, only when first group is + * assigned to a container and only in primary process. + * Note this can happen several times with the hotplug + * functionality. + */ + if (internal_config.process_type == RTE_PROC_PRIMARY && + vfio_cfg->vfio_active_groups == 1 && + vfio_group_device_count(vfio_group_fd) == 0) { + const struct vfio_iommu_type *t; + + /* select an IOMMU type which we will be using */ + t = vfio_set_iommu_type(vfio_container_fd); + if (!t) { + RTE_LOG(ERR, EAL, + " %s failed to select IOMMU type\n", + dev_addr); + close(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); + return -1; + } + /* lock memory hotplug before mapping and release it + * after registering callback, to prevent races + */ + rte_mcfg_mem_read_lock(); + if (vfio_cfg == default_vfio_cfg) + ret = t->dma_map_func(vfio_container_fd); + else + ret = 0; + if (ret) { + RTE_LOG(ERR, EAL, + " %s DMA remapping failed, error %i (%s)\n", + dev_addr, errno, strerror(errno)); + close(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); + rte_mcfg_mem_read_unlock(); + return -1; + } + + vfio_cfg->vfio_iommu_type = t; + + /* re-map all user-mapped segments */ + rte_spinlock_recursive_lock(&user_mem_maps->lock); + + /* this IOMMU type may not support DMA mapping, but + * if we have mappings in the list - that means we have + * previously mapped something successfully, so we can + * be sure that DMA mapping is supported. + */ + for (i = 0; i < user_mem_maps->n_maps; i++) { + struct user_mem_map *map; + map = &user_mem_maps->maps[i]; + + ret = t->dma_user_map_func( + vfio_container_fd, + map->addr, map->iova, map->len, + 1); + if (ret) { + RTE_LOG(ERR, EAL, "Couldn't map user memory for DMA: " + "va: 0x%" PRIx64 " " + "iova: 0x%" PRIx64 " " + "len: 0x%" PRIu64 "\n", + map->addr, map->iova, + map->len); + rte_spinlock_recursive_unlock( + &user_mem_maps->lock); + rte_mcfg_mem_read_unlock(); + return -1; + } + } + rte_spinlock_recursive_unlock(&user_mem_maps->lock); + + /* register callback for mem events */ + if (vfio_cfg == default_vfio_cfg) + ret = rte_mem_event_callback_register( + VFIO_MEM_EVENT_CLB_NAME, + vfio_mem_event_callback, NULL); + else + ret = 0; + /* unlock memory hotplug */ + rte_mcfg_mem_read_unlock(); + + if (ret && rte_errno != ENOTSUP) { + RTE_LOG(ERR, EAL, "Could not install memory event callback for VFIO\n"); + return -1; + } + if (ret) + RTE_LOG(DEBUG, EAL, "Memory event callbacks not supported\n"); + else + RTE_LOG(DEBUG, EAL, "Installed memory event callback for VFIO\n"); + } + } else if (rte_eal_process_type() != RTE_PROC_PRIMARY && + vfio_cfg == default_vfio_cfg && + vfio_cfg->vfio_iommu_type == NULL) { + /* if we're not a primary process, we do not set up the VFIO + * container because it's already been set up by the primary + * process. 
instead, we simply ask the primary about VFIO type + * we are using, and set the VFIO config up appropriately. + */ + ret = vfio_sync_default_container(); + if (ret < 0) { + RTE_LOG(ERR, EAL, "Could not sync default VFIO container\n"); + close(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); + return -1; + } + /* we have successfully initialized VFIO, notify user */ + const struct vfio_iommu_type *t = + default_vfio_cfg->vfio_iommu_type; + RTE_LOG(NOTICE, EAL, " using IOMMU type %d (%s)\n", + t->type_id, t->name); + } + + /* get a file descriptor for the device */ + *vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr); + if (*vfio_dev_fd < 0) { + /* if we cannot get a device fd, this implies a problem with + * the VFIO group or the container not having IOMMU configured. + */ + + RTE_LOG(WARNING, EAL, "Getting a vfio_dev_fd for %s failed\n", + dev_addr); + close(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); + return -1; + } + + /* test and setup the device */ + ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info); + if (ret) { + RTE_LOG(ERR, EAL, " %s cannot get device info, " + "error %i (%s)\n", dev_addr, errno, + strerror(errno)); + close(*vfio_dev_fd); + close(vfio_group_fd); + rte_vfio_clear_group(vfio_group_fd); + return -1; + } + vfio_group_device_get(vfio_group_fd); + + return 0; +} + +int +rte_vfio_release_device(const char *sysfs_base, const char *dev_addr, + int vfio_dev_fd) +{ + struct vfio_group_status group_status = { + .argsz = sizeof(group_status) + }; + struct vfio_config *vfio_cfg; + int vfio_group_fd; + int iommu_group_num; + int ret; + + /* we don't want any DMA mapping messages to come while we're detaching + * VFIO device, because this might be the last device and we might need + * to unregister the callback. + */ + rte_mcfg_mem_read_lock(); + + /* get group number */ + ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num); + if (ret <= 0) { + RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver\n", + dev_addr); + /* This is an error at this point. */ + ret = -1; + goto out; + } + + /* get the actual group fd */ + vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num); + if (vfio_group_fd <= 0) { + RTE_LOG(INFO, EAL, "rte_vfio_get_group_fd failed for %s\n", + dev_addr); + ret = -1; + goto out; + } + + /* get the vfio_config it belongs to */ + vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num); + vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg; + + /* At this point we got an active group. Closing it will make the + * container detachment. If this is the last active group, VFIO kernel + * code will unset the container and the IOMMU mappings. + */ + + /* Closing a device */ + if (close(vfio_dev_fd) < 0) { + RTE_LOG(INFO, EAL, "Error when closing vfio_dev_fd for %s\n", + dev_addr); + ret = -1; + goto out; + } + + /* An VFIO group can have several devices attached. Just when there is + * no devices remaining should the group be closed. + */ + vfio_group_device_put(vfio_group_fd); + if (!vfio_group_device_count(vfio_group_fd)) { + + if (close(vfio_group_fd) < 0) { + RTE_LOG(INFO, EAL, "Error when closing vfio_group_fd for %s\n", + dev_addr); + ret = -1; + goto out; + } + + if (rte_vfio_clear_group(vfio_group_fd) < 0) { + RTE_LOG(INFO, EAL, "Error when clearing group for %s\n", + dev_addr); + ret = -1; + goto out; + } + } + + /* if there are no active device groups, unregister the callback to + * avoid spurious attempts to map/unmap memory from VFIO. 
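rte_vfio_setup_device() above is the entry point bus drivers use to attach a device: it resolves the IOMMU group, binds the group to a container (selecting an IOMMU type and replaying user DMA maps on first use), and returns a device fd plus its vfio_device_info. A minimal caller-side sketch follows, not part of this patch, assuming a Linux build with VFIO and a device already bound to vfio-pci; the PCI address is a placeholder and error cleanup is omitted.

#include <stdio.h>

#include <linux/vfio.h>
#include <rte_vfio.h>

#define PCI_SYSFS_BASE "/sys/bus/pci/devices"

int
attach_device(void)
{
	const char *dev_addr = "0000:00:04.0";	/* hypothetical address */
	struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
	int dev_fd, ret;

	ret = rte_vfio_setup_device(PCI_SYSFS_BASE, dev_addr,
			&dev_fd, &dev_info);
	if (ret == 1) {
		printf("%s is not bound to VFIO, skipping\n", dev_addr);
		return 0;
	}
	if (ret < 0)
		return -1;

	/* dev_fd can now serve VFIO_DEVICE_GET_REGION_INFO and friends;
	 * release everything when the device is detached */
	return rte_vfio_release_device(PCI_SYSFS_BASE, dev_addr, dev_fd);
}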
+ */ + if (vfio_cfg == default_vfio_cfg && vfio_cfg->vfio_active_groups == 0 && + rte_eal_process_type() != RTE_PROC_SECONDARY) + rte_mem_event_callback_unregister(VFIO_MEM_EVENT_CLB_NAME, + NULL); + + /* success */ + ret = 0; + +out: + rte_mcfg_mem_read_unlock(); + return ret; +} + +int +rte_vfio_enable(const char *modname) +{ + /* initialize group list */ + int i, j; + int vfio_available; + + rte_spinlock_recursive_t lock = RTE_SPINLOCK_RECURSIVE_INITIALIZER; + + for (i = 0; i < VFIO_MAX_CONTAINERS; i++) { + vfio_cfgs[i].vfio_container_fd = -1; + vfio_cfgs[i].vfio_active_groups = 0; + vfio_cfgs[i].vfio_iommu_type = NULL; + vfio_cfgs[i].mem_maps.lock = lock; + + for (j = 0; j < VFIO_MAX_GROUPS; j++) { + vfio_cfgs[i].vfio_groups[j].fd = -1; + vfio_cfgs[i].vfio_groups[j].group_num = -1; + vfio_cfgs[i].vfio_groups[j].devices = 0; + } + } + + /* inform the user that we are probing for VFIO */ + RTE_LOG(INFO, EAL, "Probing VFIO support...\n"); + + /* check if vfio module is loaded */ + vfio_available = rte_eal_check_module(modname); + + /* return error directly */ + if (vfio_available == -1) { + RTE_LOG(INFO, EAL, "Could not get loaded module details!\n"); + return -1; + } + + /* return 0 if VFIO modules not loaded */ + if (vfio_available == 0) { + RTE_LOG(DEBUG, EAL, "VFIO modules not loaded, " + "skipping VFIO support...\n"); + return 0; + } + + if (internal_config.process_type == RTE_PROC_PRIMARY) { + /* open a new container */ + default_vfio_cfg->vfio_container_fd = + rte_vfio_get_container_fd(); + } else { + /* get the default container from the primary process */ + default_vfio_cfg->vfio_container_fd = + vfio_get_default_container_fd(); + } + + /* check if we have VFIO driver enabled */ + if (default_vfio_cfg->vfio_container_fd != -1) { + RTE_LOG(NOTICE, EAL, "VFIO support initialized\n"); + default_vfio_cfg->vfio_enabled = 1; + } else { + RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n"); + } + + return 0; +} + +int +rte_vfio_is_enabled(const char *modname) +{ + const int mod_available = rte_eal_check_module(modname) > 0; + return default_vfio_cfg->vfio_enabled && mod_available; +} + +int +vfio_get_default_container_fd(void) +{ + struct rte_mp_msg mp_req, *mp_rep; + struct rte_mp_reply mp_reply = {0}; + struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; + struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; + + if (default_vfio_cfg->vfio_enabled) + return default_vfio_cfg->vfio_container_fd; + + if (internal_config.process_type == RTE_PROC_PRIMARY) { + /* if we were secondary process we would try requesting + * container fd from the primary, but we're the primary + * process so just exit here + */ + return -1; + } + + p->req = SOCKET_REQ_DEFAULT_CONTAINER; + strcpy(mp_req.name, EAL_VFIO_MP); + mp_req.len_param = sizeof(*p); + mp_req.num_fds = 0; + + if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && + mp_reply.nb_received == 1) { + mp_rep = &mp_reply.msgs[0]; + p = (struct vfio_mp_param *)mp_rep->param; + if (p->result == SOCKET_OK && mp_rep->num_fds == 1) { + free(mp_reply.msgs); + return mp_rep->fds[0]; + } + } + + free(mp_reply.msgs); + RTE_LOG(ERR, EAL, " cannot request default container fd\n"); + return -1; +} + +int +vfio_get_iommu_type(void) +{ + if (default_vfio_cfg->vfio_iommu_type == NULL) + return -1; + + return default_vfio_cfg->vfio_iommu_type->type_id; +} + +const struct vfio_iommu_type * +vfio_set_iommu_type(int vfio_container_fd) +{ + unsigned idx; + for (idx = 0; idx < RTE_DIM(iommu_types); idx++) { + const struct vfio_iommu_type *t = 
&iommu_types[idx]; + + int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU, + t->type_id); + if (!ret) { + RTE_LOG(NOTICE, EAL, " using IOMMU type %d (%s)\n", + t->type_id, t->name); + return t; + } + /* not an error, there may be more supported IOMMU types */ + RTE_LOG(DEBUG, EAL, " set IOMMU type %d (%s) failed, " + "error %i (%s)\n", t->type_id, t->name, errno, + strerror(errno)); + } + /* if we didn't find a suitable IOMMU type, fail */ + return NULL; +} + +int +vfio_has_supported_extensions(int vfio_container_fd) +{ + int ret; + unsigned idx, n_extensions = 0; + for (idx = 0; idx < RTE_DIM(iommu_types); idx++) { + const struct vfio_iommu_type *t = &iommu_types[idx]; + + ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION, + t->type_id); + if (ret < 0) { + RTE_LOG(ERR, EAL, " could not get IOMMU type, " + "error %i (%s)\n", errno, + strerror(errno)); + close(vfio_container_fd); + return -1; + } else if (ret == 1) { + /* we found a supported extension */ + n_extensions++; + } + RTE_LOG(DEBUG, EAL, " IOMMU type %d (%s) is %s\n", + t->type_id, t->name, + ret ? "supported" : "not supported"); + } + + /* if we didn't find any supported IOMMU types, fail */ + if (!n_extensions) { + close(vfio_container_fd); + return -1; + } + + return 0; +} + +int +rte_vfio_get_container_fd(void) +{ + int ret, vfio_container_fd; + struct rte_mp_msg mp_req, *mp_rep; + struct rte_mp_reply mp_reply = {0}; + struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; + struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; + + + /* if we're in a primary process, try to open the container */ + if (internal_config.process_type == RTE_PROC_PRIMARY) { + vfio_container_fd = open(VFIO_CONTAINER_PATH, O_RDWR); + if (vfio_container_fd < 0) { + RTE_LOG(ERR, EAL, " cannot open VFIO container, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + + /* check VFIO API version */ + ret = ioctl(vfio_container_fd, VFIO_GET_API_VERSION); + if (ret != VFIO_API_VERSION) { + if (ret < 0) + RTE_LOG(ERR, EAL, " could not get VFIO API version, " + "error %i (%s)\n", errno, strerror(errno)); + else + RTE_LOG(ERR, EAL, " unsupported VFIO API version!\n"); + close(vfio_container_fd); + return -1; + } + + ret = vfio_has_supported_extensions(vfio_container_fd); + if (ret) { + RTE_LOG(ERR, EAL, " no supported IOMMU " + "extensions found!\n"); + return -1; + } + + return vfio_container_fd; + } + /* + * if we're in a secondary process, request container fd from the + * primary process via mp channel + */ + p->req = SOCKET_REQ_CONTAINER; + strcpy(mp_req.name, EAL_VFIO_MP); + mp_req.len_param = sizeof(*p); + mp_req.num_fds = 0; + + vfio_container_fd = -1; + if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 && + mp_reply.nb_received == 1) { + mp_rep = &mp_reply.msgs[0]; + p = (struct vfio_mp_param *)mp_rep->param; + if (p->result == SOCKET_OK && mp_rep->num_fds == 1) { + vfio_container_fd = mp_rep->fds[0]; + free(mp_reply.msgs); + return vfio_container_fd; + } + } + + free(mp_reply.msgs); + RTE_LOG(ERR, EAL, " cannot request container fd\n"); + return -1; +} + +int +rte_vfio_get_group_num(const char *sysfs_base, + const char *dev_addr, int *iommu_group_num) +{ + char linkname[PATH_MAX]; + char filename[PATH_MAX]; + char *tok[16], *group_tok, *end; + int ret; + + memset(linkname, 0, sizeof(linkname)); + memset(filename, 0, sizeof(filename)); + + /* try to find out IOMMU group for this device */ + snprintf(linkname, sizeof(linkname), + "%s/%s/iommu_group", sysfs_base, dev_addr); + + ret = readlink(linkname, filename, 
sizeof(filename)); + + /* if the link doesn't exist, no VFIO for us */ + if (ret < 0) + return 0; + + ret = rte_strsplit(filename, sizeof(filename), + tok, RTE_DIM(tok), '/'); + + if (ret <= 0) { + RTE_LOG(ERR, EAL, " %s cannot get IOMMU group\n", dev_addr); + return -1; + } + + /* IOMMU group is always the last token */ + errno = 0; + group_tok = tok[ret - 1]; + end = group_tok; + *iommu_group_num = strtol(group_tok, &end, 10); + if ((end != group_tok && *end != '\0') || errno != 0) { + RTE_LOG(ERR, EAL, " %s error parsing IOMMU number!\n", dev_addr); + return -1; + } + + return 1; +} + +static int +type1_map_contig(const struct rte_memseg_list *msl, const struct rte_memseg *ms, + size_t len, void *arg) +{ + int *vfio_container_fd = arg; + + if (msl->external) + return 0; + + return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova, + len, 1); +} + +static int +type1_map(const struct rte_memseg_list *msl, const struct rte_memseg *ms, + void *arg) +{ + int *vfio_container_fd = arg; + + /* skip external memory that isn't a heap */ + if (msl->external && !msl->heap) + return 0; + + /* skip any segments with invalid IOVA addresses */ + if (ms->iova == RTE_BAD_IOVA) + return 0; + + /* if IOVA mode is VA, we've already mapped the internal segments */ + if (!msl->external && rte_eal_iova_mode() == RTE_IOVA_VA) + return 0; + + return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova, + ms->len, 1); +} + +static int +vfio_type1_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, + uint64_t len, int do_map) +{ + struct vfio_iommu_type1_dma_map dma_map; + struct vfio_iommu_type1_dma_unmap dma_unmap; + int ret; + + if (do_map != 0) { + memset(&dma_map, 0, sizeof(dma_map)); + dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); + dma_map.vaddr = vaddr; + dma_map.size = len; + dma_map.iova = iova; + dma_map.flags = VFIO_DMA_MAP_FLAG_READ | + VFIO_DMA_MAP_FLAG_WRITE; + + ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); + if (ret) { + /** + * In case the mapping was already done EEXIST will be + * returned from kernel. + */ + if (errno == EEXIST) { + RTE_LOG(DEBUG, EAL, + " Memory segment is already mapped," + " skipping"); + } else { + RTE_LOG(ERR, EAL, + " cannot set up DMA remapping," + " error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + } + } else { + memset(&dma_unmap, 0, sizeof(dma_unmap)); + dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap); + dma_unmap.size = len; + dma_unmap.iova = iova; + + ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA, + &dma_unmap); + if (ret) { + RTE_LOG(ERR, EAL, " cannot clear DMA remapping, error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + } + + return 0; +} + +static int +vfio_type1_dma_map(int vfio_container_fd) +{ + if (rte_eal_iova_mode() == RTE_IOVA_VA) { + /* with IOVA as VA mode, we can get away with mapping contiguous + * chunks rather than going page-by-page. + */ + int ret = rte_memseg_contig_walk(type1_map_contig, + &vfio_container_fd); + if (ret) + return ret; + /* we have to continue the walk because we've skipped the + * external segments during the config walk. 
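type1_map() and type1_map_contig() above are callbacks for rte_memseg_walk() and rte_memseg_contig_walk(), which visit every DPDK memory segment (or contiguous run of segments) and stop as soon as a callback returns non-zero. Below is a small sketch, not part of this patch, of the same walk API used for an unrelated purpose, adding up the length of every segment visited; the function names are invented.

#include <stdio.h>

#include <rte_eal.h>
#include <rte_memory.h>

/* matches rte_memseg_walk_t; returning 0 keeps the walk going */
static int
sum_seg_len(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
		void *arg)
{
	size_t *total = arg;

	(void)msl;
	*total += ms->len;
	return 0;
}

int
main(int argc, char **argv)
{
	size_t total = 0;

	if (rte_eal_init(argc, argv) < 0)
		return -1;

	rte_memseg_walk(sum_seg_len, &total);
	printf("registered DPDK memory: %zu bytes\n", total);
	return 0;
}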
+ */ + } + return rte_memseg_walk(type1_map, &vfio_container_fd); +} + +static int +vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, + uint64_t len, int do_map) +{ + struct vfio_iommu_type1_dma_map dma_map; + struct vfio_iommu_type1_dma_unmap dma_unmap; + int ret; + struct vfio_iommu_spapr_register_memory reg = { + .argsz = sizeof(reg), + .flags = 0 + }; + reg.vaddr = (uintptr_t) vaddr; + reg.size = len; + + if (do_map != 0) { + ret = ioctl(vfio_container_fd, + VFIO_IOMMU_SPAPR_REGISTER_MEMORY, ®); + if (ret) { + RTE_LOG(ERR, EAL, " cannot register vaddr for IOMMU, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + + memset(&dma_map, 0, sizeof(dma_map)); + dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map); + dma_map.vaddr = vaddr; + dma_map.size = len; + dma_map.iova = iova; + dma_map.flags = VFIO_DMA_MAP_FLAG_READ | + VFIO_DMA_MAP_FLAG_WRITE; + + ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map); + if (ret) { + /** + * In case the mapping was already done EBUSY will be + * returned from kernel. + */ + if (errno == EBUSY) { + RTE_LOG(DEBUG, EAL, + " Memory segment is already mapped," + " skipping"); + } else { + RTE_LOG(ERR, EAL, + " cannot set up DMA remapping," + " error %i (%s)\n", errno, + strerror(errno)); + return -1; + } + } + + } else { + memset(&dma_unmap, 0, sizeof(dma_unmap)); + dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap); + dma_unmap.size = len; + dma_unmap.iova = iova; + + ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA, + &dma_unmap); + if (ret) { + RTE_LOG(ERR, EAL, " cannot clear DMA remapping, error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + + ret = ioctl(vfio_container_fd, + VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, ®); + if (ret) { + RTE_LOG(ERR, EAL, " cannot unregister vaddr for IOMMU, error %i (%s)\n", + errno, strerror(errno)); + return -1; + } + } + + return 0; +} + +static int +vfio_spapr_map_walk(const struct rte_memseg_list *msl, + const struct rte_memseg *ms, void *arg) +{ + int *vfio_container_fd = arg; + + /* skip external memory that isn't a heap */ + if (msl->external && !msl->heap) + return 0; + + /* skip any segments with invalid IOVA addresses */ + if (ms->iova == RTE_BAD_IOVA) + return 0; + + return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova, + ms->len, 1); +} + +static int +vfio_spapr_unmap_walk(const struct rte_memseg_list *msl, + const struct rte_memseg *ms, void *arg) +{ + int *vfio_container_fd = arg; + + /* skip external memory that isn't a heap */ + if (msl->external && !msl->heap) + return 0; + + /* skip any segments with invalid IOVA addresses */ + if (ms->iova == RTE_BAD_IOVA) + return 0; + + return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova, + ms->len, 0); +} + +struct spapr_walk_param { + uint64_t window_size; + uint64_t hugepage_sz; +}; + +static int +vfio_spapr_window_size_walk(const struct rte_memseg_list *msl, + const struct rte_memseg *ms, void *arg) +{ + struct spapr_walk_param *param = arg; + uint64_t max = ms->iova + ms->len; + + /* skip external memory that isn't a heap */ + if (msl->external && !msl->heap) + return 0; + + /* skip any segments with invalid IOVA addresses */ + if (ms->iova == RTE_BAD_IOVA) + return 0; + + if (max > param->window_size) { + param->hugepage_sz = ms->hugepage_sz; + param->window_size = max; + } + + return 0; +} + +static int +vfio_spapr_create_new_dma_window(int vfio_container_fd, + struct vfio_iommu_spapr_tce_create *create) { + struct vfio_iommu_spapr_tce_remove remove = { + 
.argsz = sizeof(remove), + }; + struct vfio_iommu_spapr_tce_info info = { + .argsz = sizeof(info), + }; + int ret; + + /* query spapr iommu info */ + ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info); + if (ret) { + RTE_LOG(ERR, EAL, " cannot get iommu info, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + + /* remove default DMA of 32 bit window */ + remove.start_addr = info.dma32_window_start; + ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); + if (ret) { + RTE_LOG(ERR, EAL, " cannot remove default DMA window, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + + /* create new DMA window */ + ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, create); + if (ret) { +#ifdef VFIO_IOMMU_SPAPR_INFO_DDW + /* try possible page_shift and levels for workaround */ + uint32_t levels; + + for (levels = create->levels + 1; + ret && levels <= info.ddw.levels; levels++) { + create->levels = levels; + ret = ioctl(vfio_container_fd, + VFIO_IOMMU_SPAPR_TCE_CREATE, create); + } +#endif + if (ret) { + RTE_LOG(ERR, EAL, " cannot create new DMA window, " + "error %i (%s)\n", errno, strerror(errno)); + return -1; + } + } + + if (create->start_addr != 0) { + RTE_LOG(ERR, EAL, " DMA window start address != 0\n"); + return -1; + } + + return 0; +} + +static int +vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, + uint64_t len, int do_map) +{ + struct spapr_walk_param param; + struct vfio_iommu_spapr_tce_create create = { + .argsz = sizeof(create), + }; + struct vfio_config *vfio_cfg; + struct user_mem_maps *user_mem_maps; + int i, ret = 0; + + vfio_cfg = get_vfio_cfg_by_container_fd(vfio_container_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, " invalid container fd!\n"); + return -1; + } + + user_mem_maps = &vfio_cfg->mem_maps; + rte_spinlock_recursive_lock(&user_mem_maps->lock); + + /* check if window size needs to be adjusted */ + memset(¶m, 0, sizeof(param)); + + /* we're inside a callback so use thread-unsafe version */ + if (rte_memseg_walk_thread_unsafe(vfio_spapr_window_size_walk, + ¶m) < 0) { + RTE_LOG(ERR, EAL, "Could not get window size\n"); + ret = -1; + goto out; + } + + /* also check user maps */ + for (i = 0; i < user_mem_maps->n_maps; i++) { + uint64_t max = user_mem_maps->maps[i].iova + + user_mem_maps->maps[i].len; + param.window_size = RTE_MAX(param.window_size, max); + } + + /* sPAPR requires window size to be a power of 2 */ + create.window_size = rte_align64pow2(param.window_size); + create.page_shift = __builtin_ctzll(param.hugepage_sz); + create.levels = 1; + + if (do_map) { + /* re-create window and remap the entire memory */ + if (iova + len > create.window_size) { + /* release all maps before recreating the window */ + if (rte_memseg_walk_thread_unsafe(vfio_spapr_unmap_walk, + &vfio_container_fd) < 0) { + RTE_LOG(ERR, EAL, "Could not release DMA maps\n"); + ret = -1; + goto out; + } + /* release all user maps */ + for (i = 0; i < user_mem_maps->n_maps; i++) { + struct user_mem_map *map = + &user_mem_maps->maps[i]; + if (vfio_spapr_dma_do_map(vfio_container_fd, + map->addr, map->iova, map->len, + 0)) { + RTE_LOG(ERR, EAL, "Could not release user DMA maps\n"); + ret = -1; + goto out; + } + } + create.window_size = rte_align64pow2(iova + len); + if (vfio_spapr_create_new_dma_window(vfio_container_fd, + &create) < 0) { + RTE_LOG(ERR, EAL, "Could not create new DMA window\n"); + ret = -1; + goto out; + } + /* we're inside a callback, so use thread-unsafe version + */ + if 
(rte_memseg_walk_thread_unsafe(vfio_spapr_map_walk, + &vfio_container_fd) < 0) { + RTE_LOG(ERR, EAL, "Could not recreate DMA maps\n"); + ret = -1; + goto out; + } + /* remap all user maps */ + for (i = 0; i < user_mem_maps->n_maps; i++) { + struct user_mem_map *map = + &user_mem_maps->maps[i]; + if (vfio_spapr_dma_do_map(vfio_container_fd, + map->addr, map->iova, map->len, + 1)) { + RTE_LOG(ERR, EAL, "Could not recreate user DMA maps\n"); + ret = -1; + goto out; + } + } + } + if (vfio_spapr_dma_do_map(vfio_container_fd, vaddr, iova, len, 1)) { + RTE_LOG(ERR, EAL, "Failed to map DMA\n"); + ret = -1; + goto out; + } + } else { + /* for unmap, check if iova within DMA window */ + if (iova > create.window_size) { + RTE_LOG(ERR, EAL, "iova beyond DMA window for unmap"); + ret = -1; + goto out; + } + + vfio_spapr_dma_do_map(vfio_container_fd, vaddr, iova, len, 0); + } +out: + rte_spinlock_recursive_unlock(&user_mem_maps->lock); + return ret; +} + +static int +vfio_spapr_dma_map(int vfio_container_fd) +{ + struct vfio_iommu_spapr_tce_create create = { + .argsz = sizeof(create), + }; + struct spapr_walk_param param; + + memset(¶m, 0, sizeof(param)); + + /* create DMA window from 0 to max(phys_addr + len) */ + rte_memseg_walk(vfio_spapr_window_size_walk, ¶m); + + /* sPAPR requires window size to be a power of 2 */ + create.window_size = rte_align64pow2(param.window_size); + create.page_shift = __builtin_ctzll(param.hugepage_sz); + create.levels = 1; + + if (vfio_spapr_create_new_dma_window(vfio_container_fd, &create) < 0) { + RTE_LOG(ERR, EAL, "Could not create new DMA window\n"); + return -1; + } + + /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */ + if (rte_memseg_walk(vfio_spapr_map_walk, &vfio_container_fd) < 0) + return -1; + + return 0; +} + +static int +vfio_noiommu_dma_map(int __rte_unused vfio_container_fd) +{ + /* No-IOMMU mode does not need DMA mapping */ + return 0; +} + +static int +vfio_noiommu_dma_mem_map(int __rte_unused vfio_container_fd, + uint64_t __rte_unused vaddr, + uint64_t __rte_unused iova, uint64_t __rte_unused len, + int __rte_unused do_map) +{ + /* No-IOMMU mode does not need DMA mapping */ + return 0; +} + +static int +vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova, + uint64_t len, int do_map) +{ + const struct vfio_iommu_type *t = vfio_cfg->vfio_iommu_type; + + if (!t) { + RTE_LOG(ERR, EAL, " VFIO support not initialized\n"); + rte_errno = ENODEV; + return -1; + } + + if (!t->dma_user_map_func) { + RTE_LOG(ERR, EAL, + " VFIO custom DMA region maping not supported by IOMMU %s\n", + t->name); + rte_errno = ENOTSUP; + return -1; + } + + return t->dma_user_map_func(vfio_cfg->vfio_container_fd, vaddr, iova, + len, do_map); +} + +static int +container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova, + uint64_t len) +{ + struct user_mem_map *new_map; + struct user_mem_maps *user_mem_maps; + int ret = 0; + + user_mem_maps = &vfio_cfg->mem_maps; + rte_spinlock_recursive_lock(&user_mem_maps->lock); + if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) { + RTE_LOG(ERR, EAL, "No more space for user mem maps\n"); + rte_errno = ENOMEM; + ret = -1; + goto out; + } + /* map the entry */ + if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 1)) { + /* technically, this will fail if there are currently no devices + * plugged in, even if a device were added later, this mapping + * might have succeeded. 
however, since we cannot verify if this + * is a valid mapping without having a device attached, consider + * this to be unsupported, because we can't just store any old + * mapping and pollute list of active mappings willy-nilly. + */ + RTE_LOG(ERR, EAL, "Couldn't map new region for DMA\n"); + ret = -1; + goto out; + } + /* create new user mem map entry */ + new_map = &user_mem_maps->maps[user_mem_maps->n_maps++]; + new_map->addr = vaddr; + new_map->iova = iova; + new_map->len = len; + + compact_user_maps(user_mem_maps); +out: + rte_spinlock_recursive_unlock(&user_mem_maps->lock); + return ret; +} + +static int +container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova, + uint64_t len) +{ + struct user_mem_map *map, *new_map = NULL; + struct user_mem_maps *user_mem_maps; + int ret = 0; + + user_mem_maps = &vfio_cfg->mem_maps; + rte_spinlock_recursive_lock(&user_mem_maps->lock); + + /* find our mapping */ + map = find_user_mem_map(user_mem_maps, vaddr, iova, len); + if (!map) { + RTE_LOG(ERR, EAL, "Couldn't find previously mapped region\n"); + rte_errno = EINVAL; + ret = -1; + goto out; + } + if (map->addr != vaddr || map->iova != iova || map->len != len) { + /* we're partially unmapping a previously mapped region, so we + * need to split entry into two. + */ + if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) { + RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n"); + rte_errno = ENOMEM; + ret = -1; + goto out; + } + new_map = &user_mem_maps->maps[user_mem_maps->n_maps++]; + } + + /* unmap the entry */ + if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 0)) { + /* there may not be any devices plugged in, so unmapping will + * fail with ENODEV/ENOTSUP rte_errno values, but that doesn't + * stop us from removing the mapping, as the assumption is we + * won't be needing this memory any more and thus will want to + * prevent it from being remapped again on hotplug. so, only + * fail if we indeed failed to unmap (e.g. if the mapping was + * within our mapped range but had invalid alignment). + */ + if (rte_errno != ENODEV && rte_errno != ENOTSUP) { + RTE_LOG(ERR, EAL, "Couldn't unmap region for DMA\n"); + ret = -1; + goto out; + } else { + RTE_LOG(DEBUG, EAL, "DMA unmapping failed, but removing mappings anyway\n"); + } + } + /* remove map from the list of active mappings */ + if (new_map != NULL) { + adjust_map(map, new_map, vaddr, len); + + /* if we've created a new map by splitting, sort everything */ + if (!is_null_map(new_map)) { + compact_user_maps(user_mem_maps); + } else { + /* we've created a new mapping, but it was unused */ + user_mem_maps->n_maps--; + } + } else { + memset(map, 0, sizeof(*map)); + compact_user_maps(user_mem_maps); + user_mem_maps->n_maps--; + } + +out: + rte_spinlock_recursive_unlock(&user_mem_maps->lock); + return ret; +} + +int +rte_vfio_noiommu_is_enabled(void) +{ + int fd; + ssize_t cnt; + char c; + + fd = open(VFIO_NOIOMMU_MODE, O_RDONLY); + if (fd < 0) { + if (errno != ENOENT) { + RTE_LOG(ERR, EAL, " cannot open vfio noiommu file %i (%s)\n", + errno, strerror(errno)); + return -1; + } + /* + * else the file does not exists + * i.e. 
noiommu is not enabled + */ + return 0; + } + + cnt = read(fd, &c, 1); + close(fd); + if (cnt != 1) { + RTE_LOG(ERR, EAL, " unable to read from vfio noiommu " + "file %i (%s)\n", errno, strerror(errno)); + return -1; + } + + return c == 'Y'; +} + +int +rte_vfio_container_create(void) +{ + int i; + + /* Find an empty slot to store new vfio config */ + for (i = 1; i < VFIO_MAX_CONTAINERS; i++) { + if (vfio_cfgs[i].vfio_container_fd == -1) + break; + } + + if (i == VFIO_MAX_CONTAINERS) { + RTE_LOG(ERR, EAL, "exceed max vfio container limit\n"); + return -1; + } + + vfio_cfgs[i].vfio_container_fd = rte_vfio_get_container_fd(); + if (vfio_cfgs[i].vfio_container_fd < 0) { + RTE_LOG(NOTICE, EAL, "fail to create a new container\n"); + return -1; + } + + return vfio_cfgs[i].vfio_container_fd; +} + +int +rte_vfio_container_destroy(int container_fd) +{ + struct vfio_config *vfio_cfg; + int i; + + vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, "Invalid container fd\n"); + return -1; + } + + for (i = 0; i < VFIO_MAX_GROUPS; i++) + if (vfio_cfg->vfio_groups[i].group_num != -1) + rte_vfio_container_group_unbind(container_fd, + vfio_cfg->vfio_groups[i].group_num); + + close(container_fd); + vfio_cfg->vfio_container_fd = -1; + vfio_cfg->vfio_active_groups = 0; + vfio_cfg->vfio_iommu_type = NULL; + + return 0; +} + +int +rte_vfio_container_group_bind(int container_fd, int iommu_group_num) +{ + struct vfio_config *vfio_cfg; + + vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, "Invalid container fd\n"); + return -1; + } + + return vfio_get_group_fd(vfio_cfg, iommu_group_num); +} + +int +rte_vfio_container_group_unbind(int container_fd, int iommu_group_num) +{ + struct vfio_config *vfio_cfg; + struct vfio_group *cur_grp = NULL; + int i; + + vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, "Invalid container fd\n"); + return -1; + } + + for (i = 0; i < VFIO_MAX_GROUPS; i++) { + if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num) { + cur_grp = &vfio_cfg->vfio_groups[i]; + break; + } + } + + /* This should not happen */ + if (i == VFIO_MAX_GROUPS || cur_grp == NULL) { + RTE_LOG(ERR, EAL, "Specified group number not found\n"); + return -1; + } + + if (cur_grp->fd >= 0 && close(cur_grp->fd) < 0) { + RTE_LOG(ERR, EAL, "Error when closing vfio_group_fd for" + " iommu_group_num %d\n", iommu_group_num); + return -1; + } + cur_grp->group_num = -1; + cur_grp->fd = -1; + cur_grp->devices = 0; + vfio_cfg->vfio_active_groups--; + + return 0; +} + +int +rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova, + uint64_t len) +{ + struct vfio_config *vfio_cfg; + + if (len == 0) { + rte_errno = EINVAL; + return -1; + } + + vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, "Invalid container fd\n"); + return -1; + } + + return container_dma_map(vfio_cfg, vaddr, iova, len); +} + +int +rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova, + uint64_t len) +{ + struct vfio_config *vfio_cfg; + + if (len == 0) { + rte_errno = EINVAL; + return -1; + } + + vfio_cfg = get_vfio_cfg_by_container_fd(container_fd); + if (vfio_cfg == NULL) { + RTE_LOG(ERR, EAL, "Invalid container fd\n"); + return -1; + } + + return container_dma_unmap(vfio_cfg, vaddr, iova, len); +} + +#else + +int +rte_vfio_setup_device(__rte_unused const char *sysfs_base, + __rte_unused const char *dev_addr, + 
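The rte_vfio_container_* functions above are the public interface for user-managed containers (used, for instance, by vDPA drivers): create a container, bind an IOMMU group to it, then DMA-map application memory by (vaddr, iova, len) so the mapping can be recorded and replayed. A hedged sketch of that sequence follows, not part of this patch; the group number, buffer and IOVA are assumed to be supplied by the caller (e.g. from a sysfs lookup and rte_malloc()/rte_mem_virt2iova()).

#include <stdint.h>

#include <rte_vfio.h>

int
map_buffer_for_dma(int iommu_group, void *buf, uint64_t buf_iova, uint64_t len)
{
	uint64_t vaddr = (uint64_t)(uintptr_t)buf;
	int container_fd;

	container_fd = rte_vfio_container_create();
	if (container_fd < 0)
		return -1;

	if (rte_vfio_container_group_bind(container_fd, iommu_group) < 0)
		goto fail;

	/* programs the IOMMU and records the map in user_mem_maps */
	if (rte_vfio_container_dma_map(container_fd, vaddr, buf_iova, len) < 0)
		goto fail;

	/* ... DMA happens here ... then tear down in reverse order */
	rte_vfio_container_dma_unmap(container_fd, vaddr, buf_iova, len);
	rte_vfio_container_group_unbind(container_fd, iommu_group);
	rte_vfio_container_destroy(container_fd);
	return 0;

fail:
	rte_vfio_container_destroy(container_fd);
	return -1;
}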
__rte_unused int *vfio_dev_fd, + __rte_unused struct vfio_device_info *device_info) +{ + return -1; +} + +int +rte_vfio_release_device(__rte_unused const char *sysfs_base, + __rte_unused const char *dev_addr, __rte_unused int fd) +{ + return -1; +} + +int +rte_vfio_enable(__rte_unused const char *modname) +{ + return -1; +} + +int +rte_vfio_is_enabled(__rte_unused const char *modname) +{ + return -1; +} + +int +rte_vfio_noiommu_is_enabled(void) +{ + return -1; +} + +int +rte_vfio_clear_group(__rte_unused int vfio_group_fd) +{ + return -1; +} + +int +rte_vfio_get_group_num(__rte_unused const char *sysfs_base, + __rte_unused const char *dev_addr, + __rte_unused int *iommu_group_num) +{ + return -1; +} + +int +rte_vfio_get_container_fd(void) +{ + return -1; +} + +int +rte_vfio_get_group_fd(__rte_unused int iommu_group_num) +{ + return -1; +} + +int +rte_vfio_container_create(void) +{ + return -1; +} + +int +rte_vfio_container_destroy(__rte_unused int container_fd) +{ + return -1; +} + +int +rte_vfio_container_group_bind(__rte_unused int container_fd, + __rte_unused int iommu_group_num) +{ + return -1; +} + +int +rte_vfio_container_group_unbind(__rte_unused int container_fd, + __rte_unused int iommu_group_num) +{ + return -1; +} + +int +rte_vfio_container_dma_map(__rte_unused int container_fd, + __rte_unused uint64_t vaddr, + __rte_unused uint64_t iova, + __rte_unused uint64_t len) +{ + return -1; +} + +int +rte_vfio_container_dma_unmap(__rte_unused int container_fd, + __rte_unused uint64_t vaddr, + __rte_unused uint64_t iova, + __rte_unused uint64_t len) +{ + return -1; +} + +#endif /* VFIO_PRESENT */ diff --git a/lib/librte_eal/linux/eal_vfio.h b/lib/librte_eal/linux/eal_vfio.h new file mode 100644 index 0000000000..cb2d35fb12 --- /dev/null +++ b/lib/librte_eal/linux/eal_vfio.h @@ -0,0 +1,158 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation + */ + +#ifndef EAL_VFIO_H_ +#define EAL_VFIO_H_ + +#include + +/* + * determine if VFIO is present on the system + */ +#if !defined(VFIO_PRESENT) && defined(RTE_EAL_VFIO) +#include +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0) +#define VFIO_PRESENT +#else +#pragma message("VFIO configured but not supported by this kernel, disabling.") +#endif /* kernel version >= 3.6.0 */ +#endif /* RTE_EAL_VFIO */ + +#ifdef VFIO_PRESENT + +#include +#include + +#define RTE_VFIO_TYPE1 VFIO_TYPE1_IOMMU + +#ifndef VFIO_SPAPR_TCE_v2_IOMMU +#define RTE_VFIO_SPAPR 7 +#define VFIO_IOMMU_SPAPR_REGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 17) +#define VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 18) +#define VFIO_IOMMU_SPAPR_TCE_CREATE _IO(VFIO_TYPE, VFIO_BASE + 19) +#define VFIO_IOMMU_SPAPR_TCE_REMOVE _IO(VFIO_TYPE, VFIO_BASE + 20) + +struct vfio_iommu_spapr_register_memory { + uint32_t argsz; + uint32_t flags; + uint64_t vaddr; + uint64_t size; +}; + +struct vfio_iommu_spapr_tce_create { + uint32_t argsz; + uint32_t flags; + /* in */ + uint32_t page_shift; + uint32_t __resv1; + uint64_t window_size; + uint32_t levels; + uint32_t __resv2; + /* out */ + uint64_t start_addr; +}; + +struct vfio_iommu_spapr_tce_remove { + uint32_t argsz; + uint32_t flags; + /* in */ + uint64_t start_addr; +}; + +struct vfio_iommu_spapr_tce_ddw_info { + uint64_t pgsizes; + uint32_t max_dynamic_windows_supported; + uint32_t levels; +}; + +/* SPAPR_v2 is not present, but SPAPR might be */ +#ifndef VFIO_SPAPR_TCE_IOMMU +#define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12) + +struct vfio_iommu_spapr_tce_info { + uint32_t argsz; 
+ uint32_t flags; + uint32_t dma32_window_start; + uint32_t dma32_window_size; + struct vfio_iommu_spapr_tce_ddw_info ddw; +}; +#endif /* VFIO_SPAPR_TCE_IOMMU */ + +#else /* VFIO_SPAPR_TCE_v2_IOMMU */ +#define RTE_VFIO_SPAPR VFIO_SPAPR_TCE_v2_IOMMU +#endif + +#define VFIO_MAX_GROUPS RTE_MAX_VFIO_GROUPS +#define VFIO_MAX_CONTAINERS RTE_MAX_VFIO_CONTAINERS + +/* + * we don't need to store device fd's anywhere since they can be obtained from + * the group fd via an ioctl() call. + */ +struct vfio_group { + int group_num; + int fd; + int devices; +}; + +/* DMA mapping function prototype. + * Takes VFIO container fd as a parameter. + * Returns 0 on success, -1 on error. + * */ +typedef int (*vfio_dma_func_t)(int); + +/* Custom memory region DMA mapping function prototype. + * Takes VFIO container fd, virtual address, phisical address, length and + * operation type (0 to unmap 1 for map) as a parameters. + * Returns 0 on success, -1 on error. + **/ +typedef int (*vfio_dma_user_func_t)(int fd, uint64_t vaddr, uint64_t iova, + uint64_t len, int do_map); + +struct vfio_iommu_type { + int type_id; + const char *name; + vfio_dma_user_func_t dma_user_map_func; + vfio_dma_func_t dma_map_func; +}; + +/* get the vfio container that devices are bound to by default */ +int vfio_get_default_container_fd(void); + +/* pick IOMMU type. returns a pointer to vfio_iommu_type or NULL for error */ +const struct vfio_iommu_type * +vfio_set_iommu_type(int vfio_container_fd); + +int +vfio_get_iommu_type(void); + +/* check if we have any supported extensions */ +int +vfio_has_supported_extensions(int vfio_container_fd); + +int vfio_mp_sync_setup(void); + +#define EAL_VFIO_MP "eal_vfio_mp_sync" + +#define SOCKET_REQ_CONTAINER 0x100 +#define SOCKET_REQ_GROUP 0x200 +#define SOCKET_REQ_DEFAULT_CONTAINER 0x400 +#define SOCKET_REQ_IOMMU_TYPE 0x800 +#define SOCKET_OK 0x0 +#define SOCKET_NO_FD 0x1 +#define SOCKET_ERR 0xFF + +struct vfio_mp_param { + int req; + int result; + RTE_STD_C11 + union { + int group_num; + int iommu_type_id; + }; +}; + +#endif /* VFIO_PRESENT */ + +#endif /* EAL_VFIO_H_ */ diff --git a/lib/librte_eal/linux/eal_vfio_mp_sync.c b/lib/librte_eal/linux/eal_vfio_mp_sync.c new file mode 100644 index 0000000000..5f2a5fc1d9 --- /dev/null +++ b/lib/librte_eal/linux/eal_vfio_mp_sync.c @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2018 Intel Corporation + */ + +#include +#include + +#include +#include +#include +#include +#include + +#include "eal_vfio.h" + +/** + * @file + * VFIO socket for communication between primary and secondary processes. + * + * This file is only compiled if CONFIG_RTE_EAL_VFIO is set to "y". 
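For illustration only (a sketch, not code from this tree): the SOCKET_REQ_* constants and struct vfio_mp_param declared in eal_vfio.h above form the request/reply protocol that eal_vfio_mp_sync.c serves in the primary process. A secondary process drives it through the generic EAL IPC call rte_mp_request_sync(); the helper name vfio_request_group_fd() and the 5-second timeout below are invented for the example, and error handling is trimmed.

#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <rte_eal.h>
#include "eal_vfio.h"

static int
vfio_request_group_fd(int iommu_group_num)
{
	struct rte_mp_msg req = { .len_param = sizeof(struct vfio_mp_param) };
	struct rte_mp_reply reply;
	struct timespec ts = { .tv_sec = 5, .tv_nsec = 0 };
	struct vfio_mp_param *p = (struct vfio_mp_param *)req.param;
	int fd = -1;

	strcpy(req.name, EAL_VFIO_MP);
	p->req = SOCKET_REQ_GROUP;
	p->group_num = iommu_group_num;

	if (rte_mp_request_sync(&req, &reply, &ts) != 0)
		return -1;

	if (reply.nb_received == 1) {
		const struct vfio_mp_param *r =
			(const struct vfio_mp_param *)reply.msgs[0].param;

		/* the primary attaches the group fd to the reply on success */
		if (r->result == SOCKET_OK && reply.msgs[0].num_fds == 1)
			fd = reply.msgs[0].fds[0];
	}
	free(reply.msgs);	/* reply buffers are allocated by the EAL */
	return fd;
}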
+ */ + +#ifdef VFIO_PRESENT + +static int +vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer) +{ + int fd = -1; + int ret; + struct rte_mp_msg reply; + struct vfio_mp_param *r = (struct vfio_mp_param *)reply.param; + const struct vfio_mp_param *m = + (const struct vfio_mp_param *)msg->param; + + if (msg->len_param != sizeof(*m)) { + RTE_LOG(ERR, EAL, "vfio received invalid message!\n"); + return -1; + } + + memset(&reply, 0, sizeof(reply)); + + switch (m->req) { + case SOCKET_REQ_GROUP: + r->req = SOCKET_REQ_GROUP; + r->group_num = m->group_num; + fd = rte_vfio_get_group_fd(m->group_num); + if (fd < 0) + r->result = SOCKET_ERR; + else if (fd == 0) + /* if VFIO group exists but isn't bound to VFIO driver */ + r->result = SOCKET_NO_FD; + else { + /* if group exists and is bound to VFIO driver */ + r->result = SOCKET_OK; + reply.num_fds = 1; + reply.fds[0] = fd; + } + break; + case SOCKET_REQ_CONTAINER: + r->req = SOCKET_REQ_CONTAINER; + fd = rte_vfio_get_container_fd(); + if (fd < 0) + r->result = SOCKET_ERR; + else { + r->result = SOCKET_OK; + reply.num_fds = 1; + reply.fds[0] = fd; + } + break; + case SOCKET_REQ_DEFAULT_CONTAINER: + r->req = SOCKET_REQ_DEFAULT_CONTAINER; + fd = vfio_get_default_container_fd(); + if (fd < 0) + r->result = SOCKET_ERR; + else { + r->result = SOCKET_OK; + reply.num_fds = 1; + reply.fds[0] = fd; + } + break; + case SOCKET_REQ_IOMMU_TYPE: + { + int iommu_type_id; + + r->req = SOCKET_REQ_IOMMU_TYPE; + + iommu_type_id = vfio_get_iommu_type(); + + if (iommu_type_id < 0) + r->result = SOCKET_ERR; + else { + r->iommu_type_id = iommu_type_id; + r->result = SOCKET_OK; + } + break; + } + default: + RTE_LOG(ERR, EAL, "vfio received invalid message!\n"); + return -1; + } + + strcpy(reply.name, EAL_VFIO_MP); + reply.len_param = sizeof(*r); + + ret = rte_mp_reply(&reply, peer); + if (m->req == SOCKET_REQ_CONTAINER && fd >= 0) + close(fd); + return ret; +} + +int +vfio_mp_sync_setup(void) +{ + if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + int ret = rte_mp_action_register(EAL_VFIO_MP, vfio_mp_primary); + if (ret && rte_errno != ENOTSUP) + return -1; + } + + return 0; +} + +#endif diff --git a/lib/librte_eal/linux/include/meson.build b/lib/librte_eal/linux/include/meson.build new file mode 100644 index 0000000000..1241894b3c --- /dev/null +++ b/lib/librte_eal/linux/include/meson.build @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright 2020 Mellanox Technologies, Ltd + +includes += include_directories('.') + +headers += files( + 'rte_kni_common.h', + 'rte_os.h', +) diff --git a/lib/librte_eal/linux/include/rte_kni_common.h b/lib/librte_eal/linux/include/rte_kni_common.h new file mode 100644 index 0000000000..7313ef504e --- /dev/null +++ b/lib/librte_eal/linux/include/rte_kni_common.h @@ -0,0 +1,137 @@ +/* SPDX-License-Identifier: (BSD-3-Clause OR LGPL-2.1) */ +/* + * Copyright(c) 2007-2014 Intel Corporation. + */ + +#ifndef _RTE_KNI_COMMON_H_ +#define _RTE_KNI_COMMON_H_ + +#ifdef __KERNEL__ +#include +#include +#define RTE_STD_C11 +#else +#include +#include +#endif + +/* + * KNI name is part of memzone name. Must not exceed IFNAMSIZ. + */ +#define RTE_KNI_NAMESIZE 16 + +#define RTE_CACHE_LINE_MIN_SIZE 64 + +/* + * Request id. + */ +enum rte_kni_req_id { + RTE_KNI_REQ_UNKNOWN = 0, + RTE_KNI_REQ_CHANGE_MTU, + RTE_KNI_REQ_CFG_NETWORK_IF, + RTE_KNI_REQ_CHANGE_MAC_ADDR, + RTE_KNI_REQ_CHANGE_PROMISC, + RTE_KNI_REQ_CHANGE_ALLMULTI, + RTE_KNI_REQ_MAX, +}; + +/* + * Structure for KNI request. 
+ */ +struct rte_kni_request { + uint32_t req_id; /**< Request id */ + RTE_STD_C11 + union { + uint32_t new_mtu; /**< New MTU */ + uint8_t if_up; /**< 1: interface up, 0: interface down */ + uint8_t mac_addr[6]; /**< MAC address for interface */ + uint8_t promiscusity;/**< 1: promisc mode enable, 0: disable */ + uint8_t allmulti; /**< 1: all-multicast mode enable, 0: disable */ + }; + int32_t result; /**< Result for processing request */ +} __attribute__((__packed__)); + +/* + * Fifo struct mapped in a shared memory. It describes a circular buffer FIFO + * Write and read should wrap around. Fifo is empty when write == read + * Writing should never overwrite the read position + */ +struct rte_kni_fifo { +#ifdef RTE_USE_C11_MEM_MODEL + unsigned write; /**< Next position to be written*/ + unsigned read; /**< Next position to be read */ +#else + volatile unsigned write; /**< Next position to be written*/ + volatile unsigned read; /**< Next position to be read */ +#endif + unsigned len; /**< Circular buffer length */ + unsigned elem_size; /**< Pointer size - for 32/64 bit OS */ + void *volatile buffer[]; /**< The buffer contains mbuf pointers */ +}; + +/* + * The kernel image of the rte_mbuf struct, with only the relevant fields. + * Padding is necessary to assure the offsets of these fields + */ +struct rte_kni_mbuf { + void *buf_addr __attribute__((__aligned__(RTE_CACHE_LINE_SIZE))); + uint64_t buf_physaddr; + uint16_t data_off; /**< Start address of data in segment buffer. */ + char pad1[2]; + uint16_t nb_segs; /**< Number of segments. */ + char pad4[2]; + uint64_t ol_flags; /**< Offload features. */ + char pad2[4]; + uint32_t pkt_len; /**< Total pkt len: sum of all segment data_len. */ + uint16_t data_len; /**< Amount of data in segment buffer. */ + + /* fields on second cache line */ + char pad3[8] __attribute__((__aligned__(RTE_CACHE_LINE_MIN_SIZE))); + void *pool; + void *next; /**< Physical address of next mbuf in kernel. */ +}; + +/* + * Struct used to create a KNI device. Passed to the kernel in IOCTL call + */ + +struct rte_kni_device_info { + char name[RTE_KNI_NAMESIZE]; /**< Network device name for KNI */ + + phys_addr_t tx_phys; + phys_addr_t rx_phys; + phys_addr_t alloc_phys; + phys_addr_t free_phys; + + /* Used by Ethtool */ + phys_addr_t req_phys; + phys_addr_t resp_phys; + phys_addr_t sync_phys; + void * sync_va; + + /* mbuf mempool */ + void * mbuf_va; + phys_addr_t mbuf_phys; + + uint16_t group_id; /**< Group ID */ + uint32_t core_id; /**< core ID to bind for kernel thread */ + + __extension__ + uint8_t force_bind : 1; /**< Flag for kernel thread binding */ + + /* mbuf size */ + unsigned mbuf_size; + unsigned int mtu; + unsigned int min_mtu; + unsigned int max_mtu; + uint8_t mac_addr[6]; + uint8_t iova_mode; +}; + +#define KNI_DEVICE "kni" + +#define RTE_KNI_IOCTL_TEST _IOWR(0, 1, int) +#define RTE_KNI_IOCTL_CREATE _IOWR(0, 2, struct rte_kni_device_info) +#define RTE_KNI_IOCTL_RELEASE _IOWR(0, 3, struct rte_kni_device_info) + +#endif /* _RTE_KNI_COMMON_H_ */ diff --git a/lib/librte_eal/linux/include/rte_os.h b/lib/librte_eal/linux/include/rte_os.h new file mode 100644 index 0000000000..218d4fa86e --- /dev/null +++ b/lib/librte_eal/linux/include/rte_os.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2019 Intel Corporation + */ + +#ifndef _RTE_OS_H_ +#define _RTE_OS_H_ + +/** + * This is header should contain any function/macro definition + * which are not supported natively or named differently in the + * linux OS. 
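As an aside on the rte_kni_fifo layout shown in rte_kni_common.h above: the comments state that the ring is empty when write == read and that a writer must never overwrite the read position, so one slot always stays unused. A sketch of the occupancy arithmetic implied by those two rules (the helper names are invented for the example; the real FIFO helpers live elsewhere, in librte_kni and the kni kernel module):

#include <rte_kni_common.h>

/* entries currently queued; write/read are kept in [0, len) */
static inline unsigned int
kni_fifo_used(const struct rte_kni_fifo *fifo)
{
	return (fifo->write + fifo->len - fifo->read) % fifo->len;
}

/* free slots a writer may fill without catching up with the reader */
static inline unsigned int
kni_fifo_free(const struct rte_kni_fifo *fifo)
{
	return fifo->len - 1 - kni_fifo_used(fifo);
}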
Functions will be added in future releases. + */ + +#include + +typedef cpu_set_t rte_cpuset_t; +#define RTE_CPU_AND(dst, src1, src2) CPU_AND(dst, src1, src2) +#define RTE_CPU_OR(dst, src1, src2) CPU_OR(dst, src1, src2) +#define RTE_CPU_FILL(set) do \ +{ \ + unsigned int i; \ + CPU_ZERO(set); \ + for (i = 0; i < CPU_SETSIZE; i++) \ + CPU_SET(i, set); \ +} while (0) +#define RTE_CPU_NOT(dst, src) do \ +{ \ + cpu_set_t tmp; \ + RTE_CPU_FILL(&tmp); \ + CPU_XOR(dst, &tmp, src); \ +} while (0) + +#endif /* _RTE_OS_H_ */ diff --git a/lib/librte_eal/linux/meson.build b/lib/librte_eal/linux/meson.build new file mode 100644 index 0000000000..0e959272df --- /dev/null +++ b/lib/librte_eal/linux/meson.build @@ -0,0 +1,26 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2017 Intel Corporation + +subdir('include') + +sources += files('eal_alarm.c', + 'eal_cpuflags.c', + 'eal_debug.c', + 'eal_hugepage_info.c', + 'eal_interrupts.c', + 'eal_memalloc.c', + 'eal_lcore.c', + 'eal_log.c', + 'eal_thread.c', + 'eal_timer.c', + 'eal_vfio.c', + 'eal_vfio_mp_sync.c', + 'eal.c', + 'eal_memory.c', + 'eal_dev.c', +) + +deps += ['kvargs'] +if has_libnuma == 1 + dpdk_conf.set10('RTE_EAL_NUMA_AWARE_HUGEPAGES', true) +endif diff --git a/lib/librte_eal/meson.build b/lib/librte_eal/meson.build index 23ae03ad46..9d219a0e6e 100644 --- a/lib/librte_eal/meson.build +++ b/lib/librte_eal/meson.build @@ -1,18 +1,13 @@ # SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2017-2019 Intel Corporation -# Custom EAL processing. EAL is complicated enough that it can't just -# have a straight list of headers and source files. -# Initially pull in common settings -eal_inc = [global_inc] +includes += global_inc subdir('include') subdir('common') -# Now do OS/exec-env specific settings, including building kernel modules -# The /eal/meson.build file should define env_sources, etc. 
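The RTE_CPU_* helpers that linux/include/rte_os.h adds above wrap glibc's cpu_set_t so that common EAL code can manipulate core masks portably. A small usage sketch (the function names are invented for the example; with glibc the CPU_* macros from <sched.h> need _GNU_SOURCE, which DPDK builds already define):

#define _GNU_SOURCE
#include <sched.h>
#include <rte_os.h>

/* every CPU index up to CPU_SETSIZE except the excluded one */
static void
all_cores_except(rte_cpuset_t *set, unsigned int excluded)
{
	RTE_CPU_FILL(set);
	CPU_CLR(excluded, set);
}

/* complement of an existing mask, again over the full CPU_SETSIZE range */
static void
invert_mask(rte_cpuset_t *dst, const rte_cpuset_t *src)
{
	RTE_CPU_NOT(dst, src);
}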
dpdk_conf.set('RTE_EXEC_ENV_' + exec_env.to_upper(), 1) -subdir(exec_env + '/eal') +subdir(exec_env) subdir(arch_subdir) @@ -27,7 +22,3 @@ endif if cc.has_header('getopt.h') cflags += ['-DHAVE_GETOPT_H', '-DHAVE_GETOPT', '-DHAVE_GETOPT_LONG'] endif -sources += env_sources -objs = env_objs -headers += env_headers -includes += eal_inc diff --git a/lib/librte_eal/windows/eal.c b/lib/librte_eal/windows/eal.c new file mode 100644 index 0000000000..e4b50df3b7 --- /dev/null +++ b/lib/librte_eal/windows/eal.c @@ -0,0 +1,272 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Intel Corporation + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + /* Allow the application to print its usage message too if set */ +static rte_usage_hook_t rte_application_usage_hook; + +/* define fd variable here, because file needs to be kept open for the + * duration of the program, as we hold a write lock on it in the primary proc + */ +static int mem_cfg_fd = -1; + +/* early configuration structure, when memory config is not mmapped */ +static struct rte_mem_config early_mem_config; + +/* Address of global and public configuration */ +static struct rte_config rte_config = { + .mem_config = &early_mem_config, +}; + +/* internal configuration (per-core) */ +struct lcore_config lcore_config[RTE_MAX_LCORE]; + +/* internal configuration */ +struct internal_config internal_config; + +/* platform-specific runtime dir */ +static char runtime_dir[PATH_MAX]; + +const char * +rte_eal_get_runtime_dir(void) +{ + return runtime_dir; +} + +/* Return a pointer to the configuration structure */ +struct rte_config * +rte_eal_get_configuration(void) +{ + return &rte_config; +} + +/* Detect if we are a primary or a secondary process */ +enum rte_proc_type_t +eal_proc_type_detect(void) +{ + enum rte_proc_type_t ptype = RTE_PROC_PRIMARY; + const char *pathname = eal_runtime_config_path(); + + /* if we can open the file but not get a write-lock we are a secondary + * process. NOTE: if we get a file handle back, we keep that open + * and don't close it to prevent a race condition between multiple opens + */ + errno_t err = _sopen_s(&mem_cfg_fd, pathname, + _O_RDWR, _SH_DENYNO, _S_IREAD | _S_IWRITE); + if (err == 0) { + OVERLAPPED soverlapped = { 0 }; + soverlapped.Offset = sizeof(*rte_config.mem_config); + soverlapped.OffsetHigh = 0; + + HANDLE hwinfilehandle = (HANDLE)_get_osfhandle(mem_cfg_fd); + + if (!LockFileEx(hwinfilehandle, + LOCKFILE_EXCLUSIVE_LOCK | LOCKFILE_FAIL_IMMEDIATELY, 0, + sizeof(*rte_config.mem_config), 0, &soverlapped)) + ptype = RTE_PROC_SECONDARY; + } + + RTE_LOG(INFO, EAL, "Auto-detected process type: %s\n", + ptype == RTE_PROC_PRIMARY ? 
"PRIMARY" : "SECONDARY"); + + return ptype; +} + +/* display usage */ +static void +eal_usage(const char *prgname) +{ + printf("\nUsage: %s ", prgname); + eal_common_usage(); + /* Allow the application to print its usage message too + * if hook is set + */ + if (rte_application_usage_hook) { + printf("===== Application Usage =====\n\n"); + rte_application_usage_hook(prgname); + } +} + +/* Parse the arguments for --log-level only */ +static void +eal_log_level_parse(int argc, char **argv) +{ + int opt; + char **argvopt; + int option_index; + + argvopt = argv; + + eal_reset_internal_config(&internal_config); + + while ((opt = getopt_long(argc, argvopt, eal_short_options, + eal_long_options, &option_index)) != EOF) { + + int ret; + + /* getopt is not happy, stop right now */ + if (opt == '?') + break; + + ret = (opt == OPT_LOG_LEVEL_NUM) ? + eal_parse_common_option(opt, optarg, + &internal_config) : 0; + + /* common parser is not happy */ + if (ret < 0) + break; + } + + optind = 0; /* reset getopt lib */ +} + +/* Parse the argument given in the command line of the application */ +__attribute__((optnone)) static int +eal_parse_args(int argc, char **argv) +{ + int opt, ret; + char **argvopt; + int option_index; + char *prgname = argv[0]; + + argvopt = argv; + + while ((opt = getopt_long(argc, argvopt, eal_short_options, + eal_long_options, &option_index)) != EOF) { + + int ret; + + /* getopt is not happy, stop right now */ + if (opt == '?') { + eal_usage(prgname); + return -1; + } + + ret = eal_parse_common_option(opt, optarg, &internal_config); + /* common parser is not happy */ + if (ret < 0) { + eal_usage(prgname); + return -1; + } + /* common parser handled this option */ + if (ret == 0) + continue; + + switch (opt) { + case 'h': + eal_usage(prgname); + exit(EXIT_SUCCESS); + default: + if (opt < OPT_LONG_MIN_NUM && isprint(opt)) { + RTE_LOG(ERR, EAL, "Option %c is not supported " + "on Windows\n", opt); + } else if (opt >= OPT_LONG_MIN_NUM && + opt < OPT_LONG_MAX_NUM) { + RTE_LOG(ERR, EAL, "Option %s is not supported " + "on Windows\n", + eal_long_options[option_index].name); + } else { + RTE_LOG(ERR, EAL, "Option %d is not supported " + "on Windows\n", opt); + } + eal_usage(prgname); + return -1; + } + } + + if (eal_adjust_config(&internal_config) != 0) + return -1; + + /* sanity checks */ + if (eal_check_common_options(&internal_config) != 0) { + eal_usage(prgname); + return -1; + } + + if (optind >= 0) + argv[optind - 1] = prgname; + ret = optind - 1; + optind = 0; /* reset getopt lib */ + return ret; +} + +static int +sync_func(void *arg __rte_unused) +{ + return 0; +} + +static void +rte_eal_init_alert(const char *msg) +{ + fprintf(stderr, "EAL: FATAL: %s\n", msg); + RTE_LOG(ERR, EAL, "%s\n", msg); +} + + /* Launch threads, called at application init(). 
*/ +int +rte_eal_init(int argc, char **argv) +{ + int i, fctret; + + eal_log_level_parse(argc, argv); + + /* create a map of all processors in the system */ + eal_create_cpu_map(); + + if (rte_eal_cpu_init() < 0) { + rte_eal_init_alert("Cannot detect lcores."); + rte_errno = ENOTSUP; + return -1; + } + + fctret = eal_parse_args(argc, argv); + if (fctret < 0) + exit(1); + + eal_thread_init_master(rte_config.master_lcore); + + RTE_LCORE_FOREACH_SLAVE(i) { + + /* + * create communication pipes between master thread + * and children + */ + if (_pipe(lcore_config[i].pipe_master2slave, + sizeof(char), _O_BINARY) < 0) + rte_panic("Cannot create pipe\n"); + if (_pipe(lcore_config[i].pipe_slave2master, + sizeof(char), _O_BINARY) < 0) + rte_panic("Cannot create pipe\n"); + + lcore_config[i].state = WAIT; + + /* create a thread for each lcore */ + if (eal_thread_create(&lcore_config[i].thread_id) != 0) + rte_panic("Cannot create thread\n"); + } + + /* + * Launch a dummy function on all slave lcores, so that master lcore + * knows they are all ready when this function returns. + */ + rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER); + rte_eal_mp_wait_lcore(); + return fctret; +} diff --git a/lib/librte_eal/windows/eal/eal.c b/lib/librte_eal/windows/eal/eal.c deleted file mode 100644 index e4b50df3b7..0000000000 --- a/lib/librte_eal/windows/eal/eal.c +++ /dev/null @@ -1,272 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2019 Intel Corporation - */ - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - /* Allow the application to print its usage message too if set */ -static rte_usage_hook_t rte_application_usage_hook; - -/* define fd variable here, because file needs to be kept open for the - * duration of the program, as we hold a write lock on it in the primary proc - */ -static int mem_cfg_fd = -1; - -/* early configuration structure, when memory config is not mmapped */ -static struct rte_mem_config early_mem_config; - -/* Address of global and public configuration */ -static struct rte_config rte_config = { - .mem_config = &early_mem_config, -}; - -/* internal configuration (per-core) */ -struct lcore_config lcore_config[RTE_MAX_LCORE]; - -/* internal configuration */ -struct internal_config internal_config; - -/* platform-specific runtime dir */ -static char runtime_dir[PATH_MAX]; - -const char * -rte_eal_get_runtime_dir(void) -{ - return runtime_dir; -} - -/* Return a pointer to the configuration structure */ -struct rte_config * -rte_eal_get_configuration(void) -{ - return &rte_config; -} - -/* Detect if we are a primary or a secondary process */ -enum rte_proc_type_t -eal_proc_type_detect(void) -{ - enum rte_proc_type_t ptype = RTE_PROC_PRIMARY; - const char *pathname = eal_runtime_config_path(); - - /* if we can open the file but not get a write-lock we are a secondary - * process. 
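The new windows/eal.c above wires up the same master/slave launch flow as the Unix EALs: rte_eal_init() parses the arguments, spawns one thread per slave lcore and confirms they are ready with a dummy sync_func() launch. From an application's point of view the call sequence is the usual one; a minimal sketch (hypothetical program, not from the tree):

#include <stdio.h>
#include <rte_common.h>
#include <rte_debug.h>
#include <rte_eal.h>
#include <rte_launch.h>
#include <rte_lcore.h>

static int
hello(void *arg __rte_unused)
{
	printf("hello from lcore %u\n", rte_lcore_id());
	return 0;
}

int
main(int argc, char **argv)
{
	if (rte_eal_init(argc, argv) < 0)
		rte_panic("Cannot init EAL\n");

	/* run hello() on every slave lcore, then wait for all of them */
	rte_eal_mp_remote_launch(hello, NULL, SKIP_MASTER);
	rte_eal_mp_wait_lcore();

	return 0;
}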
NOTE: if we get a file handle back, we keep that open - * and don't close it to prevent a race condition between multiple opens - */ - errno_t err = _sopen_s(&mem_cfg_fd, pathname, - _O_RDWR, _SH_DENYNO, _S_IREAD | _S_IWRITE); - if (err == 0) { - OVERLAPPED soverlapped = { 0 }; - soverlapped.Offset = sizeof(*rte_config.mem_config); - soverlapped.OffsetHigh = 0; - - HANDLE hwinfilehandle = (HANDLE)_get_osfhandle(mem_cfg_fd); - - if (!LockFileEx(hwinfilehandle, - LOCKFILE_EXCLUSIVE_LOCK | LOCKFILE_FAIL_IMMEDIATELY, 0, - sizeof(*rte_config.mem_config), 0, &soverlapped)) - ptype = RTE_PROC_SECONDARY; - } - - RTE_LOG(INFO, EAL, "Auto-detected process type: %s\n", - ptype == RTE_PROC_PRIMARY ? "PRIMARY" : "SECONDARY"); - - return ptype; -} - -/* display usage */ -static void -eal_usage(const char *prgname) -{ - printf("\nUsage: %s ", prgname); - eal_common_usage(); - /* Allow the application to print its usage message too - * if hook is set - */ - if (rte_application_usage_hook) { - printf("===== Application Usage =====\n\n"); - rte_application_usage_hook(prgname); - } -} - -/* Parse the arguments for --log-level only */ -static void -eal_log_level_parse(int argc, char **argv) -{ - int opt; - char **argvopt; - int option_index; - - argvopt = argv; - - eal_reset_internal_config(&internal_config); - - while ((opt = getopt_long(argc, argvopt, eal_short_options, - eal_long_options, &option_index)) != EOF) { - - int ret; - - /* getopt is not happy, stop right now */ - if (opt == '?') - break; - - ret = (opt == OPT_LOG_LEVEL_NUM) ? - eal_parse_common_option(opt, optarg, - &internal_config) : 0; - - /* common parser is not happy */ - if (ret < 0) - break; - } - - optind = 0; /* reset getopt lib */ -} - -/* Parse the argument given in the command line of the application */ -__attribute__((optnone)) static int -eal_parse_args(int argc, char **argv) -{ - int opt, ret; - char **argvopt; - int option_index; - char *prgname = argv[0]; - - argvopt = argv; - - while ((opt = getopt_long(argc, argvopt, eal_short_options, - eal_long_options, &option_index)) != EOF) { - - int ret; - - /* getopt is not happy, stop right now */ - if (opt == '?') { - eal_usage(prgname); - return -1; - } - - ret = eal_parse_common_option(opt, optarg, &internal_config); - /* common parser is not happy */ - if (ret < 0) { - eal_usage(prgname); - return -1; - } - /* common parser handled this option */ - if (ret == 0) - continue; - - switch (opt) { - case 'h': - eal_usage(prgname); - exit(EXIT_SUCCESS); - default: - if (opt < OPT_LONG_MIN_NUM && isprint(opt)) { - RTE_LOG(ERR, EAL, "Option %c is not supported " - "on Windows\n", opt); - } else if (opt >= OPT_LONG_MIN_NUM && - opt < OPT_LONG_MAX_NUM) { - RTE_LOG(ERR, EAL, "Option %s is not supported " - "on Windows\n", - eal_long_options[option_index].name); - } else { - RTE_LOG(ERR, EAL, "Option %d is not supported " - "on Windows\n", opt); - } - eal_usage(prgname); - return -1; - } - } - - if (eal_adjust_config(&internal_config) != 0) - return -1; - - /* sanity checks */ - if (eal_check_common_options(&internal_config) != 0) { - eal_usage(prgname); - return -1; - } - - if (optind >= 0) - argv[optind - 1] = prgname; - ret = optind - 1; - optind = 0; /* reset getopt lib */ - return ret; -} - -static int -sync_func(void *arg __rte_unused) -{ - return 0; -} - -static void -rte_eal_init_alert(const char *msg) -{ - fprintf(stderr, "EAL: FATAL: %s\n", msg); - RTE_LOG(ERR, EAL, "%s\n", msg); -} - - /* Launch threads, called at application init(). 
*/ -int -rte_eal_init(int argc, char **argv) -{ - int i, fctret; - - eal_log_level_parse(argc, argv); - - /* create a map of all processors in the system */ - eal_create_cpu_map(); - - if (rte_eal_cpu_init() < 0) { - rte_eal_init_alert("Cannot detect lcores."); - rte_errno = ENOTSUP; - return -1; - } - - fctret = eal_parse_args(argc, argv); - if (fctret < 0) - exit(1); - - eal_thread_init_master(rte_config.master_lcore); - - RTE_LCORE_FOREACH_SLAVE(i) { - - /* - * create communication pipes between master thread - * and children - */ - if (_pipe(lcore_config[i].pipe_master2slave, - sizeof(char), _O_BINARY) < 0) - rte_panic("Cannot create pipe\n"); - if (_pipe(lcore_config[i].pipe_slave2master, - sizeof(char), _O_BINARY) < 0) - rte_panic("Cannot create pipe\n"); - - lcore_config[i].state = WAIT; - - /* create a thread for each lcore */ - if (eal_thread_create(&lcore_config[i].thread_id) != 0) - rte_panic("Cannot create thread\n"); - } - - /* - * Launch a dummy function on all slave lcores, so that master lcore - * knows they are all ready when this function returns. - */ - rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER); - rte_eal_mp_wait_lcore(); - return fctret; -} diff --git a/lib/librte_eal/windows/eal/eal_debug.c b/lib/librte_eal/windows/eal/eal_debug.c deleted file mode 100644 index 669be6ff97..0000000000 --- a/lib/librte_eal/windows/eal/eal_debug.c +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2019 Intel Corporation - */ - -#include -#include -#include - - /* call abort(), it will generate a coredump if enabled */ -void -__rte_panic(const char *funcname, const char *format, ...) -{ - va_list ap; - - rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, "PANIC in %s():\n", funcname); - va_start(ap, format); - rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap); - va_end(ap); - abort(); -} diff --git a/lib/librte_eal/windows/eal/eal_lcore.c b/lib/librte_eal/windows/eal/eal_lcore.c deleted file mode 100644 index b3a6c63afa..0000000000 --- a/lib/librte_eal/windows/eal/eal_lcore.c +++ /dev/null @@ -1,103 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2019 Intel Corporation - */ - -#include - -#include - -#include "eal_private.h" -#include "eal_thread.h" - -/* global data structure that contains the CPU map */ -static struct _wcpu_map { - unsigned int total_procs; - unsigned int proc_sockets; - unsigned int proc_cores; - unsigned int reserved; - struct _win_lcore_map { - uint8_t socket_id; - uint8_t core_id; - } wlcore_map[RTE_MAX_LCORE]; -} wcpu_map = { 0 }; - -/* - * Create a map of all processors and associated cores on the system - */ -void -eal_create_cpu_map() -{ - wcpu_map.total_procs = - GetActiveProcessorCount(ALL_PROCESSOR_GROUPS); - - LOGICAL_PROCESSOR_RELATIONSHIP lprocRel; - DWORD lprocInfoSize = 0; - BOOL ht_enabled = FALSE; - - /* First get the processor package information */ - lprocRel = RelationProcessorPackage; - /* Determine the size of buffer we need (pass NULL) */ - GetLogicalProcessorInformationEx(lprocRel, NULL, &lprocInfoSize); - wcpu_map.proc_sockets = lprocInfoSize / 48; - - lprocInfoSize = 0; - /* Next get the processor core information */ - lprocRel = RelationProcessorCore; - GetLogicalProcessorInformationEx(lprocRel, NULL, &lprocInfoSize); - wcpu_map.proc_cores = lprocInfoSize / 48; - - if (wcpu_map.total_procs > wcpu_map.proc_cores) - ht_enabled = TRUE; - - /* Distribute the socket and core ids appropriately - * across the logical cores. For now, split the cores - * equally across the sockets. 
- */ - unsigned int lcore = 0; - for (unsigned int socket = 0; socket < - wcpu_map.proc_sockets; ++socket) { - for (unsigned int core = 0; - core < (wcpu_map.proc_cores / wcpu_map.proc_sockets); - ++core) { - wcpu_map.wlcore_map[lcore] - .socket_id = socket; - wcpu_map.wlcore_map[lcore] - .core_id = core; - lcore++; - if (ht_enabled) { - wcpu_map.wlcore_map[lcore] - .socket_id = socket; - wcpu_map.wlcore_map[lcore] - .core_id = core; - lcore++; - } - } - } -} - -/* - * Check if a cpu is present by the presence of the cpu information for it - */ -int -eal_cpu_detected(unsigned int lcore_id) -{ - return (lcore_id < wcpu_map.total_procs); -} - -/* - * Get CPU socket id for a logical core - */ -unsigned -eal_cpu_socket_id(unsigned int lcore_id) -{ - return wcpu_map.wlcore_map[lcore_id].socket_id; -} - -/* - * Get CPU socket id (NUMA node) for a logical core - */ -unsigned -eal_cpu_core_id(unsigned int lcore_id) -{ - return wcpu_map.wlcore_map[lcore_id].core_id; -} diff --git a/lib/librte_eal/windows/eal/eal_thread.c b/lib/librte_eal/windows/eal/eal_thread.c deleted file mode 100644 index 9e4bbaa082..0000000000 --- a/lib/librte_eal/windows/eal/eal_thread.c +++ /dev/null @@ -1,165 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2019 Intel Corporation - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "eal_private.h" - -RTE_DEFINE_PER_LCORE(unsigned int, _lcore_id) = LCORE_ID_ANY; -RTE_DEFINE_PER_LCORE(unsigned int, _socket_id) = (unsigned int)SOCKET_ID_ANY; -RTE_DEFINE_PER_LCORE(rte_cpuset_t, _cpuset); - -/* - * Send a message to a slave lcore identified by slave_id to call a - * function f with argument arg. Once the execution is done, the - * remote lcore switch in FINISHED state. 
- */ -int -rte_eal_remote_launch(lcore_function_t *f, void *arg, unsigned int slave_id) -{ - int n; - char c = 0; - int m2s = lcore_config[slave_id].pipe_master2slave[1]; - int s2m = lcore_config[slave_id].pipe_slave2master[0]; - - if (lcore_config[slave_id].state != WAIT) - return -EBUSY; - - lcore_config[slave_id].f = f; - lcore_config[slave_id].arg = arg; - - /* send message */ - n = 0; - while (n == 0 || (n < 0 && errno == EINTR)) - n = _write(m2s, &c, 1); - if (n < 0) - rte_panic("cannot write on configuration pipe\n"); - - /* wait ack */ - do { - n = _read(s2m, &c, 1); - } while (n < 0 && errno == EINTR); - - if (n <= 0) - rte_panic("cannot read on configuration pipe\n"); - - return 0; -} - -void -eal_thread_init_master(unsigned int lcore_id) -{ - /* set the lcore ID in per-lcore memory area */ - RTE_PER_LCORE(_lcore_id) = lcore_id; -} - -static inline pthread_t -eal_thread_self(void) -{ - return GetCurrentThreadId(); -} - -/* main loop of threads */ -void * -eal_thread_loop(void *arg __rte_unused) -{ - char c; - int n, ret; - unsigned int lcore_id; - pthread_t thread_id; - int m2s, s2m; - char cpuset[RTE_CPU_AFFINITY_STR_LEN]; - - thread_id = eal_thread_self(); - - /* retrieve our lcore_id from the configuration structure */ - RTE_LCORE_FOREACH_SLAVE(lcore_id) { - if (thread_id == lcore_config[lcore_id].thread_id) - break; - } - if (lcore_id == RTE_MAX_LCORE) - rte_panic("cannot retrieve lcore id\n"); - - m2s = lcore_config[lcore_id].pipe_master2slave[0]; - s2m = lcore_config[lcore_id].pipe_slave2master[1]; - - /* set the lcore ID in per-lcore memory area */ - RTE_PER_LCORE(_lcore_id) = lcore_id; - - RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%zx;cpuset=[%s])\n", - lcore_id, (uintptr_t)thread_id, cpuset); - - /* read on our pipe to get commands */ - while (1) { - void *fct_arg; - - /* wait command */ - do { - n = _read(m2s, &c, 1); - } while (n < 0 && errno == EINTR); - - if (n <= 0) - rte_panic("cannot read on configuration pipe\n"); - - lcore_config[lcore_id].state = RUNNING; - - /* send ack */ - n = 0; - while (n == 0 || (n < 0 && errno == EINTR)) - n = _write(s2m, &c, 1); - if (n < 0) - rte_panic("cannot write on configuration pipe\n"); - - if (lcore_config[lcore_id].f == NULL) - rte_panic("NULL function pointer\n"); - - /* call the function and store the return value */ - fct_arg = lcore_config[lcore_id].arg; - ret = lcore_config[lcore_id].f(fct_arg); - lcore_config[lcore_id].ret = ret; - rte_wmb(); - - /* when a service core returns, it should go directly to WAIT - * state, because the application will not lcore_wait() for it. 
- */ - if (lcore_config[lcore_id].core_role == ROLE_SERVICE) - lcore_config[lcore_id].state = WAIT; - else - lcore_config[lcore_id].state = FINISHED; - } -} - -/* function to create threads */ -int -eal_thread_create(pthread_t *thread) -{ - HANDLE th; - - th = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)eal_thread_loop, - NULL, 0, (LPDWORD)thread); - if (!th) - return -1; - - SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS); - SetThreadPriority(th, THREAD_PRIORITY_TIME_CRITICAL); - - return 0; -} - -int -rte_thread_setname(__rte_unused pthread_t id, __rte_unused const char *name) -{ - /* TODO */ - /* This is a stub, not the expected result */ - return 0; -} diff --git a/lib/librte_eal/windows/eal/getopt.c b/lib/librte_eal/windows/eal/getopt.c deleted file mode 100644 index 170c9b5e0c..0000000000 --- a/lib/librte_eal/windows/eal/getopt.c +++ /dev/null @@ -1,470 +0,0 @@ -/* SPDX-License-Identifier: ISC AND BSD-2-Clause - * Copyright (c) 2002 Todd C. Miller - * - * Sponsored in part by the Defense Advanced Research Projects - * Agency (DARPA) and Air Force Research Laboratory, Air Force - * Materiel Command, USAF, under agreement number F39502-99-1-0512. - */ -/* - * Copyright (c) 2000 The NetBSD Foundation, Inc. - * All rights reserved. - * - * This code is derived from software contributed to The NetBSD Foundation - * by Dieter Baron and Thomas Klausner. - */ - -#include - -#ifdef NEED_USUAL_GETOPT - -#include -#include - -const char *optarg; /* argument associated with option */ -int opterr = 1; /* if error message should be printed */ -int optind = 1; /* index into parent argv vector */ -int optopt = '?'; /* character checked for validity */ - -static void pass(void) {} -#define warnx(a, ...) pass() - -#define PRINT_ERROR ((opterr) && (*options != ':')) - -#define FLAG_PERMUTE 0x01 /* permute non-options to the end of argv */ -#define FLAG_ALLARGS 0x02 /* treat non-options as args to option "-1" */ -#define FLAG_LONGONLY 0x04 /* operate as getopt_long_only */ - -/* return values */ -#define BADCH ((int)'?') -#define BADARG ((*options == ':') ? (int)':' : (int)'?') -#define INORDER 1 - -#define EMSG "" - -static const char *place = EMSG; /* option letter processing */ - -/* XXX: set optreset to 1 rather than these two */ -static int nonopt_start = -1; /* first non option argument (for permute) */ -static int nonopt_end = -1; /* first option after non options (for permute) */ - -/* Error messages */ -static const char recargchar[] = "option requires an argument -- %c"; -static const char recargstring[] = "option requires an argument -- %s"; -static const char ambig[] = "ambiguous option -- %.*s"; -static const char noarg[] = "option doesn't take an argument -- %.*s"; -static const char illoptchar[] = "unknown option -- %c"; -static const char illoptstring[] = "unknown option -- %s"; - -/* - * Compute the greatest common divisor of a and b. - */ -static int -gcd(int a, int b) -{ - int c; - - c = a % b; - while (c != 0) { - a = b; - b = c; - c = a % b; - } - - return (b); -} - -/* - * Exchange the block from nonopt_start to nonopt_end with the block - * from nonopt_end to opt_end (keeping the same order of arguments - * in each block). 
- */ -static void -permute_args(int panonopt_start, int panonopt_end, int opt_end, - char **nargv) -{ - int cstart, cyclelen, i, j, ncycle, nnonopts, nopts, pos; - char *swap; - - /* - * compute lengths of blocks and number and size of cycles - */ - nnonopts = panonopt_end - panonopt_start; - nopts = opt_end - panonopt_end; - ncycle = gcd(nnonopts, nopts); - cyclelen = (opt_end - panonopt_start) / ncycle; - - for (i = 0; i < ncycle; i++) { - cstart = panonopt_end+i; - pos = cstart; - for (j = 0; j < cyclelen; j++) { - if (pos >= panonopt_end) - pos -= nnonopts; - else - pos += nopts; - swap = nargv[pos]; - /* LINTED const cast */ - ((char **) nargv)[pos] = nargv[cstart]; - /* LINTED const cast */ - ((char **)nargv)[cstart] = swap; - } - } -} - -/* - * parse_long_options -- - * Parse long options in argc/argv argument vector. - * Returns -1 if short_too is set and the option does not match long_options. - */ -static int -parse_long_options(char **nargv, const char *options, - const struct option *long_options, int *idx, int short_too) -{ - const char *current_argv; - char *has_equal; - size_t current_argv_len; - int i, match; - - current_argv = place; - match = -1; - - optind++; - - has_equal = strchr(current_argv, '='); - if (has_equal != NULL) { - /* argument found (--option=arg) */ - current_argv_len = has_equal - current_argv; - has_equal++; - } else - current_argv_len = strlen(current_argv); - - for (i = 0; long_options[i].name; i++) { - /* find matching long option */ - if (strncmp(current_argv, long_options[i].name, - current_argv_len)) - continue; - - if (strlen(long_options[i].name) == current_argv_len) { - /* exact match */ - match = i; - break; - } - /* - * If this is a known short option, don't allow - * a partial match of a single character. - */ - if (short_too && current_argv_len == 1) - continue; - - if (match == -1) /* partial match */ - match = i; - else { - /* ambiguous abbreviation */ - if (PRINT_ERROR) - warnx(ambig, (int)current_argv_len, - current_argv); - optopt = 0; - return BADCH; - } - } - if (match != -1) { /* option found */ - if (long_options[match].has_arg == no_argument - && has_equal) { - if (PRINT_ERROR) - warnx(noarg, (int)current_argv_len, - current_argv); - /* - * XXX: GNU sets optopt to val regardless of flag - */ - if (long_options[match].flag == NULL) - optopt = long_options[match].val; - else - optopt = 0; - return BADARG; - } - if (long_options[match].has_arg == required_argument || - long_options[match].has_arg == optional_argument) { - if (has_equal) - optarg = has_equal; - else if (long_options[match].has_arg == - required_argument) { - /* - * optional argument doesn't use next nargv - */ - optarg = nargv[optind++]; - } - } - if ((long_options[match].has_arg == required_argument) - && (optarg == NULL)) { - /* - * Missing argument; leading ':' indicates no error - * should be generated. - */ - if (PRINT_ERROR) - warnx(recargstring, - current_argv); - /* - * XXX: GNU sets optopt to val regardless of flag - */ - if (long_options[match].flag == NULL) - optopt = long_options[match].val; - else - optopt = 0; - --optind; - return BADARG; - } - } else { /* unknown option */ - if (short_too) { - --optind; - return (-1); - } - if (PRINT_ERROR) - warnx(illoptstring, current_argv); - optopt = 0; - return BADCH; - } - if (idx) - *idx = match; - if (long_options[match].flag) { - *long_options[match].flag = long_options[match].val; - return 0; - } else - return (long_options[match].val); -} - -/* - * getopt_internal -- - * Parse argc/argv argument vector. 
Called by user level routines. - */ -static int -getopt_internal(int nargc, char **nargv, const char *options, - const struct option *long_options, int *idx, int flags) -{ - char *oli; /* option letter list index */ - int optchar, short_too; - static int posixly_correct = -1; - char *buf; - size_t len; - int optreset = 0; - - if (options == NULL) - return (-1); - - /* - * Disable GNU extensions if POSIXLY_CORRECT is set or options - * string begins with a '+'. - */ - if (posixly_correct == -1) - posixly_correct = _dupenv_s(&buf, &len, "POSIXLY_CORRECT"); - if (!posixly_correct || *options == '+') - flags &= ~FLAG_PERMUTE; - else if (*options == '-') - flags |= FLAG_ALLARGS; - if (*options == '+' || *options == '-') - options++; - if (!posixly_correct) - free(buf); - /* - * reset if requested - */ - if (optind == 0) - optind = optreset = 1; - - optarg = NULL; - if (optreset) - nonopt_start = nonopt_end = -1; -start: - if (optreset || !*place) { /* update scanning pointer */ - optreset = 0; - if (optind >= nargc) { /* end of argument vector */ - place = EMSG; - if (nonopt_end != -1) { - /* do permutation, if we have to */ - permute_args(nonopt_start, nonopt_end, - optind, nargv); - optind -= nonopt_end - nonopt_start; - } else if (nonopt_start != -1) { - /* - * If we skipped non-options, set optind - * to the first of them. - */ - optind = nonopt_start; - } - nonopt_start = nonopt_end = -1; - return (-1); - } - place = nargv[optind]; - if (*place != '-' || - (place[1] == '\0' && strchr(options, '-') == NULL)) { - place = EMSG; /* found non-option */ - if (flags & FLAG_ALLARGS) { - /* - * GNU extension: - * return non-option as argument to option 1 - */ - optarg = nargv[optind++]; - return INORDER; - } - if (!(flags & FLAG_PERMUTE)) { - /* - * If no permutation wanted, stop parsing - * at first non-option. - */ - return (-1); - } - /* do permutation */ - if (nonopt_start == -1) - nonopt_start = optind; - else if (nonopt_end != -1) { - permute_args(nonopt_start, nonopt_end, - optind, nargv); - nonopt_start = optind - - (nonopt_end - nonopt_start); - nonopt_end = -1; - } - optind++; - /* process next argument */ - goto start; - } - if (nonopt_start != -1 && nonopt_end == -1) - nonopt_end = optind; - - /* - * If we have "-" do nothing, if "--" we are done. - */ - if (place[1] != '\0' && *++place == '-' && place[1] == '\0') { - optind++; - place = EMSG; - /* - * We found an option (--), so if we skipped - * non-options, we have to permute. - */ - if (nonopt_end != -1) { - permute_args(nonopt_start, nonopt_end, - optind, nargv); - optind -= nonopt_end - nonopt_start; - } - nonopt_start = nonopt_end = -1; - return (-1); - } - } - - /* - * Check long options if: - * 1) we were passed some - * 2) the arg is not just "-" - * 3) either the arg starts with -- we are getopt_long_only() - */ - if (long_options != NULL && place != nargv[optind] && - (*place == '-' || (flags & FLAG_LONGONLY))) { - short_too = 0; - if (*place == '-') - place++; /* --foo long option */ - else if (*place != ':' && strchr(options, *place) != NULL) - short_too = 1; /* could be short option too */ - - optchar = parse_long_options(nargv, options, long_options, - idx, short_too); - if (optchar != -1) { - place = EMSG; - return optchar; - } - } - - optchar = (int)*place++; - oli = strchr(options, optchar); - if (optchar == (int)':' || - (optchar == (int)'-' && *place != '\0') || - oli == NULL) { - /* - * If the user specified "-" and '-' isn't listed in - * options, return -1 (non-option) as per POSIX. 
- * Otherwise, it is an unknown option character (or ':'). - */ - if (optchar == (int)'-' && *place == '\0') - return (-1); - if (!*place) - ++optind; - if (PRINT_ERROR) - warnx(illoptchar, optchar); - optopt = optchar; - return BADCH; - } - if (long_options != NULL && optchar == 'W' && oli[1] == ';') { - /* -W long-option */ - if (*place) - ; - else if (++optind >= nargc) { /* no arg */ - place = EMSG; - if (PRINT_ERROR) - warnx(recargchar, optchar); - optopt = optchar; - return BADARG; - } /* white space */ - place = nargv[optind]; - optchar = parse_long_options(nargv, options, long_options, - idx, 0); - place = EMSG; - return optchar; - } - if (*++oli != ':') { /* doesn't take argument */ - if (!*place) - ++optind; - } else { /* takes (optional) argument */ - optarg = NULL; - if (*place) /* no white space */ - optarg = place; - else if (oli[1] != ':') { /* arg not optional */ - if (++optind >= nargc) { /* no arg */ - place = EMSG; - if (PRINT_ERROR) - warnx(recargchar, optchar); - optopt = optchar; - return BADARG; - } - optarg = nargv[optind]; - } - place = EMSG; - ++optind; - } - /* dump back option letter */ - return optchar; -} - -/* - * getopt -- - * Parse argc/argv argument vector. - */ -int -getopt(int nargc, char *nargv[], const char *options) -{ - return getopt_internal(nargc, nargv, options, NULL, NULL, - FLAG_PERMUTE); -} - -/* - * getopt_long -- - * Parse argc/argv argument vector. - */ -int -getopt_long(int nargc, char *nargv[], const char *options, - const struct option *long_options, int *idx) -{ - - return (getopt_internal(nargc, nargv, options, long_options, idx, - FLAG_PERMUTE)); -} - -/* - * getopt_long_only -- - * Parse argc/argv argument vector. - */ -int -getopt_long_only(int nargc, char *nargv[], const char *options, - const struct option *long_options, int *idx) -{ - - return (getopt_internal(nargc, nargv, options, long_options, idx, - FLAG_PERMUTE|FLAG_LONGONLY)); -} - -#endif /* NEED_USUAL_GETOPT */ diff --git a/lib/librte_eal/windows/eal/include/dirent.h b/lib/librte_eal/windows/eal/include/dirent.h deleted file mode 100644 index 869a598378..0000000000 --- a/lib/librte_eal/windows/eal/include/dirent.h +++ /dev/null @@ -1,664 +0,0 @@ -/* SPDX-License-Identifier: MIT - * Dirent interface for Microsoft Visual Studio - * Version 1.21 - * Copyright (C) 2006-2012 Toni Ronkko - * https://github.com/tronkko/dirent - */ - -#ifndef DIRENT_H -#define DIRENT_H - -/* - * Include windows.h without Windows Sockets 1.1 to prevent conflicts with - * Windows Sockets 2.0. - */ -#ifndef WIN32_LEAN_AND_MEAN -# define WIN32_LEAN_AND_MEAN -#endif - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* Maximum length of file name */ -#if !defined(PATH_MAX) -# define PATH_MAX MAX_PATH -#endif - -/* File type flags for d_type */ -#define DT_UNKNOWN 0 -#define DT_REG S_IFREG -#define DT_DIR S_IFDIR -#define DT_CHR S_IFCHR - -/* - * File type macros. Note that block devices, sockets and links cannot be - * distinguished on Windows and the macros S_ISBLK, S_ISSOCK and S_ISLNK are - * only defined for compatibility. These macros should always return false - * on Windows. 
- */ -#if !defined(S_ISDIR) -# define S_ISDIR(mode) (((mode) & S_IFMT) == S_IFDIR) -#endif -#if !defined(S_ISREG) -# define S_ISREG(mode) (((mode) & S_IFMT) == S_IFREG) -#endif - -/* Wide-character version */ -struct _wdirent { - /* Always zero */ - long d_ino; - - /* Structure size */ - unsigned short d_reclen; - - /* Length of name without \0 */ - size_t d_namlen; - - /* File type */ - int d_type; - - /* File name */ - wchar_t d_name[PATH_MAX]; -}; -typedef struct _wdirent _wdirent; - -struct _WDIR { - /* Current directory entry */ - struct _wdirent ent; - - /* Private file data */ - WIN32_FIND_DATAW data; - - /* True if data is valid */ - int cached; - - /* Win32 search handle */ - HANDLE handle; - - /* Initial directory name */ - wchar_t *patt; -}; -typedef struct _WDIR _WDIR; - -static _WDIR *_wopendir(const wchar_t *dirname); -static int _wclosedir(_WDIR *dirp); - -/* For compatibility with Symbian */ -#define wdirent _wdirent -#define WDIR _WDIR -#define wopendir _wopendir -#define wclosedir _wclosedir - -/* Multi-byte character versions */ -struct dirent { - /* Always zero */ - long d_ino; - - /* Structure size */ - unsigned short d_reclen; - - /* Length of name without \0 */ - size_t d_namlen; - - /* File type */ - int d_type; - - /* File name */ - char d_name[PATH_MAX]; -}; -typedef struct dirent dirent; - -struct DIR { - struct dirent ent; - struct _WDIR *wdirp; -}; -typedef struct DIR DIR; - -static DIR *opendir(const char *dirname); -static struct dirent *readdir(DIR *dirp); -static int closedir(DIR *dirp); - -/* Internal utility functions */ -static WIN32_FIND_DATAW *dirent_first(_WDIR *dirp); -static WIN32_FIND_DATAW *dirent_next(_WDIR *dirp); - -static int dirent_mbstowcs_s( - size_t *pReturnValue, - wchar_t *wcstr, - size_t sizeInWords, - const char *mbstr, - size_t count); - -static int dirent_wcstombs_s( - size_t *pReturnValue, - char *mbstr, - size_t sizeInBytes, - const wchar_t *wcstr, - size_t count); - -static void dirent_set_errno(int error); - -/* - * Open directory stream DIRNAME for read and return a pointer to the - * internal working area that is used to retrieve individual directory - * entries. - */ -static _WDIR* -_wopendir(const wchar_t *dirname) -{ - _WDIR *dirp = NULL; - int error; - - /* Must have directory name */ - if (dirname == NULL || dirname[0] == '\0') { - dirent_set_errno(ENOENT); - return NULL; - } - - /* Allocate new _WDIR structure */ - dirp = (_WDIR *)malloc(sizeof(struct _WDIR)); - if (dirp != NULL) { - DWORD n; - - /* Reset _WDIR structure */ - dirp->handle = INVALID_HANDLE_VALUE; - dirp->patt = NULL; - dirp->cached = 0; - - /* Compute the length of full path plus zero terminator - * - * Note that on WinRT there's no way to convert relative paths - * into absolute paths, so just assume its an absolute path. - */ - #if defined(WINAPI_FAMILY) && (WINAPI_FAMILY == WINAPI_FAMILY_PHONE_APP) - n = wcslen(dirname); - #else - n = GetFullPathNameW(dirname, 0, NULL, NULL); - #endif - - /* Allocate room for absolute directory name and search - * pattern - */ - dirp->patt = (wchar_t *)malloc(sizeof(wchar_t) * n + 16); - if (dirp->patt) { - /* Convert relative directory name to an - * absolute one. This allows rewinddir() to - * function correctly even when current working - * directory is changed between opendir() - * and rewinddir(). - * - * Note that on WinRT there's no way to convert - * relative paths into absolute paths, so just - * assume its an absolute path. 
- */ - #if defined(WINAPI_FAMILY) && \ - (WINAPI_FAMILY == WINAPI_FAMILY_PHONE_APP) - wcsncpy_s(dirp->patt, n + 1, dirname, n); - #else - n = GetFullPathNameW(dirname, n, dirp->patt, NULL); - #endif - if (n > 0) { - wchar_t *p; - - /* Append search pattern \* to the directory - * name - */ - p = dirp->patt + n; - if (dirp->patt < p) { - switch (p[-1]) { - case '\\': - case '/': - case ':': - /* Directory ends in path separator, - * e.g.c:\temp\ - */ - /*NOP*/; - break; - - default: - /* Directory name doesn't end in path - * separator - */ - *p++ = '\\'; - } - } - *p++ = '*'; - *p = '\0'; - - /* Open directory stream and retrieve the first - * entry - */ - if (dirent_first(dirp)) { - /* Directory stream opened successfully */ - error = 0; - } else { - /* Cannot retrieve first entry */ - error = 1; - dirent_set_errno(ENOENT); - } - - } else { - /* Cannot retrieve full path name */ - dirent_set_errno(ENOENT); - error = 1; - } - - } else { - /* Cannot allocate memory for search pattern */ - error = 1; - } - - } else { - /* Cannot allocate _WDIR structure */ - error = 1; - } - - /* Clean up in case of error */ - if (error && dirp) { - _wclosedir(dirp); - dirp = NULL; - } - - return dirp; -} - -/* - * Close directory stream opened by opendir() function. - * This invalidates the DIR structure as well as any directory - * entry read previously by _wreaddir(). - */ -static int -_wclosedir(_WDIR *dirp) -{ - int ok; - if (dirp) { - - /* Release search handle */ - if (dirp->handle != INVALID_HANDLE_VALUE) { - FindClose(dirp->handle); - dirp->handle = INVALID_HANDLE_VALUE; - } - - /* Release search pattern */ - if (dirp->patt) { - free(dirp->patt); - dirp->patt = NULL; - } - - /* Release directory structure */ - free(dirp); - ok = /*success*/0; - - } else { - /* Invalid directory stream */ - dirent_set_errno(EBADF); - ok = /*failure*/-1; - } - return ok; -} - -/* Get first directory entry (internal) */ -static WIN32_FIND_DATAW* -dirent_first(_WDIR *dirp) -{ - WIN32_FIND_DATAW *datap; - - /* Open directory and retrieve the first entry */ - dirp->handle = FindFirstFileExW( - dirp->patt, FindExInfoStandard, &dirp->data, - FindExSearchNameMatch, NULL, 0); - if (dirp->handle != INVALID_HANDLE_VALUE) { - - /* a directory entry is now waiting in memory */ - datap = &dirp->data; - dirp->cached = 1; - - } else { - - /* Failed to re-open directory: no directory entry in memory */ - dirp->cached = 0; - datap = NULL; - - } - return datap; -} - -/* Get next directory entry (internal) */ -static WIN32_FIND_DATAW* -dirent_next(_WDIR *dirp) -{ - WIN32_FIND_DATAW *p; - - /* Get next directory entry */ - if (dirp->cached != 0) { - - /* A valid directory entry already in memory */ - p = &dirp->data; - dirp->cached = 0; - - } else if (dirp->handle != INVALID_HANDLE_VALUE) { - - /* Get the next directory entry from stream */ - if (FindNextFileW(dirp->handle, &dirp->data) != FALSE) { - /* Got a file */ - p = &dirp->data; - } else { - /* The very last entry has been processed - *or an error occurred - */ - FindClose(dirp->handle); - dirp->handle = INVALID_HANDLE_VALUE; - p = NULL; - } - - } else { - - /* End of directory stream reached */ - p = NULL; - - } - - return p; -} - -/* - * Open directory stream using plain old C-string. 
- */ -static DIR* -opendir(const char *dirname) -{ - struct DIR *dirp; - int error; - - /* Must have directory name */ - if (dirname == NULL || dirname[0] == '\0') { - dirent_set_errno(ENOENT); - return NULL; - } - - /* Allocate memory for DIR structure */ - dirp = (DIR *)malloc(sizeof(struct DIR)); - if (dirp) { - wchar_t wname[PATH_MAX]; - size_t n; - - /* Convert directory name to wide-character string */ - error = dirent_mbstowcs_s(&n, wname, PATH_MAX, - dirname, PATH_MAX); - if (!error) { - - /* Open directory stream using wide-character name */ - dirp->wdirp = _wopendir(wname); - if (dirp->wdirp) { - /* Directory stream opened */ - error = 0; - } else { - /* Failed to open directory stream */ - error = 1; - } - - } else { - /* - * Cannot convert file name to wide-character string. - * This occurs if the string contains invalid multi-byte - * sequences or the output buffer is too small to - * contain the resulting string. - */ - error = 1; - } - - } else { - /* Cannot allocate DIR structure */ - error = 1; - } - - /* Clean up in case of error */ - if (error && dirp) { - free(dirp); - dirp = NULL; - } - - return dirp; -} - -/* - * Read next directory entry. - * - * When working with text consoles, please note that file names - * returned by readdir() are represented in the default ANSI code - * page while any output toconsole is typically formatted on another - * code page. Thus, non-ASCII characters in file names will not usually - * display correctly on console. The problem can be fixed in two ways: - * (1) change the character set of console to 1252 using chcp utility - * and use Lucida Console font, or (2) use _cprintf function when - * writing to console. The _cprinf() will re-encode ANSI strings to the - * console code page so many non-ASCII characters will display correctly. - */ -static struct dirent* -readdir(DIR *dirp) -{ - WIN32_FIND_DATAW *datap; - struct dirent *entp; - - /* Read next directory entry */ - datap = dirent_next(dirp->wdirp); - if (datap) { - size_t n; - int error; - - /* Attempt to convert file name to multi-byte string */ - error = dirent_wcstombs_s(&n, dirp->ent.d_name, - PATH_MAX, datap->cFileName, PATH_MAX); - - /* - * If the file name cannot be represented by a multi-byte - * string, then attempt to use old 8+3 file name. - * This allows traditional Unix-code to access some file - * names despite of unicode characters, although file names - * may seem unfamiliar to the user. - * - * Be ware that the code below cannot come up with a short - * file name unless the file system provides one. At least - * VirtualBox shared folders fail to do this. - */ - if (error && datap->cAlternateFileName[0] != '\0') { - error = dirent_wcstombs_s( - &n, dirp->ent.d_name, PATH_MAX, - datap->cAlternateFileName, PATH_MAX); - } - - if (!error) { - DWORD attr; - - /* Initialize directory entry for return */ - entp = &dirp->ent; - - /* Length of file name excluding zero terminator */ - entp->d_namlen = n - 1; - - /* File attributes */ - attr = datap->dwFileAttributes; - if ((attr & FILE_ATTRIBUTE_DEVICE) != 0) - entp->d_type = DT_CHR; - else if ((attr & FILE_ATTRIBUTE_DIRECTORY) != 0) - entp->d_type = DT_DIR; - else - entp->d_type = DT_REG; - - /* Reset dummy fields */ - entp->d_ino = 0; - entp->d_reclen = sizeof(struct dirent); - - } else { - /* - * Cannot convert file name to multi-byte string so - * construct an erroneous directory entry and return - * that. Note that we cannot return NULL as that would - * stop the processing of directory entries completely. 
- */ - entp = &dirp->ent; - entp->d_name[0] = '?'; - entp->d_name[1] = '\0'; - entp->d_namlen = 1; - entp->d_type = DT_UNKNOWN; - entp->d_ino = 0; - entp->d_reclen = 0; - } - - } else { - /* No more directory entries */ - entp = NULL; - } - - return entp; -} - -/* - * Close directory stream. - */ -static int -closedir(DIR *dirp) -{ - int ok; - if (dirp) { - - /* Close wide-character directory stream */ - ok = _wclosedir(dirp->wdirp); - dirp->wdirp = NULL; - - /* Release multi-byte character version */ - free(dirp); - - } else { - - /* Invalid directory stream */ - dirent_set_errno(EBADF); - ok = /*failure*/-1; - - } - return ok; -} - -/* Convert multi-byte string to wide character string */ -static int -dirent_mbstowcs_s( - size_t *pReturnValue, - wchar_t *wcstr, - size_t sizeInWords, - const char *mbstr, - size_t count) -{ - int error; - - #if defined(_MSC_VER) && _MSC_VER >= 1400 - /* Microsoft Visual Studio 2005 or later */ - error = mbstowcs_s(pReturnValue, wcstr, - sizeInWords, mbstr, count); - #else - - /* Older Visual Studio or non-Microsoft compiler */ - size_t n; - - /* Convert to wide-character string (or count characters) */ - n = mbstowcs(wcstr, mbstr, sizeInWords); - if (!wcstr || n < count) { - - /* Zero-terminate output buffer */ - if (wcstr && sizeInWords) { - if (n >= sizeInWords) - n = sizeInWords - 1; - wcstr[n] = 0; - } - - /* Length of resuting multi-byte string WITH zero - *terminator - */ - if (pReturnValue) - *pReturnValue = n + 1; - - /* Success */ - error = 0; - - } else { - - /* Could not convert string */ - error = 1; - - } - #endif - - return error; -} - -/* Convert wide-character string to multi-byte string */ -static int -dirent_wcstombs_s( - size_t *pReturnValue, - char *mbstr, - size_t sizeInBytes, /* max size of mbstr */ - const wchar_t *wcstr, - size_t count) -{ - int error; - - #if defined(_MSC_VER) && _MSC_VER >= 1400 - /* Microsoft Visual Studio 2005 or later */ - error = wcstombs_s(pReturnValue, mbstr, sizeInBytes, wcstr, count); - #else - /* Older Visual Studio or non-Microsoft compiler */ - size_t n; - - /* Convert to multi-byte string - * (or count the number of bytes needed) - */ - n = wcstombs(mbstr, wcstr, sizeInBytes); - if (!mbstr || n < count) { - /* Zero-terminate output buffer */ - if (mbstr && sizeInBytes) { - if (n >= sizeInBytes) - n = sizeInBytes - 1; - mbstr[n] = '\0'; - } - /* Length of resulting multi-bytes string WITH - *zero-terminator - */ - if (pReturnValue) - *pReturnValue = n + 1; - /* Success */ - error = 0; - } else { - /* Cannot convert string */ - error = 1; - } - #endif - - return error; -} - -/* Set errno variable */ -static void -dirent_set_errno(int error) -{ -#if defined(_MSC_VER) && _MSC_VER >= 1400 - /* Microsoft Visual Studio 2005 and later */ - _set_errno(error); -#else - - /* Non-Microsoft compiler or older Microsoft compiler */ - errno = error; -#endif -} - -#ifdef __cplusplus -} -#endif -#endif /*DIRENT_H*/ diff --git a/lib/librte_eal/windows/eal/include/fnmatch.h b/lib/librte_eal/windows/eal/include/fnmatch.h deleted file mode 100644 index 41b574312c..0000000000 --- a/lib/librte_eal/windows/eal/include/fnmatch.h +++ /dev/null @@ -1,48 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2019 Intel Corporation - */ - -#ifndef _FNMATCH_H_ -#define _FNMATCH_H_ - -/** - * This file is required to support the common code in eal_common_log.c - * as Microsoft libc does not contain fnmatch.h. This may be removed in - * future releases. 
- */ -#ifdef __cplusplus -extern "C" { -#endif - -#define FNM_NOMATCH 1 - -/** - * This function is used for searhing a given string source - * with the given regular expression pattern. - * - * @param pattern - * regular expression notation decribing the pattern to match - * - * @param string - * source string to searcg for the pattern - * - * @param flag - * containing information about the pattern - * - * @return - * if the pattern is found then return 0 or else FNM_NOMATCH - */ -static inline int fnmatch(__rte_unused const char *pattern, - __rte_unused const char *string, - __rte_unused int flags) -{ - /* TODO */ - /* This is a stub, not the expected result */ - return FNM_NOMATCH; -} - -#ifdef __cplusplus -} -#endif - -#endif /* _FNMATCH_H_ */ diff --git a/lib/librte_eal/windows/eal/include/getopt.h b/lib/librte_eal/windows/eal/include/getopt.h deleted file mode 100644 index 6f57af454b..0000000000 --- a/lib/librte_eal/windows/eal/include/getopt.h +++ /dev/null @@ -1,96 +0,0 @@ -/* SPDX-License-Identifier: BSD-2-Clause - * Copyright (c) 2000 The NetBSD Foundation, Inc. - * All rights reserved. - * - * This code is derived from software contributed to The NetBSD Foundation - * by Dieter Baron and Thomas Klausner. - */ - -/** - * @file - * getopt compat. - * - * This module provides getopt() and getopt_long(). - */ - -#ifndef _USUAL_GETOPT_H_ -#define _USUAL_GETOPT_H_ - -#ifndef NEED_USUAL_GETOPT -#if !defined(HAVE_GETOPT_H) || !defined(HAVE_GETOPT) || \ - !defined(HAVE_GETOPT_LONG) -#define NEED_USUAL_GETOPT -#endif -#endif - -#ifndef NEED_USUAL_GETOPT - -/* Use system getopt */ -#ifdef RTE_TOOLCHAIN_GCC -#include_next -#else -#include -#endif - -#else /* NEED_USUAL_GETOPT */ - -/* avoid name collision */ -#define optarg usual_optarg -#define opterr usual_opterr -#define optind usual_optind -#define optopt usual_optopt -#define getopt(a, b, c) usual_getopt(a, b, c) -#define getopt_long(a, b, c, d, e) usual_getopt_long(a, b, c, d, e) - - -/** argument to current option, or NULL if it has none */ -extern const char *optarg; -/** Current position in arg string. Starts from 1. - * Setting to 0 resets state. - */ -extern int optind; -/** whether getopt() should print error messages on problems. Default: 1. */ -extern int opterr; -/** Option char which caused error */ -extern int optopt; - -/** long option takes no argument */ -#define no_argument 0 -/** long option requires argument */ -#define required_argument 1 -/** long option has optional argument */ -#define optional_argument 2 - -/** Long option description */ -struct option { - /** name of long option */ - const char *name; - - /** - * whether option takes an argument. - * One of no_argument, required_argument, and optional_argument. 
- */ - int has_arg; - - /** if not NULL, set *flag to val when option found */ - int *flag; - - /** if flag not NULL, value to set *flag to; else return value */ - int val; -}; - -/** Compat: getopt */ -int getopt(int argc, char *argv[], const char *options); - -/** Compat: getopt_long */ -int getopt_long(int argc, char *argv[], const char *options, - const struct option *longopts, int *longindex); - -/** Compat: getopt_long_only */ -int getopt_long_only(int nargc, char *argv[], const char *options, - const struct option *long_options, int *idx); - - -#endif /* NEED_USUAL_GETOPT */ - -#endif /* !_USUAL_GETOPT_H_ */ diff --git a/lib/librte_eal/windows/eal/include/pthread.h b/lib/librte_eal/windows/eal/include/pthread.h deleted file mode 100644 index b9dd18e568..0000000000 --- a/lib/librte_eal/windows/eal/include/pthread.h +++ /dev/null @@ -1,93 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2019 Intel Corporation - */ - -#ifndef _PTHREAD_H_ -#define _PTHREAD_H_ - -/** - * This file is required to support the common code in eal_common_proc.c, - * eal_common_thread.c and common\include\rte_per_lcore.h as Microsoft libc - * does not contain pthread.h. This may be removed in future releases. - */ -#ifdef __cplusplus -extern "C" { -#endif - -#include - -#define PTHREAD_BARRIER_SERIAL_THREAD TRUE - -/* defining pthread_t type on Windows since there is no in Microsoft libc*/ -typedef uintptr_t pthread_t; - -/* defining pthread_attr_t type on Windows since there is no in Microsoft libc*/ -typedef void *pthread_attr_t; - -typedef SYNCHRONIZATION_BARRIER pthread_barrier_t; - -#define pthread_barrier_init(barrier, attr, count) \ - InitializeSynchronizationBarrier(barrier, count, -1) -#define pthread_barrier_wait(barrier) EnterSynchronizationBarrier(barrier, \ - SYNCHRONIZATION_BARRIER_FLAGS_BLOCK_ONLY) -#define pthread_barrier_destroy(barrier) \ - DeleteSynchronizationBarrier(barrier) -#define pthread_cancel(thread) TerminateThread((HANDLE) thread, 0) - -/* pthread function overrides */ -#define pthread_self() \ - ((pthread_t)GetCurrentThreadId()) -#define pthread_setaffinity_np(thread, size, cpuset) \ - eal_set_thread_affinity_mask(thread, (unsigned long *) cpuset) -#define pthread_getaffinity_np(thread, size, cpuset) \ - eal_get_thread_affinity_mask(thread, (unsigned long *) cpuset) -#define pthread_create(threadid, threadattr, threadfunc, args) \ - eal_create_thread(threadid, threadfunc, args) - -static inline int -eal_set_thread_affinity_mask(pthread_t threadid, unsigned long *cpuset) -{ - SetThreadAffinityMask((HANDLE) threadid, *cpuset); - return 0; -} - -static inline int -eal_get_thread_affinity_mask(pthread_t threadid, unsigned long *cpuset) -{ - /* Workaround for the lack of a GetThreadAffinityMask() - *API in Windows - */ - /* obtain previous mask by setting dummy mask */ - DWORD dwprevaffinitymask = - SetThreadAffinityMask((HANDLE) threadid, 0x1); - /* set it back! */ - SetThreadAffinityMask((HANDLE) threadid, dwprevaffinitymask); - *cpuset = dwprevaffinitymask; - return 0; -} - -static inline int -eal_create_thread(void *threadid, void *threadfunc, void *args) -{ - HANDLE hThread; - hThread = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)threadfunc, - args, 0, (LPDWORD)threadid); - if (hThread) { - SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS); - SetThreadPriority(hThread, THREAD_PRIORITY_TIME_CRITICAL); - } - return ((hThread != NULL) ? 
0 : E_FAIL); -} - -static inline int -pthread_join(pthread_t thread __attribute__((__unused__)), - void **value_ptr __attribute__((__unused__))) -{ - return 0; -} - -#ifdef __cplusplus -} -#endif - -#endif /* _PTHREAD_H_ */ diff --git a/lib/librte_eal/windows/eal/include/regex.h b/lib/librte_eal/windows/eal/include/regex.h deleted file mode 100644 index 827f938414..0000000000 --- a/lib/librte_eal/windows/eal/include/regex.h +++ /dev/null @@ -1,90 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2019 Intel Corporation - */ - -#ifndef _REGEX_H_ -#define _REGEX_H_ - -/** - * This file is required to support the common code in eal_common_log.c - * as Microsoft libc does not contain regex.h. This may be removed in - * future releases. - */ -#ifdef __cplusplus -extern "C" { -#endif - -#define REG_NOMATCH 1 -#define REG_ESPACE 12 - -#include - -/* defining regex_t for Windows */ -typedef void *regex_t; -/* defining regmatch_t for Windows */ -typedef void *regmatch_t; - -/** - * The regcomp() function will compile the regular expression - * contained in the string pointed to by the pattern argument - * and place the results in the structure pointed to by preg. - * The cflags argument is the bitwise inclusive OR of zero or - * more of the flags - */ -static inline int regcomp(__rte_unused regex_t *preg, - __rte_unused const char *regex, __rte_unused int cflags) -{ - /* TODO */ - /* This is a stub, not the expected result */ - return REG_ESPACE; -} - -/** - * The regexec() function compares the null-terminated string - * specified by string with the compiled regular expression - * preg initialised by a previous call to regcomp(). If it finds - * a match, regexec() returns 0; otherwise it returns non-zero - * indicating either no match or an error. The eflags argument - * is the bitwise inclusive OR of zero or more of the flags. - */ -static inline int regexec(__rte_unused const regex_t *preg, - __rte_unused const char *string, __rte_unused size_t nmatch, - __rte_unused regmatch_t pmatch[], __rte_unused int eflags) -{ - /* TODO */ - /* This is a stub, not the expected result */ - return REG_NOMATCH; -} - -/** - * The regerror() function provides a mapping from error codes - * returned by regcomp() and regexec() to unspecified printable strings. - */ -static inline size_t regerror(__rte_unused int errcode, - __rte_unused const regex_t *preg, char *errbuf, - __rte_unused size_t errbuf_size) -{ - /* TODO */ - /* This is a stub, not the expected result */ - if (errbuf) { - *errbuf = '\0'; - return 1; - } - return 0; -} - -/** - * The regfree() function frees any memory allocated by regcomp() - * associated with preg. - */ -static inline void regfree(__rte_unused regex_t *preg) -{ - /* TODO */ - /* This is a stub, not the expected result */ -} - -#ifdef __cplusplus -} -#endif - -#endif /* _REGEX_H_ */ diff --git a/lib/librte_eal/windows/eal/include/rte_os.h b/lib/librte_eal/windows/eal/include/rte_os.h deleted file mode 100644 index e1e0378e6f..0000000000 --- a/lib/librte_eal/windows/eal/include/rte_os.h +++ /dev/null @@ -1,99 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2010-2019 Intel Corporation - */ - -#ifndef _RTE_OS_H_ -#define _RTE_OS_H_ - -/** - * This is header should contain any function/macro definition - * which are not supported natively or named differently in the - * Windows OS. Functions will be added in future releases. 
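The definitions that follow map POSIX names used by the common EAL sources onto their Microsoft libc equivalents (strerror_r, strdup, strtok_r, index/rindex, strncasecmp). A minimal sketch of the kind of common code this lets build unchanged on Windows (hypothetical caller, not part of this patch):

    #include <stdio.h>
    #include <string.h>
    #include <rte_os.h>    /* strerror_r() expands to strerror_s() here */

    static void
    log_last_error(int err)
    {
        char msg[128];

        /* with this header: strerror_s(msg, sizeof(msg), err) */
        strerror_r(err, msg, sizeof(msg));
        printf("error %d: %s\n", err, msg);
    }
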
- */ - -#ifdef __cplusplus -extern "C" { -#endif - -#include -#include -#include -#include - -/* limits.h replacement */ -#include -#ifndef PATH_MAX -#define PATH_MAX _MAX_PATH -#endif - -#define strerror_r(a, b, c) strerror_s(b, c, a) - -/* strdup is deprecated in Microsoft libc and _strdup is preferred */ -#define strdup(str) _strdup(str) - -typedef SSIZE_T ssize_t; - -#define strtok_r(str, delim, saveptr) strtok_s(str, delim, saveptr) - -#define index(a, b) strchr(a, b) -#define rindex(a, b) strrchr(a, b) - -#define strncasecmp(s1, s2, count) _strnicmp(s1, s2, count) - -/** - * Create a thread. - * This function is private to EAL. - * - * @param thread - * The location to store the thread id if successful. - * @return - * 0 for success, -1 if the thread is not created. - */ -int eal_thread_create(pthread_t *thread); - -/** - * Create a map of processors and cores on the system. - * This function is private to EAL. - */ -void eal_create_cpu_map(void); - -#ifndef RTE_TOOLCHAIN_GCC -static inline int -asprintf(char **buffer, const char *format, ...) -{ - int size, ret; - va_list arg; - - va_start(arg, format); - size = vsnprintf(NULL, 0, format, arg); - va_end(arg); - if (size < 0) - return -1; - size++; - - *buffer = malloc(size); - if (*buffer == NULL) - return -1; - - va_start(arg, format); - ret = vsnprintf(*buffer, size, format, arg); - va_end(arg); - if (ret != size - 1) { - free(*buffer); - return -1; - } - return ret; -} -#endif /* RTE_TOOLCHAIN_GCC */ - -/* cpu_set macros implementation */ -#define RTE_CPU_AND(dst, src1, src2) CPU_AND(dst, src1, src2) -#define RTE_CPU_OR(dst, src1, src2) CPU_OR(dst, src1, src2) -#define RTE_CPU_FILL(set) CPU_FILL(set) -#define RTE_CPU_NOT(dst, src) CPU_NOT(dst, src) - -#ifdef __cplusplus -} -#endif - -#endif /* _RTE_OS_H_ */ diff --git a/lib/librte_eal/windows/eal/include/sched.h b/lib/librte_eal/windows/eal/include/sched.h deleted file mode 100644 index fbe07f742c..0000000000 --- a/lib/librte_eal/windows/eal/include/sched.h +++ /dev/null @@ -1,92 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2019 Intel Corporation - */ - -#ifndef _SCHED_H_ -#define _SCHED_H_ - -/** - * This file is added to support the common code in eal_common_thread.c - * as Microsoft libc does not contain sched.h. This may be removed - * in future releases. 
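The rte_cpuset_t type and CPU_* macros defined just below give the common thread code a small cpu_set_t work-alike, sized by RTE_MAX_LCORE. A usage sketch under that assumption (hypothetical, not part of this patch):

    #include <stdio.h>
    #include <sched.h>      /* this shim header */

    static void
    pick_lcores(void)
    {
        rte_cpuset_t set;

        CPU_ZERO(&set);          /* clear every bit */
        CPU_SET(2, &set);        /* select lcore 2 */
        CPU_SET(5, &set);        /* select lcore 5 */
        if (CPU_ISSET(2, &set))
            printf("%d lcores selected\n", CPU_COUNT(&set));
    }
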
- */ -#ifdef __cplusplus -extern "C" { -#endif - -#ifndef CPU_SETSIZE -#define CPU_SETSIZE RTE_MAX_LCORE -#endif - -#define _BITS_PER_SET (sizeof(long long) * 8) -#define _BIT_SET_MASK (_BITS_PER_SET - 1) - -#define _NUM_SETS(b) (((b) + _BIT_SET_MASK) / _BITS_PER_SET) -#define _WHICH_SET(b) ((b) / _BITS_PER_SET) -#define _WHICH_BIT(b) ((b) & (_BITS_PER_SET - 1)) - -typedef struct _rte_cpuset_s { - long long _bits[_NUM_SETS(CPU_SETSIZE)]; -} rte_cpuset_t; - -#define CPU_SET(b, s) ((s)->_bits[_WHICH_SET(b)] |= (1LL << _WHICH_BIT(b))) - -#define CPU_ZERO(s) \ - do { \ - unsigned int _i; \ - \ - for (_i = 0; _i < _NUM_SETS(CPU_SETSIZE); _i++) \ - (s)->_bits[_i] = 0LL; \ - } while (0) - -#define CPU_ISSET(b, s) (((s)->_bits[_WHICH_SET(b)] & \ - (1LL << _WHICH_BIT(b))) != 0LL) - -static inline int -count_cpu(rte_cpuset_t *s) -{ - unsigned int _i; - int count = 0; - - for (_i = 0; _i < _NUM_SETS(CPU_SETSIZE); _i++) - if (CPU_ISSET(_i, s) != 0LL) - count++; - return count; -} -#define CPU_COUNT(s) count_cpu(s) - -#define CPU_AND(dst, src1, src2) \ -do { \ - unsigned int _i; \ - \ - for (_i = 0; _i < _NUM_SETS(CPU_SETSIZE); _i++) \ - (dst)->_bits[_i] = (src1)->_bits[_i] & (src2)->_bits[_i]; \ -} while (0) - -#define CPU_OR(dst, src1, src2) \ -do { \ - unsigned int _i; \ - \ - for (_i = 0; _i < _NUM_SETS(CPU_SETSIZE); _i++) \ - (dst)->_bits[_i] = (src1)->_bits[_i] | (src2)->_bits[_i]; \ -} while (0) - -#define CPU_FILL(s) \ -do { \ - unsigned int _i; \ - for (_i = 0; _i < _NUM_SETS(CPU_SETSIZE); _i++) \ - (s)->_bits[_i] = -1LL; \ -} while (0) - -#define CPU_NOT(dst, src) \ -do { \ - unsigned int _i; \ - for (_i = 0; _i < _NUM_SETS(CPU_SETSIZE); _i++) \ - (dst)->_bits[_i] = (src)->_bits[_i] ^ -1LL; \ -} while (0) - -#ifdef __cplusplus -} -#endif - -#endif /* _SCHED_H_ */ diff --git a/lib/librte_eal/windows/eal/include/sys/queue.h b/lib/librte_eal/windows/eal/include/sys/queue.h deleted file mode 100644 index a65949a78a..0000000000 --- a/lib/librte_eal/windows/eal/include/sys/queue.h +++ /dev/null @@ -1,302 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * - * Copyright (c) 1991, 1993 - * The Regents of the University of California. All rights reserved. - */ - -#ifndef _SYS_QUEUE_H_ -#define _SYS_QUEUE_H_ - -/* - * This file defines tail queues. - * - * A tail queue is headed by a pair of pointers, one to the head of the - * list and the other to the tail of the list. The elements are doubly - * linked so that an arbitrary element can be removed without a need to - * traverse the list. New elements can be added to the list before or - * after an existing element, at the head of the list, or at the end of - * the list. A tail queue may be traversed in either direction. - * - * Below is a summary of implemented functions where: - * + means the macro is available - * - means the macro is not available - * s means the macro is available but is slow (runs in O(n) time) - * - * TAILQ - * _HEAD + - * _CLASS_HEAD + - * _HEAD_INITIALIZER + - * _ENTRY + - * _CLASS_ENTRY + - * _INIT + - * _EMPTY + - * _FIRST + - * _NEXT + - * _PREV + - * _LAST + - * _LAST_FAST + - * _FOREACH + - * _FOREACH_FROM + - * _FOREACH_SAFE + - * _FOREACH_FROM_SAFE + - * _FOREACH_REVERSE + - * _FOREACH_REVERSE_FROM + - * _FOREACH_REVERSE_SAFE + - * _FOREACH_REVERSE_FROM_SAFE + - * _INSERT_HEAD + - * _INSERT_BEFORE + - * _INSERT_AFTER + - * _INSERT_TAIL + - * _CONCAT + - * _REMOVE_AFTER - - * _REMOVE_HEAD - - * _REMOVE + - * _SWAP + - * - */ - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * List definitions. 
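A minimal sketch of the tail-queue interface summarized above (hypothetical element type, not part of this patch):

    #include <stdio.h>
    #include <sys/queue.h>   /* this header */

    struct item {
        int value;
        TAILQ_ENTRY(item) next;              /* embedded linkage */
    };
    TAILQ_HEAD(item_list, item);

    static void
    walk(void)
    {
        struct item_list head = TAILQ_HEAD_INITIALIZER(head);
        struct item a = { .value = 1 }, b = { .value = 2 };
        struct item *it;

        TAILQ_INSERT_TAIL(&head, &a, next);
        TAILQ_INSERT_HEAD(&head, &b, next);  /* b now precedes a */
        TAILQ_FOREACH(it, &head, next)
            printf("%d\n", it->value);       /* prints 2 then 1 */
        TAILQ_REMOVE(&head, &a, next);
    }
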
- */ -#define LIST_HEAD(name, type) \ -struct name { \ - struct type *lh_first; /* first element */ \ -} - -#define QMD_TRACE_ELEM(elem) -#define QMD_TRACE_HEAD(head) -#define TRACEBUF -#define TRACEBUF_INITIALIZER - -#define TRASHIT(x) -#define QMD_IS_TRASHED(x) 0 - -#define QMD_SAVELINK(name, link) - -#ifdef __cplusplus -/* - * In C++ there can be structure lists and class lists: - */ -#define QUEUE_TYPEOF(type) type -#else -#define QUEUE_TYPEOF(type) struct type -#endif - -/* - * Tail queue declarations. - */ -#define TAILQ_HEAD(name, type) \ -struct name { \ - struct type *tqh_first; /* first element */ \ - struct type **tqh_last; /* addr of last next element */ \ - TRACEBUF \ -} - -#define TAILQ_CLASS_HEAD(name, type) \ -struct name { \ - class type *tqh_first; /* first element */ \ - class type **tqh_last; /* addr of last next element */ \ - TRACEBUF \ -} - -#define TAILQ_HEAD_INITIALIZER(head) \ - { NULL, &(head).tqh_first, TRACEBUF_INITIALIZER } - -#define TAILQ_ENTRY(type) \ -struct { \ - struct type *tqe_next; /* next element */ \ - struct type **tqe_prev; /* address of previous next element */ \ - TRACEBUF \ -} - -#define TAILQ_CLASS_ENTRY(type) \ -struct { \ - class type *tqe_next; /* next element */ \ - class type **tqe_prev; /* address of previous next element */ \ - TRACEBUF \ -} - -/* - * Tail queue functions. - */ -#define QMD_TAILQ_CHECK_HEAD(head, field) -#define QMD_TAILQ_CHECK_TAIL(head, headname) -#define QMD_TAILQ_CHECK_NEXT(elm, field) -#define QMD_TAILQ_CHECK_PREV(elm, field) - -#define TAILQ_CONCAT(head1, head2, field) do { \ - if (!TAILQ_EMPTY(head2)) { \ - *(head1)->tqh_last = (head2)->tqh_first; \ - (head2)->tqh_first->field.tqe_prev = (head1)->tqh_last; \ - (head1)->tqh_last = (head2)->tqh_last; \ - TAILQ_INIT((head2)); \ - QMD_TRACE_HEAD(head1); \ - QMD_TRACE_HEAD(head2); \ - } \ -} while (0) - -#define TAILQ_EMPTY(head) ((head)->tqh_first == NULL) - -#define TAILQ_FIRST(head) ((head)->tqh_first) - -#define TAILQ_FOREACH(var, head, field) \ - for ((var) = TAILQ_FIRST((head)); \ - (var); \ - (var) = TAILQ_NEXT((var), field)) - -#define TAILQ_FOREACH_FROM(var, head, field) \ - for ((var) = ((var) ? (var) : TAILQ_FIRST((head))); \ - (var); \ - (var) = TAILQ_NEXT((var), field)) - -#define TAILQ_FOREACH_SAFE(var, head, field, tvar) \ - for ((var) = TAILQ_FIRST((head)); \ - (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ - (var) = (tvar)) - -#define TAILQ_FOREACH_FROM_SAFE(var, head, field, tvar) \ - for ((var) = ((var) ? (var) : TAILQ_FIRST((head))); \ - (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ - (var) = (tvar)) - -#define TAILQ_FOREACH_REVERSE(var, head, headname, field) \ - for ((var) = TAILQ_LAST((head), headname); \ - (var); \ - (var) = TAILQ_PREV((var), headname, field)) - -#define TAILQ_FOREACH_REVERSE_FROM(var, head, headname, field) \ - for ((var) = ((var) ? (var) : TAILQ_LAST((head), headname)); \ - (var); \ - (var) = TAILQ_PREV((var), headname, field)) - -#define TAILQ_FOREACH_REVERSE_SAFE(var, head, headname, field, tvar) \ - for ((var) = TAILQ_LAST((head), headname); \ - (var) && ((tvar) = TAILQ_PREV((var), headname, field), 1); \ - (var) = (tvar)) - -#define TAILQ_FOREACH_REVERSE_FROM_SAFE(var, head, headname, field, tvar) \ - for ((var) = ((var) ? 
(var) : TAILQ_LAST((head), headname)); \ - (var) && ((tvar) = TAILQ_PREV((var), headname, field), 1); \ - (var) = (tvar)) - -#define TAILQ_INIT(head) do { \ - TAILQ_FIRST((head)) = NULL; \ - (head)->tqh_last = &TAILQ_FIRST((head)); \ - QMD_TRACE_HEAD(head); \ -} while (0) - -#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \ - QMD_TAILQ_CHECK_NEXT(listelm, field); \ - TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field); \ - if (TAILQ_NEXT((listelm), field) != NULL) \ - TAILQ_NEXT((elm), field)->field.tqe_prev = \ - &TAILQ_NEXT((elm), field); \ - else { \ - (head)->tqh_last = &TAILQ_NEXT((elm), field); \ - QMD_TRACE_HEAD(head); \ - } \ - TAILQ_NEXT((listelm), field) = (elm); \ - (elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field); \ - QMD_TRACE_ELEM(&(elm)->field); \ - QMD_TRACE_ELEM(&(listelm)->field); \ -} while (0) - -#define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \ - QMD_TAILQ_CHECK_PREV(listelm, field); \ - (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \ - TAILQ_NEXT((elm), field) = (listelm); \ - *(listelm)->field.tqe_prev = (elm); \ - (listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field); \ - QMD_TRACE_ELEM(&(elm)->field); \ - QMD_TRACE_ELEM(&(listelm)->field); \ -} while (0) - -#define TAILQ_INSERT_HEAD(head, elm, field) do { \ - QMD_TAILQ_CHECK_HEAD(head, field); \ - TAILQ_NEXT((elm), field) = TAILQ_FIRST((head)); \ - if (TAILQ_FIRST((head)) != NULL) \ - TAILQ_FIRST((head))->field.tqe_prev = \ - &TAILQ_NEXT((elm), field); \ - else \ - (head)->tqh_last = &TAILQ_NEXT((elm), field); \ - TAILQ_FIRST((head)) = (elm); \ - (elm)->field.tqe_prev = &TAILQ_FIRST((head)); \ - QMD_TRACE_HEAD(head); \ - QMD_TRACE_ELEM(&(elm)->field); \ -} while (0) - -#define TAILQ_INSERT_TAIL(head, elm, field) do { \ - QMD_TAILQ_CHECK_TAIL(head, field); \ - TAILQ_NEXT((elm), field) = NULL; \ - (elm)->field.tqe_prev = (head)->tqh_last; \ - *(head)->tqh_last = (elm); \ - (head)->tqh_last = &TAILQ_NEXT((elm), field); \ - QMD_TRACE_HEAD(head); \ - QMD_TRACE_ELEM(&(elm)->field); \ -} while (0) - -#define TAILQ_LAST(head, headname) \ - (*(((struct headname *)((head)->tqh_last))->tqh_last)) - -/* - * The FAST function is fast in that it causes no data access other - * then the access to the head. The standard LAST function above - * will cause a data access of both the element you want and - * the previous element. FAST is very useful for instances when - * you may want to prefetch the last data element. - */ -#define TAILQ_LAST_FAST(head, type, field) \ - (TAILQ_EMPTY(head) ? 
NULL : __containerof((head)->tqh_last, \ - QUEUE_TYPEOF(type), field.tqe_next)) - -#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next) - -#define TAILQ_PREV(elm, headname, field) \ - (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last)) - -#define TAILQ_REMOVE(head, elm, field) do { \ - QMD_SAVELINK(oldnext, (elm)->field.tqe_next); \ - QMD_SAVELINK(oldprev, (elm)->field.tqe_prev); \ - QMD_TAILQ_CHECK_NEXT(elm, field); \ - QMD_TAILQ_CHECK_PREV(elm, field); \ - if ((TAILQ_NEXT((elm), field)) != NULL) \ - TAILQ_NEXT((elm), field)->field.tqe_prev = \ - (elm)->field.tqe_prev; \ - else { \ - (head)->tqh_last = (elm)->field.tqe_prev; \ - QMD_TRACE_HEAD(head); \ - } \ - *(elm)->field.tqe_prev = TAILQ_NEXT((elm), field); \ - TRASHIT(*oldnext); \ - TRASHIT(*oldprev); \ - QMD_TRACE_ELEM(&(elm)->field); \ -} while (0) - -#define TAILQ_SWAP(head1, head2, type, field) do { \ - QUEUE_TYPEOF(type) * swap_first = (head1)->tqh_first; \ - QUEUE_TYPEOF(type) * *swap_last = (head1)->tqh_last; \ - (head1)->tqh_first = (head2)->tqh_first; \ - (head1)->tqh_last = (head2)->tqh_last; \ - (head2)->tqh_first = swap_first; \ - (head2)->tqh_last = swap_last; \ - swap_first = (head1)->tqh_first; \ - if (swap_first != NULL) \ - swap_first->field.tqe_prev = &(head1)->tqh_first; \ - else \ - (head1)->tqh_last = &(head1)->tqh_first; \ - swap_first = (head2)->tqh_first; \ - if (swap_first != NULL) \ - swap_first->field.tqe_prev = &(head2)->tqh_first; \ - else \ - (head2)->tqh_last = &(head2)->tqh_first; \ -} while (0) - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_QUEUE_H_ */ diff --git a/lib/librte_eal/windows/eal/include/unistd.h b/lib/librte_eal/windows/eal/include/unistd.h deleted file mode 100644 index 757b7f3c57..0000000000 --- a/lib/librte_eal/windows/eal/include/unistd.h +++ /dev/null @@ -1,12 +0,0 @@ -/* SPDX-License-Identifier: BSD-3-Clause - * Copyright(c) 2019 Intel Corporation - */ - -#ifndef _UNISTD_H_ -#define _UNISTD_H_ -/** - * This file is added to support common code in eal_common_lcore.c - * as Microsoft libc does not contain unistd.h. This may be removed - * in future releases. - */ -#endif /* _UNISTD_H_ */ diff --git a/lib/librte_eal/windows/eal/meson.build b/lib/librte_eal/windows/eal/meson.build deleted file mode 100644 index 46ccdb3343..0000000000 --- a/lib/librte_eal/windows/eal/meson.build +++ /dev/null @@ -1,15 +0,0 @@ -# SPDX-License-Identifier: BSD-3-Clause -# Copyright(c) 2019 Intel Corporation - -eal_inc += include_directories('include') - -env_objs = [] -env_headers = files( - 'include/rte_os.h', -) -env_sources = files('eal.c', - 'eal_debug.c', - 'eal_lcore.c', - 'eal_thread.c', - 'getopt.c', -) diff --git a/lib/librte_eal/windows/eal_debug.c b/lib/librte_eal/windows/eal_debug.c new file mode 100644 index 0000000000..669be6ff97 --- /dev/null +++ b/lib/librte_eal/windows/eal_debug.c @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Intel Corporation + */ + +#include +#include +#include + + /* call abort(), it will generate a coredump if enabled */ +void +__rte_panic(const char *funcname, const char *format, ...) 
+{ + va_list ap; + + rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, "PANIC in %s():\n", funcname); + va_start(ap, format); + rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap); + va_end(ap); + abort(); +} diff --git a/lib/librte_eal/windows/eal_lcore.c b/lib/librte_eal/windows/eal_lcore.c new file mode 100644 index 0000000000..b3a6c63afa --- /dev/null +++ b/lib/librte_eal/windows/eal_lcore.c @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Intel Corporation + */ + +#include + +#include + +#include "eal_private.h" +#include "eal_thread.h" + +/* global data structure that contains the CPU map */ +static struct _wcpu_map { + unsigned int total_procs; + unsigned int proc_sockets; + unsigned int proc_cores; + unsigned int reserved; + struct _win_lcore_map { + uint8_t socket_id; + uint8_t core_id; + } wlcore_map[RTE_MAX_LCORE]; +} wcpu_map = { 0 }; + +/* + * Create a map of all processors and associated cores on the system + */ +void +eal_create_cpu_map() +{ + wcpu_map.total_procs = + GetActiveProcessorCount(ALL_PROCESSOR_GROUPS); + + LOGICAL_PROCESSOR_RELATIONSHIP lprocRel; + DWORD lprocInfoSize = 0; + BOOL ht_enabled = FALSE; + + /* First get the processor package information */ + lprocRel = RelationProcessorPackage; + /* Determine the size of buffer we need (pass NULL) */ + GetLogicalProcessorInformationEx(lprocRel, NULL, &lprocInfoSize); + wcpu_map.proc_sockets = lprocInfoSize / 48; + + lprocInfoSize = 0; + /* Next get the processor core information */ + lprocRel = RelationProcessorCore; + GetLogicalProcessorInformationEx(lprocRel, NULL, &lprocInfoSize); + wcpu_map.proc_cores = lprocInfoSize / 48; + + if (wcpu_map.total_procs > wcpu_map.proc_cores) + ht_enabled = TRUE; + + /* Distribute the socket and core ids appropriately + * across the logical cores. For now, split the cores + * equally across the sockets. 
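Under this scheme each physical core yields one logical core, or two consecutive logical cores when hyper-threading was detected. For a hypothetical box with total_procs = 8, proc_sockets = 2 and proc_cores = 4 (hyper-threading on, since total_procs > proc_cores), the loop below would fill the map as:

    lcore 0, 1  ->  socket 0, core 0
    lcore 2, 3  ->  socket 0, core 1
    lcore 4, 5  ->  socket 1, core 0
    lcore 6, 7  ->  socket 1, core 1
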
+ */ + unsigned int lcore = 0; + for (unsigned int socket = 0; socket < + wcpu_map.proc_sockets; ++socket) { + for (unsigned int core = 0; + core < (wcpu_map.proc_cores / wcpu_map.proc_sockets); + ++core) { + wcpu_map.wlcore_map[lcore] + .socket_id = socket; + wcpu_map.wlcore_map[lcore] + .core_id = core; + lcore++; + if (ht_enabled) { + wcpu_map.wlcore_map[lcore] + .socket_id = socket; + wcpu_map.wlcore_map[lcore] + .core_id = core; + lcore++; + } + } + } +} + +/* + * Check if a cpu is present by the presence of the cpu information for it + */ +int +eal_cpu_detected(unsigned int lcore_id) +{ + return (lcore_id < wcpu_map.total_procs); +} + +/* + * Get CPU socket id for a logical core + */ +unsigned +eal_cpu_socket_id(unsigned int lcore_id) +{ + return wcpu_map.wlcore_map[lcore_id].socket_id; +} + +/* + * Get CPU socket id (NUMA node) for a logical core + */ +unsigned +eal_cpu_core_id(unsigned int lcore_id) +{ + return wcpu_map.wlcore_map[lcore_id].core_id; +} diff --git a/lib/librte_eal/windows/eal_thread.c b/lib/librte_eal/windows/eal_thread.c new file mode 100644 index 0000000000..9e4bbaa082 --- /dev/null +++ b/lib/librte_eal/windows/eal_thread.c @@ -0,0 +1,165 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Intel Corporation + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "eal_private.h" + +RTE_DEFINE_PER_LCORE(unsigned int, _lcore_id) = LCORE_ID_ANY; +RTE_DEFINE_PER_LCORE(unsigned int, _socket_id) = (unsigned int)SOCKET_ID_ANY; +RTE_DEFINE_PER_LCORE(rte_cpuset_t, _cpuset); + +/* + * Send a message to a slave lcore identified by slave_id to call a + * function f with argument arg. Once the execution is done, the + * remote lcore switch in FINISHED state. + */ +int +rte_eal_remote_launch(lcore_function_t *f, void *arg, unsigned int slave_id) +{ + int n; + char c = 0; + int m2s = lcore_config[slave_id].pipe_master2slave[1]; + int s2m = lcore_config[slave_id].pipe_slave2master[0]; + + if (lcore_config[slave_id].state != WAIT) + return -EBUSY; + + lcore_config[slave_id].f = f; + lcore_config[slave_id].arg = arg; + + /* send message */ + n = 0; + while (n == 0 || (n < 0 && errno == EINTR)) + n = _write(m2s, &c, 1); + if (n < 0) + rte_panic("cannot write on configuration pipe\n"); + + /* wait ack */ + do { + n = _read(s2m, &c, 1); + } while (n < 0 && errno == EINTR); + + if (n <= 0) + rte_panic("cannot read on configuration pipe\n"); + + return 0; +} + +void +eal_thread_init_master(unsigned int lcore_id) +{ + /* set the lcore ID in per-lcore memory area */ + RTE_PER_LCORE(_lcore_id) = lcore_id; +} + +static inline pthread_t +eal_thread_self(void) +{ + return GetCurrentThreadId(); +} + +/* main loop of threads */ +void * +eal_thread_loop(void *arg __rte_unused) +{ + char c; + int n, ret; + unsigned int lcore_id; + pthread_t thread_id; + int m2s, s2m; + char cpuset[RTE_CPU_AFFINITY_STR_LEN]; + + thread_id = eal_thread_self(); + + /* retrieve our lcore_id from the configuration structure */ + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + if (thread_id == lcore_config[lcore_id].thread_id) + break; + } + if (lcore_id == RTE_MAX_LCORE) + rte_panic("cannot retrieve lcore id\n"); + + m2s = lcore_config[lcore_id].pipe_master2slave[0]; + s2m = lcore_config[lcore_id].pipe_slave2master[1]; + + /* set the lcore ID in per-lcore memory area */ + RTE_PER_LCORE(_lcore_id) = lcore_id; + + RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%zx;cpuset=[%s])\n", + lcore_id, (uintptr_t)thread_id, cpuset); + + /* read on our pipe to get commands */ 
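The pipe protocol implemented by rte_eal_remote_launch() and this loop is driven from application code through the usual launch API: the master writes a byte on the master-to-slave pipe and waits for the acknowledgement, then the worker runs the function, records the return value and moves to FINISHED (or back to WAIT for service cores). A minimal caller, sketched as hypothetical application code rather than part of this patch:

    static int
    worker(void *arg)
    {
        (void)arg;
        printf("hello from lcore %u\n", rte_lcore_id());
        return 0;
    }

    /* in main(), after rte_eal_init() */
    unsigned int lcore_id;

    RTE_LCORE_FOREACH_SLAVE(lcore_id)
        rte_eal_remote_launch(worker, NULL, lcore_id);  /* kicks the m2s pipe */
    rte_eal_mp_wait_lcore();                            /* waits for FINISHED */
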
+ while (1) { + void *fct_arg; + + /* wait command */ + do { + n = _read(m2s, &c, 1); + } while (n < 0 && errno == EINTR); + + if (n <= 0) + rte_panic("cannot read on configuration pipe\n"); + + lcore_config[lcore_id].state = RUNNING; + + /* send ack */ + n = 0; + while (n == 0 || (n < 0 && errno == EINTR)) + n = _write(s2m, &c, 1); + if (n < 0) + rte_panic("cannot write on configuration pipe\n"); + + if (lcore_config[lcore_id].f == NULL) + rte_panic("NULL function pointer\n"); + + /* call the function and store the return value */ + fct_arg = lcore_config[lcore_id].arg; + ret = lcore_config[lcore_id].f(fct_arg); + lcore_config[lcore_id].ret = ret; + rte_wmb(); + + /* when a service core returns, it should go directly to WAIT + * state, because the application will not lcore_wait() for it. + */ + if (lcore_config[lcore_id].core_role == ROLE_SERVICE) + lcore_config[lcore_id].state = WAIT; + else + lcore_config[lcore_id].state = FINISHED; + } +} + +/* function to create threads */ +int +eal_thread_create(pthread_t *thread) +{ + HANDLE th; + + th = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)eal_thread_loop, + NULL, 0, (LPDWORD)thread); + if (!th) + return -1; + + SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS); + SetThreadPriority(th, THREAD_PRIORITY_TIME_CRITICAL); + + return 0; +} + +int +rte_thread_setname(__rte_unused pthread_t id, __rte_unused const char *name) +{ + /* TODO */ + /* This is a stub, not the expected result */ + return 0; +} diff --git a/lib/librte_eal/windows/getopt.c b/lib/librte_eal/windows/getopt.c new file mode 100644 index 0000000000..170c9b5e0c --- /dev/null +++ b/lib/librte_eal/windows/getopt.c @@ -0,0 +1,470 @@ +/* SPDX-License-Identifier: ISC AND BSD-2-Clause + * Copyright (c) 2002 Todd C. Miller + * + * Sponsored in part by the Defense Advanced Research Projects + * Agency (DARPA) and Air Force Research Laboratory, Air Force + * Materiel Command, USAF, under agreement number F39502-99-1-0512. + */ +/* + * Copyright (c) 2000 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Dieter Baron and Thomas Klausner. + */ + +#include + +#ifdef NEED_USUAL_GETOPT + +#include +#include + +const char *optarg; /* argument associated with option */ +int opterr = 1; /* if error message should be printed */ +int optind = 1; /* index into parent argv vector */ +int optopt = '?'; /* character checked for validity */ + +static void pass(void) {} +#define warnx(a, ...) pass() + +#define PRINT_ERROR ((opterr) && (*options != ':')) + +#define FLAG_PERMUTE 0x01 /* permute non-options to the end of argv */ +#define FLAG_ALLARGS 0x02 /* treat non-options as args to option "-1" */ +#define FLAG_LONGONLY 0x04 /* operate as getopt_long_only */ + +/* return values */ +#define BADCH ((int)'?') +#define BADARG ((*options == ':') ? 
(int)':' : (int)'?') +#define INORDER 1 + +#define EMSG "" + +static const char *place = EMSG; /* option letter processing */ + +/* XXX: set optreset to 1 rather than these two */ +static int nonopt_start = -1; /* first non option argument (for permute) */ +static int nonopt_end = -1; /* first option after non options (for permute) */ + +/* Error messages */ +static const char recargchar[] = "option requires an argument -- %c"; +static const char recargstring[] = "option requires an argument -- %s"; +static const char ambig[] = "ambiguous option -- %.*s"; +static const char noarg[] = "option doesn't take an argument -- %.*s"; +static const char illoptchar[] = "unknown option -- %c"; +static const char illoptstring[] = "unknown option -- %s"; + +/* + * Compute the greatest common divisor of a and b. + */ +static int +gcd(int a, int b) +{ + int c; + + c = a % b; + while (c != 0) { + a = b; + b = c; + c = a % b; + } + + return (b); +} + +/* + * Exchange the block from nonopt_start to nonopt_end with the block + * from nonopt_end to opt_end (keeping the same order of arguments + * in each block). + */ +static void +permute_args(int panonopt_start, int panonopt_end, int opt_end, + char **nargv) +{ + int cstart, cyclelen, i, j, ncycle, nnonopts, nopts, pos; + char *swap; + + /* + * compute lengths of blocks and number and size of cycles + */ + nnonopts = panonopt_end - panonopt_start; + nopts = opt_end - panonopt_end; + ncycle = gcd(nnonopts, nopts); + cyclelen = (opt_end - panonopt_start) / ncycle; + + for (i = 0; i < ncycle; i++) { + cstart = panonopt_end+i; + pos = cstart; + for (j = 0; j < cyclelen; j++) { + if (pos >= panonopt_end) + pos -= nnonopts; + else + pos += nopts; + swap = nargv[pos]; + /* LINTED const cast */ + ((char **) nargv)[pos] = nargv[cstart]; + /* LINTED const cast */ + ((char **)nargv)[cstart] = swap; + } + } +} + +/* + * parse_long_options -- + * Parse long options in argc/argv argument vector. + * Returns -1 if short_too is set and the option does not match long_options. + */ +static int +parse_long_options(char **nargv, const char *options, + const struct option *long_options, int *idx, int short_too) +{ + const char *current_argv; + char *has_equal; + size_t current_argv_len; + int i, match; + + current_argv = place; + match = -1; + + optind++; + + has_equal = strchr(current_argv, '='); + if (has_equal != NULL) { + /* argument found (--option=arg) */ + current_argv_len = has_equal - current_argv; + has_equal++; + } else + current_argv_len = strlen(current_argv); + + for (i = 0; long_options[i].name; i++) { + /* find matching long option */ + if (strncmp(current_argv, long_options[i].name, + current_argv_len)) + continue; + + if (strlen(long_options[i].name) == current_argv_len) { + /* exact match */ + match = i; + break; + } + /* + * If this is a known short option, don't allow + * a partial match of a single character. 
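To make the long-option matching rules concrete, assume a table containing "log-level" and "log-dir", both with required_argument (hypothetical names, not part of this patch). Then, roughly:

    --log-level=8     exact match; optarg = "8"
    --log-d debug     unambiguous prefix of "log-dir"; optarg = "debug"
    --log-            prefix of both entries; treated as ambiguous, '?' is returned
    --verbose         no match; treated as unknown, '?' is returned

Note that warnx() is stubbed out above, so no message is actually printed for the error cases in this port.
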
+ */ + if (short_too && current_argv_len == 1) + continue; + + if (match == -1) /* partial match */ + match = i; + else { + /* ambiguous abbreviation */ + if (PRINT_ERROR) + warnx(ambig, (int)current_argv_len, + current_argv); + optopt = 0; + return BADCH; + } + } + if (match != -1) { /* option found */ + if (long_options[match].has_arg == no_argument + && has_equal) { + if (PRINT_ERROR) + warnx(noarg, (int)current_argv_len, + current_argv); + /* + * XXX: GNU sets optopt to val regardless of flag + */ + if (long_options[match].flag == NULL) + optopt = long_options[match].val; + else + optopt = 0; + return BADARG; + } + if (long_options[match].has_arg == required_argument || + long_options[match].has_arg == optional_argument) { + if (has_equal) + optarg = has_equal; + else if (long_options[match].has_arg == + required_argument) { + /* + * optional argument doesn't use next nargv + */ + optarg = nargv[optind++]; + } + } + if ((long_options[match].has_arg == required_argument) + && (optarg == NULL)) { + /* + * Missing argument; leading ':' indicates no error + * should be generated. + */ + if (PRINT_ERROR) + warnx(recargstring, + current_argv); + /* + * XXX: GNU sets optopt to val regardless of flag + */ + if (long_options[match].flag == NULL) + optopt = long_options[match].val; + else + optopt = 0; + --optind; + return BADARG; + } + } else { /* unknown option */ + if (short_too) { + --optind; + return (-1); + } + if (PRINT_ERROR) + warnx(illoptstring, current_argv); + optopt = 0; + return BADCH; + } + if (idx) + *idx = match; + if (long_options[match].flag) { + *long_options[match].flag = long_options[match].val; + return 0; + } else + return (long_options[match].val); +} + +/* + * getopt_internal -- + * Parse argc/argv argument vector. Called by user level routines. + */ +static int +getopt_internal(int nargc, char **nargv, const char *options, + const struct option *long_options, int *idx, int flags) +{ + char *oli; /* option letter list index */ + int optchar, short_too; + static int posixly_correct = -1; + char *buf; + size_t len; + int optreset = 0; + + if (options == NULL) + return (-1); + + /* + * Disable GNU extensions if POSIXLY_CORRECT is set or options + * string begins with a '+'. + */ + if (posixly_correct == -1) + posixly_correct = _dupenv_s(&buf, &len, "POSIXLY_CORRECT"); + if (!posixly_correct || *options == '+') + flags &= ~FLAG_PERMUTE; + else if (*options == '-') + flags |= FLAG_ALLARGS; + if (*options == '+' || *options == '-') + options++; + if (!posixly_correct) + free(buf); + /* + * reset if requested + */ + if (optind == 0) + optind = optreset = 1; + + optarg = NULL; + if (optreset) + nonopt_start = nonopt_end = -1; +start: + if (optreset || !*place) { /* update scanning pointer */ + optreset = 0; + if (optind >= nargc) { /* end of argument vector */ + place = EMSG; + if (nonopt_end != -1) { + /* do permutation, if we have to */ + permute_args(nonopt_start, nonopt_end, + optind, nargv); + optind -= nonopt_end - nonopt_start; + } else if (nonopt_start != -1) { + /* + * If we skipped non-options, set optind + * to the first of them. 
+ */ + optind = nonopt_start; + } + nonopt_start = nonopt_end = -1; + return (-1); + } + place = nargv[optind]; + if (*place != '-' || + (place[1] == '\0' && strchr(options, '-') == NULL)) { + place = EMSG; /* found non-option */ + if (flags & FLAG_ALLARGS) { + /* + * GNU extension: + * return non-option as argument to option 1 + */ + optarg = nargv[optind++]; + return INORDER; + } + if (!(flags & FLAG_PERMUTE)) { + /* + * If no permutation wanted, stop parsing + * at first non-option. + */ + return (-1); + } + /* do permutation */ + if (nonopt_start == -1) + nonopt_start = optind; + else if (nonopt_end != -1) { + permute_args(nonopt_start, nonopt_end, + optind, nargv); + nonopt_start = optind - + (nonopt_end - nonopt_start); + nonopt_end = -1; + } + optind++; + /* process next argument */ + goto start; + } + if (nonopt_start != -1 && nonopt_end == -1) + nonopt_end = optind; + + /* + * If we have "-" do nothing, if "--" we are done. + */ + if (place[1] != '\0' && *++place == '-' && place[1] == '\0') { + optind++; + place = EMSG; + /* + * We found an option (--), so if we skipped + * non-options, we have to permute. + */ + if (nonopt_end != -1) { + permute_args(nonopt_start, nonopt_end, + optind, nargv); + optind -= nonopt_end - nonopt_start; + } + nonopt_start = nonopt_end = -1; + return (-1); + } + } + + /* + * Check long options if: + * 1) we were passed some + * 2) the arg is not just "-" + * 3) either the arg starts with -- we are getopt_long_only() + */ + if (long_options != NULL && place != nargv[optind] && + (*place == '-' || (flags & FLAG_LONGONLY))) { + short_too = 0; + if (*place == '-') + place++; /* --foo long option */ + else if (*place != ':' && strchr(options, *place) != NULL) + short_too = 1; /* could be short option too */ + + optchar = parse_long_options(nargv, options, long_options, + idx, short_too); + if (optchar != -1) { + place = EMSG; + return optchar; + } + } + + optchar = (int)*place++; + oli = strchr(options, optchar); + if (optchar == (int)':' || + (optchar == (int)'-' && *place != '\0') || + oli == NULL) { + /* + * If the user specified "-" and '-' isn't listed in + * options, return -1 (non-option) as per POSIX. + * Otherwise, it is an unknown option character (or ':'). + */ + if (optchar == (int)'-' && *place == '\0') + return (-1); + if (!*place) + ++optind; + if (PRINT_ERROR) + warnx(illoptchar, optchar); + optopt = optchar; + return BADCH; + } + if (long_options != NULL && optchar == 'W' && oli[1] == ';') { + /* -W long-option */ + if (*place) + ; + else if (++optind >= nargc) { /* no arg */ + place = EMSG; + if (PRINT_ERROR) + warnx(recargchar, optchar); + optopt = optchar; + return BADARG; + } /* white space */ + place = nargv[optind]; + optchar = parse_long_options(nargv, options, long_options, + idx, 0); + place = EMSG; + return optchar; + } + if (*++oli != ':') { /* doesn't take argument */ + if (!*place) + ++optind; + } else { /* takes (optional) argument */ + optarg = NULL; + if (*place) /* no white space */ + optarg = place; + else if (oli[1] != ':') { /* arg not optional */ + if (++optind >= nargc) { /* no arg */ + place = EMSG; + if (PRINT_ERROR) + warnx(recargchar, optchar); + optopt = optchar; + return BADARG; + } + optarg = nargv[optind]; + } + place = EMSG; + ++optind; + } + /* dump back option letter */ + return optchar; +} + +/* + * getopt -- + * Parse argc/argv argument vector. 
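A minimal caller of the compat getopt_long() defined below (hypothetical options, loosely modelled on EAL's, not part of this patch):

    #include <stdio.h>
    #include <getopt.h>     /* this compat header */

    static const struct option longopts[] = {
        { "huge-dir", required_argument, NULL, 'd' },
        { "no-pci",   no_argument,       NULL, 'n' },
        { NULL, 0, NULL, 0 }
    };

    int
    main(int argc, char *argv[])
    {
        int opt;

        while ((opt = getopt_long(argc, argv, "d:n", longopts, NULL)) != -1) {
            switch (opt) {
            case 'd':
                printf("huge-dir = %s\n", optarg);
                break;
            case 'n':
                printf("PCI disabled\n");
                break;
            default:    /* '?' on unknown or ambiguous options */
                return 1;
            }
        }
        return 0;
    }
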
+ */ +int +getopt(int nargc, char *nargv[], const char *options) +{ + return getopt_internal(nargc, nargv, options, NULL, NULL, + FLAG_PERMUTE); +} + +/* + * getopt_long -- + * Parse argc/argv argument vector. + */ +int +getopt_long(int nargc, char *nargv[], const char *options, + const struct option *long_options, int *idx) +{ + + return (getopt_internal(nargc, nargv, options, long_options, idx, + FLAG_PERMUTE)); +} + +/* + * getopt_long_only -- + * Parse argc/argv argument vector. + */ +int +getopt_long_only(int nargc, char *nargv[], const char *options, + const struct option *long_options, int *idx) +{ + + return (getopt_internal(nargc, nargv, options, long_options, idx, + FLAG_PERMUTE|FLAG_LONGONLY)); +} + +#endif /* NEED_USUAL_GETOPT */ diff --git a/lib/librte_eal/windows/include/dirent.h b/lib/librte_eal/windows/include/dirent.h new file mode 100644 index 0000000000..869a598378 --- /dev/null +++ b/lib/librte_eal/windows/include/dirent.h @@ -0,0 +1,664 @@ +/* SPDX-License-Identifier: MIT + * Dirent interface for Microsoft Visual Studio + * Version 1.21 + * Copyright (C) 2006-2012 Toni Ronkko + * https://github.com/tronkko/dirent + */ + +#ifndef DIRENT_H +#define DIRENT_H + +/* + * Include windows.h without Windows Sockets 1.1 to prevent conflicts with + * Windows Sockets 2.0. + */ +#ifndef WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN +#endif + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Maximum length of file name */ +#if !defined(PATH_MAX) +# define PATH_MAX MAX_PATH +#endif + +/* File type flags for d_type */ +#define DT_UNKNOWN 0 +#define DT_REG S_IFREG +#define DT_DIR S_IFDIR +#define DT_CHR S_IFCHR + +/* + * File type macros. Note that block devices, sockets and links cannot be + * distinguished on Windows and the macros S_ISBLK, S_ISSOCK and S_ISLNK are + * only defined for compatibility. These macros should always return false + * on Windows. 
+ */ +#if !defined(S_ISDIR) +# define S_ISDIR(mode) (((mode) & S_IFMT) == S_IFDIR) +#endif +#if !defined(S_ISREG) +# define S_ISREG(mode) (((mode) & S_IFMT) == S_IFREG) +#endif + +/* Wide-character version */ +struct _wdirent { + /* Always zero */ + long d_ino; + + /* Structure size */ + unsigned short d_reclen; + + /* Length of name without \0 */ + size_t d_namlen; + + /* File type */ + int d_type; + + /* File name */ + wchar_t d_name[PATH_MAX]; +}; +typedef struct _wdirent _wdirent; + +struct _WDIR { + /* Current directory entry */ + struct _wdirent ent; + + /* Private file data */ + WIN32_FIND_DATAW data; + + /* True if data is valid */ + int cached; + + /* Win32 search handle */ + HANDLE handle; + + /* Initial directory name */ + wchar_t *patt; +}; +typedef struct _WDIR _WDIR; + +static _WDIR *_wopendir(const wchar_t *dirname); +static int _wclosedir(_WDIR *dirp); + +/* For compatibility with Symbian */ +#define wdirent _wdirent +#define WDIR _WDIR +#define wopendir _wopendir +#define wclosedir _wclosedir + +/* Multi-byte character versions */ +struct dirent { + /* Always zero */ + long d_ino; + + /* Structure size */ + unsigned short d_reclen; + + /* Length of name without \0 */ + size_t d_namlen; + + /* File type */ + int d_type; + + /* File name */ + char d_name[PATH_MAX]; +}; +typedef struct dirent dirent; + +struct DIR { + struct dirent ent; + struct _WDIR *wdirp; +}; +typedef struct DIR DIR; + +static DIR *opendir(const char *dirname); +static struct dirent *readdir(DIR *dirp); +static int closedir(DIR *dirp); + +/* Internal utility functions */ +static WIN32_FIND_DATAW *dirent_first(_WDIR *dirp); +static WIN32_FIND_DATAW *dirent_next(_WDIR *dirp); + +static int dirent_mbstowcs_s( + size_t *pReturnValue, + wchar_t *wcstr, + size_t sizeInWords, + const char *mbstr, + size_t count); + +static int dirent_wcstombs_s( + size_t *pReturnValue, + char *mbstr, + size_t sizeInBytes, + const wchar_t *wcstr, + size_t count); + +static void dirent_set_errno(int error); + +/* + * Open directory stream DIRNAME for read and return a pointer to the + * internal working area that is used to retrieve individual directory + * entries. + */ +static _WDIR* +_wopendir(const wchar_t *dirname) +{ + _WDIR *dirp = NULL; + int error; + + /* Must have directory name */ + if (dirname == NULL || dirname[0] == '\0') { + dirent_set_errno(ENOENT); + return NULL; + } + + /* Allocate new _WDIR structure */ + dirp = (_WDIR *)malloc(sizeof(struct _WDIR)); + if (dirp != NULL) { + DWORD n; + + /* Reset _WDIR structure */ + dirp->handle = INVALID_HANDLE_VALUE; + dirp->patt = NULL; + dirp->cached = 0; + + /* Compute the length of full path plus zero terminator + * + * Note that on WinRT there's no way to convert relative paths + * into absolute paths, so just assume its an absolute path. + */ + #if defined(WINAPI_FAMILY) && (WINAPI_FAMILY == WINAPI_FAMILY_PHONE_APP) + n = wcslen(dirname); + #else + n = GetFullPathNameW(dirname, 0, NULL, NULL); + #endif + + /* Allocate room for absolute directory name and search + * pattern + */ + dirp->patt = (wchar_t *)malloc(sizeof(wchar_t) * n + 16); + if (dirp->patt) { + /* Convert relative directory name to an + * absolute one. This allows rewinddir() to + * function correctly even when current working + * directory is changed between opendir() + * and rewinddir(). + * + * Note that on WinRT there's no way to convert + * relative paths into absolute paths, so just + * assume its an absolute path. 
+ */ + #if defined(WINAPI_FAMILY) && \ + (WINAPI_FAMILY == WINAPI_FAMILY_PHONE_APP) + wcsncpy_s(dirp->patt, n + 1, dirname, n); + #else + n = GetFullPathNameW(dirname, n, dirp->patt, NULL); + #endif + if (n > 0) { + wchar_t *p; + + /* Append search pattern \* to the directory + * name + */ + p = dirp->patt + n; + if (dirp->patt < p) { + switch (p[-1]) { + case '\\': + case '/': + case ':': + /* Directory ends in path separator, + * e.g.c:\temp\ + */ + /*NOP*/; + break; + + default: + /* Directory name doesn't end in path + * separator + */ + *p++ = '\\'; + } + } + *p++ = '*'; + *p = '\0'; + + /* Open directory stream and retrieve the first + * entry + */ + if (dirent_first(dirp)) { + /* Directory stream opened successfully */ + error = 0; + } else { + /* Cannot retrieve first entry */ + error = 1; + dirent_set_errno(ENOENT); + } + + } else { + /* Cannot retrieve full path name */ + dirent_set_errno(ENOENT); + error = 1; + } + + } else { + /* Cannot allocate memory for search pattern */ + error = 1; + } + + } else { + /* Cannot allocate _WDIR structure */ + error = 1; + } + + /* Clean up in case of error */ + if (error && dirp) { + _wclosedir(dirp); + dirp = NULL; + } + + return dirp; +} + +/* + * Close directory stream opened by opendir() function. + * This invalidates the DIR structure as well as any directory + * entry read previously by _wreaddir(). + */ +static int +_wclosedir(_WDIR *dirp) +{ + int ok; + if (dirp) { + + /* Release search handle */ + if (dirp->handle != INVALID_HANDLE_VALUE) { + FindClose(dirp->handle); + dirp->handle = INVALID_HANDLE_VALUE; + } + + /* Release search pattern */ + if (dirp->patt) { + free(dirp->patt); + dirp->patt = NULL; + } + + /* Release directory structure */ + free(dirp); + ok = /*success*/0; + + } else { + /* Invalid directory stream */ + dirent_set_errno(EBADF); + ok = /*failure*/-1; + } + return ok; +} + +/* Get first directory entry (internal) */ +static WIN32_FIND_DATAW* +dirent_first(_WDIR *dirp) +{ + WIN32_FIND_DATAW *datap; + + /* Open directory and retrieve the first entry */ + dirp->handle = FindFirstFileExW( + dirp->patt, FindExInfoStandard, &dirp->data, + FindExSearchNameMatch, NULL, 0); + if (dirp->handle != INVALID_HANDLE_VALUE) { + + /* a directory entry is now waiting in memory */ + datap = &dirp->data; + dirp->cached = 1; + + } else { + + /* Failed to re-open directory: no directory entry in memory */ + dirp->cached = 0; + datap = NULL; + + } + return datap; +} + +/* Get next directory entry (internal) */ +static WIN32_FIND_DATAW* +dirent_next(_WDIR *dirp) +{ + WIN32_FIND_DATAW *p; + + /* Get next directory entry */ + if (dirp->cached != 0) { + + /* A valid directory entry already in memory */ + p = &dirp->data; + dirp->cached = 0; + + } else if (dirp->handle != INVALID_HANDLE_VALUE) { + + /* Get the next directory entry from stream */ + if (FindNextFileW(dirp->handle, &dirp->data) != FALSE) { + /* Got a file */ + p = &dirp->data; + } else { + /* The very last entry has been processed + *or an error occurred + */ + FindClose(dirp->handle); + dirp->handle = INVALID_HANDLE_VALUE; + p = NULL; + } + + } else { + + /* End of directory stream reached */ + p = NULL; + + } + + return p; +} + +/* + * Open directory stream using plain old C-string. 
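A typical consumer of the char-based wrapper that follows, sketched as hypothetical caller code (not part of this patch):

    #include <stdio.h>
    #include <dirent.h>     /* this compat header */

    static int
    list_dir(const char *path)
    {
        DIR *d = opendir(path);
        struct dirent *ent;

        if (d == NULL)
            return -1;      /* errno was set via dirent_set_errno() */

        while ((ent = readdir(d)) != NULL) {
            if (ent->d_type == DT_DIR)
                printf("[dir]  %s\n", ent->d_name);
            else
                printf("[file] %s\n", ent->d_name);
        }
        return closedir(d);
    }
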
+ */ +static DIR* +opendir(const char *dirname) +{ + struct DIR *dirp; + int error; + + /* Must have directory name */ + if (dirname == NULL || dirname[0] == '\0') { + dirent_set_errno(ENOENT); + return NULL; + } + + /* Allocate memory for DIR structure */ + dirp = (DIR *)malloc(sizeof(struct DIR)); + if (dirp) { + wchar_t wname[PATH_MAX]; + size_t n; + + /* Convert directory name to wide-character string */ + error = dirent_mbstowcs_s(&n, wname, PATH_MAX, + dirname, PATH_MAX); + if (!error) { + + /* Open directory stream using wide-character name */ + dirp->wdirp = _wopendir(wname); + if (dirp->wdirp) { + /* Directory stream opened */ + error = 0; + } else { + /* Failed to open directory stream */ + error = 1; + } + + } else { + /* + * Cannot convert file name to wide-character string. + * This occurs if the string contains invalid multi-byte + * sequences or the output buffer is too small to + * contain the resulting string. + */ + error = 1; + } + + } else { + /* Cannot allocate DIR structure */ + error = 1; + } + + /* Clean up in case of error */ + if (error && dirp) { + free(dirp); + dirp = NULL; + } + + return dirp; +} + +/* + * Read next directory entry. + * + * When working with text consoles, please note that file names + * returned by readdir() are represented in the default ANSI code + * page while any output toconsole is typically formatted on another + * code page. Thus, non-ASCII characters in file names will not usually + * display correctly on console. The problem can be fixed in two ways: + * (1) change the character set of console to 1252 using chcp utility + * and use Lucida Console font, or (2) use _cprintf function when + * writing to console. The _cprinf() will re-encode ANSI strings to the + * console code page so many non-ASCII characters will display correctly. + */ +static struct dirent* +readdir(DIR *dirp) +{ + WIN32_FIND_DATAW *datap; + struct dirent *entp; + + /* Read next directory entry */ + datap = dirent_next(dirp->wdirp); + if (datap) { + size_t n; + int error; + + /* Attempt to convert file name to multi-byte string */ + error = dirent_wcstombs_s(&n, dirp->ent.d_name, + PATH_MAX, datap->cFileName, PATH_MAX); + + /* + * If the file name cannot be represented by a multi-byte + * string, then attempt to use old 8+3 file name. + * This allows traditional Unix-code to access some file + * names despite of unicode characters, although file names + * may seem unfamiliar to the user. + * + * Be ware that the code below cannot come up with a short + * file name unless the file system provides one. At least + * VirtualBox shared folders fail to do this. + */ + if (error && datap->cAlternateFileName[0] != '\0') { + error = dirent_wcstombs_s( + &n, dirp->ent.d_name, PATH_MAX, + datap->cAlternateFileName, PATH_MAX); + } + + if (!error) { + DWORD attr; + + /* Initialize directory entry for return */ + entp = &dirp->ent; + + /* Length of file name excluding zero terminator */ + entp->d_namlen = n - 1; + + /* File attributes */ + attr = datap->dwFileAttributes; + if ((attr & FILE_ATTRIBUTE_DEVICE) != 0) + entp->d_type = DT_CHR; + else if ((attr & FILE_ATTRIBUTE_DIRECTORY) != 0) + entp->d_type = DT_DIR; + else + entp->d_type = DT_REG; + + /* Reset dummy fields */ + entp->d_ino = 0; + entp->d_reclen = sizeof(struct dirent); + + } else { + /* + * Cannot convert file name to multi-byte string so + * construct an erroneous directory entry and return + * that. Note that we cannot return NULL as that would + * stop the processing of directory entries completely. 
+ */ + entp = &dirp->ent; + entp->d_name[0] = '?'; + entp->d_name[1] = '\0'; + entp->d_namlen = 1; + entp->d_type = DT_UNKNOWN; + entp->d_ino = 0; + entp->d_reclen = 0; + } + + } else { + /* No more directory entries */ + entp = NULL; + } + + return entp; +} + +/* + * Close directory stream. + */ +static int +closedir(DIR *dirp) +{ + int ok; + if (dirp) { + + /* Close wide-character directory stream */ + ok = _wclosedir(dirp->wdirp); + dirp->wdirp = NULL; + + /* Release multi-byte character version */ + free(dirp); + + } else { + + /* Invalid directory stream */ + dirent_set_errno(EBADF); + ok = /*failure*/-1; + + } + return ok; +} + +/* Convert multi-byte string to wide character string */ +static int +dirent_mbstowcs_s( + size_t *pReturnValue, + wchar_t *wcstr, + size_t sizeInWords, + const char *mbstr, + size_t count) +{ + int error; + + #if defined(_MSC_VER) && _MSC_VER >= 1400 + /* Microsoft Visual Studio 2005 or later */ + error = mbstowcs_s(pReturnValue, wcstr, + sizeInWords, mbstr, count); + #else + + /* Older Visual Studio or non-Microsoft compiler */ + size_t n; + + /* Convert to wide-character string (or count characters) */ + n = mbstowcs(wcstr, mbstr, sizeInWords); + if (!wcstr || n < count) { + + /* Zero-terminate output buffer */ + if (wcstr && sizeInWords) { + if (n >= sizeInWords) + n = sizeInWords - 1; + wcstr[n] = 0; + } + + /* Length of resuting multi-byte string WITH zero + *terminator + */ + if (pReturnValue) + *pReturnValue = n + 1; + + /* Success */ + error = 0; + + } else { + + /* Could not convert string */ + error = 1; + + } + #endif + + return error; +} + +/* Convert wide-character string to multi-byte string */ +static int +dirent_wcstombs_s( + size_t *pReturnValue, + char *mbstr, + size_t sizeInBytes, /* max size of mbstr */ + const wchar_t *wcstr, + size_t count) +{ + int error; + + #if defined(_MSC_VER) && _MSC_VER >= 1400 + /* Microsoft Visual Studio 2005 or later */ + error = wcstombs_s(pReturnValue, mbstr, sizeInBytes, wcstr, count); + #else + /* Older Visual Studio or non-Microsoft compiler */ + size_t n; + + /* Convert to multi-byte string + * (or count the number of bytes needed) + */ + n = wcstombs(mbstr, wcstr, sizeInBytes); + if (!mbstr || n < count) { + /* Zero-terminate output buffer */ + if (mbstr && sizeInBytes) { + if (n >= sizeInBytes) + n = sizeInBytes - 1; + mbstr[n] = '\0'; + } + /* Length of resulting multi-bytes string WITH + *zero-terminator + */ + if (pReturnValue) + *pReturnValue = n + 1; + /* Success */ + error = 0; + } else { + /* Cannot convert string */ + error = 1; + } + #endif + + return error; +} + +/* Set errno variable */ +static void +dirent_set_errno(int error) +{ +#if defined(_MSC_VER) && _MSC_VER >= 1400 + /* Microsoft Visual Studio 2005 and later */ + _set_errno(error); +#else + + /* Non-Microsoft compiler or older Microsoft compiler */ + errno = error; +#endif +} + +#ifdef __cplusplus +} +#endif +#endif /*DIRENT_H*/ diff --git a/lib/librte_eal/windows/include/fnmatch.h b/lib/librte_eal/windows/include/fnmatch.h new file mode 100644 index 0000000000..41b574312c --- /dev/null +++ b/lib/librte_eal/windows/include/fnmatch.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Intel Corporation + */ + +#ifndef _FNMATCH_H_ +#define _FNMATCH_H_ + +/** + * This file is required to support the common code in eal_common_log.c + * as Microsoft libc does not contain fnmatch.h. This may be removed in + * future releases. 
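On a POSIX system the call sketched below returns 0 on a match; with the stub defined in this header it always returns FNM_NOMATCH, so pattern matching (used by the common log code, for instance) is effectively disabled on Windows for now. Illustrative call, not part of this patch:

    #include <fnmatch.h>    /* this stub header */

    /* POSIX: 0 (match); with this stub: always FNM_NOMATCH */
    int r = fnmatch("lib.eal.*", "lib.eal.thread", 0);
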
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define FNM_NOMATCH 1
+
+/**
+ * This function is used for searching a given string source
+ * with the given regular expression pattern.
+ *
+ * @param pattern
+ *	regular expression notation describing the pattern to match
+ *
+ * @param string
+ *	source string to search for the pattern
+ *
+ * @param flags
+ *	containing information about the pattern
+ *
+ * @return
+ *	if the pattern is found then return 0 or else FNM_NOMATCH
+ */
+static inline int fnmatch(__rte_unused const char *pattern,
+		__rte_unused const char *string,
+		__rte_unused int flags)
+{
+	/* TODO */
+	/* This is a stub, not the expected result */
+	return FNM_NOMATCH;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _FNMATCH_H_ */
diff --git a/lib/librte_eal/windows/include/getopt.h b/lib/librte_eal/windows/include/getopt.h
new file mode 100644
index 0000000000..6f57af454b
--- /dev/null
+++ b/lib/librte_eal/windows/include/getopt.h
@@ -0,0 +1,96 @@
+/* SPDX-License-Identifier: BSD-2-Clause
+ * Copyright (c) 2000 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Dieter Baron and Thomas Klausner.
+ */
+
+/**
+ * @file
+ * getopt compat.
+ *
+ * This module provides getopt() and getopt_long().
+ */
+
+#ifndef _USUAL_GETOPT_H_
+#define _USUAL_GETOPT_H_
+
+#ifndef NEED_USUAL_GETOPT
+#if !defined(HAVE_GETOPT_H) || !defined(HAVE_GETOPT) || \
+	!defined(HAVE_GETOPT_LONG)
+#define NEED_USUAL_GETOPT
+#endif
+#endif
+
+#ifndef NEED_USUAL_GETOPT
+
+/* Use system getopt */
+#ifdef RTE_TOOLCHAIN_GCC
+#include_next <getopt.h>
+#else
+#include <getopt.h>
+#endif
+
+#else /* NEED_USUAL_GETOPT */
+
+/* avoid name collision */
+#define optarg usual_optarg
+#define opterr usual_opterr
+#define optind usual_optind
+#define optopt usual_optopt
+#define getopt(a, b, c) usual_getopt(a, b, c)
+#define getopt_long(a, b, c, d, e) usual_getopt_long(a, b, c, d, e)
+
+
+/** argument to current option, or NULL if it has none */
+extern const char *optarg;
+/** Current position in arg string. Starts from 1.
+ * Setting to 0 resets state.
+ */
+extern int optind;
+/** whether getopt() should print error messages on problems. Default: 1. */
+extern int opterr;
+/** Option char which caused error */
+extern int optopt;
+
+/** long option takes no argument */
+#define no_argument 0
+/** long option requires argument */
+#define required_argument 1
+/** long option has optional argument */
+#define optional_argument 2
+
+/** Long option description */
+struct option {
+	/** name of long option */
+	const char *name;
+
+	/**
+	 * whether option takes an argument.
+	 * One of no_argument, required_argument, and optional_argument.
+	 */
+	int has_arg;
+
+	/** if not NULL, set *flag to val when option found */
+	int *flag;
+
+	/** if flag not NULL, value to set *flag to; else return value */
+	int val;
+};
+
+/** Compat: getopt */
+int getopt(int argc, char *argv[], const char *options);
+
+/** Compat: getopt_long */
+int getopt_long(int argc, char *argv[], const char *options,
+		const struct option *longopts, int *longindex);
+
+/** Compat: getopt_long_only */
+int getopt_long_only(int nargc, char *argv[], const char *options,
+		     const struct option *long_options, int *idx);
+
+
+#endif /* NEED_USUAL_GETOPT */
+
+#endif /* !_USUAL_GETOPT_H_ */
diff --git a/lib/librte_eal/windows/include/meson.build b/lib/librte_eal/windows/include/meson.build
new file mode 100644
index 0000000000..7d18dd52f1
--- /dev/null
+++ b/lib/librte_eal/windows/include/meson.build
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright 2020 Mellanox Technologies, Ltd
+
+includes += include_directories('.')
+
+headers += files(
+	'rte_os.h',
+)
diff --git a/lib/librte_eal/windows/include/pthread.h b/lib/librte_eal/windows/include/pthread.h
new file mode 100644
index 0000000000..b9dd18e568
--- /dev/null
+++ b/lib/librte_eal/windows/include/pthread.h
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Intel Corporation
+ */
+
+#ifndef _PTHREAD_H_
+#define _PTHREAD_H_
+
+/**
+ * This file is required to support the common code in eal_common_proc.c,
+ * eal_common_thread.c and common\include\rte_per_lcore.h as Microsoft libc
+ * does not contain pthread.h. This may be removed in future releases.
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <windows.h>
+
+#define PTHREAD_BARRIER_SERIAL_THREAD TRUE
+
+/* define pthread_t on Windows; Microsoft libc does not provide it */
+typedef uintptr_t pthread_t;
+
+/* define pthread_attr_t on Windows; Microsoft libc does not provide it */
+typedef void *pthread_attr_t;
+
+typedef SYNCHRONIZATION_BARRIER pthread_barrier_t;
+
+#define pthread_barrier_init(barrier, attr, count) \
+	InitializeSynchronizationBarrier(barrier, count, -1)
+#define pthread_barrier_wait(barrier) EnterSynchronizationBarrier(barrier, \
+	SYNCHRONIZATION_BARRIER_FLAGS_BLOCK_ONLY)
+#define pthread_barrier_destroy(barrier) \
+	DeleteSynchronizationBarrier(barrier)
+#define pthread_cancel(thread) TerminateThread((HANDLE) thread, 0)
+
+/* pthread function overrides */
+#define pthread_self() \
+	((pthread_t)GetCurrentThreadId())
+#define pthread_setaffinity_np(thread, size, cpuset) \
+	eal_set_thread_affinity_mask(thread, (unsigned long *) cpuset)
+#define pthread_getaffinity_np(thread, size, cpuset) \
+	eal_get_thread_affinity_mask(thread, (unsigned long *) cpuset)
+#define pthread_create(threadid, threadattr, threadfunc, args) \
+	eal_create_thread(threadid, threadfunc, args)
+
+static inline int
+eal_set_thread_affinity_mask(pthread_t threadid, unsigned long *cpuset)
+{
+	SetThreadAffinityMask((HANDLE) threadid, *cpuset);
+	return 0;
+}
+
+static inline int
+eal_get_thread_affinity_mask(pthread_t threadid, unsigned long *cpuset)
+{
+	/* Workaround for the lack of a GetThreadAffinityMask()
+	 * API in Windows
+	 */
+	/* obtain previous mask by setting dummy mask */
+	DWORD dwprevaffinitymask =
+		SetThreadAffinityMask((HANDLE) threadid, 0x1);
+	/* set it back!
*/ + SetThreadAffinityMask((HANDLE) threadid, dwprevaffinitymask); + *cpuset = dwprevaffinitymask; + return 0; +} + +static inline int +eal_create_thread(void *threadid, void *threadfunc, void *args) +{ + HANDLE hThread; + hThread = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)threadfunc, + args, 0, (LPDWORD)threadid); + if (hThread) { + SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS); + SetThreadPriority(hThread, THREAD_PRIORITY_TIME_CRITICAL); + } + return ((hThread != NULL) ? 0 : E_FAIL); +} + +static inline int +pthread_join(pthread_t thread __attribute__((__unused__)), + void **value_ptr __attribute__((__unused__))) +{ + return 0; +} + +#ifdef __cplusplus +} +#endif + +#endif /* _PTHREAD_H_ */ diff --git a/lib/librte_eal/windows/include/regex.h b/lib/librte_eal/windows/include/regex.h new file mode 100644 index 0000000000..827f938414 --- /dev/null +++ b/lib/librte_eal/windows/include/regex.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Intel Corporation + */ + +#ifndef _REGEX_H_ +#define _REGEX_H_ + +/** + * This file is required to support the common code in eal_common_log.c + * as Microsoft libc does not contain regex.h. This may be removed in + * future releases. + */ +#ifdef __cplusplus +extern "C" { +#endif + +#define REG_NOMATCH 1 +#define REG_ESPACE 12 + +#include + +/* defining regex_t for Windows */ +typedef void *regex_t; +/* defining regmatch_t for Windows */ +typedef void *regmatch_t; + +/** + * The regcomp() function will compile the regular expression + * contained in the string pointed to by the pattern argument + * and place the results in the structure pointed to by preg. + * The cflags argument is the bitwise inclusive OR of zero or + * more of the flags + */ +static inline int regcomp(__rte_unused regex_t *preg, + __rte_unused const char *regex, __rte_unused int cflags) +{ + /* TODO */ + /* This is a stub, not the expected result */ + return REG_ESPACE; +} + +/** + * The regexec() function compares the null-terminated string + * specified by string with the compiled regular expression + * preg initialised by a previous call to regcomp(). If it finds + * a match, regexec() returns 0; otherwise it returns non-zero + * indicating either no match or an error. The eflags argument + * is the bitwise inclusive OR of zero or more of the flags. + */ +static inline int regexec(__rte_unused const regex_t *preg, + __rte_unused const char *string, __rte_unused size_t nmatch, + __rte_unused regmatch_t pmatch[], __rte_unused int eflags) +{ + /* TODO */ + /* This is a stub, not the expected result */ + return REG_NOMATCH; +} + +/** + * The regerror() function provides a mapping from error codes + * returned by regcomp() and regexec() to unspecified printable strings. + */ +static inline size_t regerror(__rte_unused int errcode, + __rte_unused const regex_t *preg, char *errbuf, + __rte_unused size_t errbuf_size) +{ + /* TODO */ + /* This is a stub, not the expected result */ + if (errbuf) { + *errbuf = '\0'; + return 1; + } + return 0; +} + +/** + * The regfree() function frees any memory allocated by regcomp() + * associated with preg. 
+ */
+static inline void regfree(__rte_unused regex_t *preg)
+{
+	/* TODO */
+	/* This is a stub, not the expected result */
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _REGEX_H_ */
diff --git a/lib/librte_eal/windows/include/rte_os.h b/lib/librte_eal/windows/include/rte_os.h
new file mode 100644
index 0000000000..e1e0378e6f
--- /dev/null
+++ b/lib/librte_eal/windows/include/rte_os.h
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2019 Intel Corporation
+ */
+
+#ifndef _RTE_OS_H_
+#define _RTE_OS_H_
+
+/**
+ * This header should contain any function/macro definitions
+ * which are not natively supported or are named differently in
+ * the Windows OS. Functions will be added in future releases.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include
+#include
+#include
+#include
+
+/* limits.h replacement */
+#include
+#ifndef PATH_MAX
+#define PATH_MAX _MAX_PATH
+#endif
+
+#define strerror_r(a, b, c) strerror_s(b, c, a)
+
+/* strdup is deprecated in Microsoft libc and _strdup is preferred */
+#define strdup(str) _strdup(str)
+
+typedef SSIZE_T ssize_t;
+
+#define strtok_r(str, delim, saveptr) strtok_s(str, delim, saveptr)
+
+#define index(a, b) strchr(a, b)
+#define rindex(a, b) strrchr(a, b)
+
+#define strncasecmp(s1, s2, count) _strnicmp(s1, s2, count)
+
+/**
+ * Create a thread.
+ * This function is private to EAL.
+ *
+ * @param thread
+ *   The location to store the thread id if successful.
+ * @return
+ *   0 for success, -1 if the thread is not created.
+ */
+int eal_thread_create(pthread_t *thread);
+
+/**
+ * Create a map of processors and cores on the system.
+ * This function is private to EAL.
+ */
+void eal_create_cpu_map(void);
+
+#ifndef RTE_TOOLCHAIN_GCC
+static inline int
+asprintf(char **buffer, const char *format, ...)
+{
+	int size, ret;
+	va_list arg;
+
+	va_start(arg, format);
+	size = vsnprintf(NULL, 0, format, arg);
+	va_end(arg);
+	if (size < 0)
+		return -1;
+	size++;
+
+	*buffer = malloc(size);
+	if (*buffer == NULL)
+		return -1;
+
+	va_start(arg, format);
+	ret = vsnprintf(*buffer, size, format, arg);
+	va_end(arg);
+	if (ret != size - 1) {
+		free(*buffer);
+		return -1;
+	}
+	return ret;
+}
+#endif /* RTE_TOOLCHAIN_GCC */
+
+/* cpu_set macros implementation */
+#define RTE_CPU_AND(dst, src1, src2) CPU_AND(dst, src1, src2)
+#define RTE_CPU_OR(dst, src1, src2) CPU_OR(dst, src1, src2)
+#define RTE_CPU_FILL(set) CPU_FILL(set)
+#define RTE_CPU_NOT(dst, src) CPU_NOT(dst, src)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_OS_H_ */
diff --git a/lib/librte_eal/windows/include/sched.h b/lib/librte_eal/windows/include/sched.h
new file mode 100644
index 0000000000..fbe07f742c
--- /dev/null
+++ b/lib/librte_eal/windows/include/sched.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Intel Corporation
+ */
+
+#ifndef _SCHED_H_
+#define _SCHED_H_
+
+/**
+ * This file is added to support the common code in eal_common_thread.c
+ * as Microsoft libc does not contain sched.h. This may be removed
+ * in future releases.
+ */ +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef CPU_SETSIZE +#define CPU_SETSIZE RTE_MAX_LCORE +#endif + +#define _BITS_PER_SET (sizeof(long long) * 8) +#define _BIT_SET_MASK (_BITS_PER_SET - 1) + +#define _NUM_SETS(b) (((b) + _BIT_SET_MASK) / _BITS_PER_SET) +#define _WHICH_SET(b) ((b) / _BITS_PER_SET) +#define _WHICH_BIT(b) ((b) & (_BITS_PER_SET - 1)) + +typedef struct _rte_cpuset_s { + long long _bits[_NUM_SETS(CPU_SETSIZE)]; +} rte_cpuset_t; + +#define CPU_SET(b, s) ((s)->_bits[_WHICH_SET(b)] |= (1LL << _WHICH_BIT(b))) + +#define CPU_ZERO(s) \ + do { \ + unsigned int _i; \ + \ + for (_i = 0; _i < _NUM_SETS(CPU_SETSIZE); _i++) \ + (s)->_bits[_i] = 0LL; \ + } while (0) + +#define CPU_ISSET(b, s) (((s)->_bits[_WHICH_SET(b)] & \ + (1LL << _WHICH_BIT(b))) != 0LL) + +static inline int +count_cpu(rte_cpuset_t *s) +{ + unsigned int _i; + int count = 0; + + for (_i = 0; _i < _NUM_SETS(CPU_SETSIZE); _i++) + if (CPU_ISSET(_i, s) != 0LL) + count++; + return count; +} +#define CPU_COUNT(s) count_cpu(s) + +#define CPU_AND(dst, src1, src2) \ +do { \ + unsigned int _i; \ + \ + for (_i = 0; _i < _NUM_SETS(CPU_SETSIZE); _i++) \ + (dst)->_bits[_i] = (src1)->_bits[_i] & (src2)->_bits[_i]; \ +} while (0) + +#define CPU_OR(dst, src1, src2) \ +do { \ + unsigned int _i; \ + \ + for (_i = 0; _i < _NUM_SETS(CPU_SETSIZE); _i++) \ + (dst)->_bits[_i] = (src1)->_bits[_i] | (src2)->_bits[_i]; \ +} while (0) + +#define CPU_FILL(s) \ +do { \ + unsigned int _i; \ + for (_i = 0; _i < _NUM_SETS(CPU_SETSIZE); _i++) \ + (s)->_bits[_i] = -1LL; \ +} while (0) + +#define CPU_NOT(dst, src) \ +do { \ + unsigned int _i; \ + for (_i = 0; _i < _NUM_SETS(CPU_SETSIZE); _i++) \ + (dst)->_bits[_i] = (src)->_bits[_i] ^ -1LL; \ +} while (0) + +#ifdef __cplusplus +} +#endif + +#endif /* _SCHED_H_ */ diff --git a/lib/librte_eal/windows/include/sys/queue.h b/lib/librte_eal/windows/include/sys/queue.h new file mode 100644 index 0000000000..a65949a78a --- /dev/null +++ b/lib/librte_eal/windows/include/sys/queue.h @@ -0,0 +1,302 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + */ + +#ifndef _SYS_QUEUE_H_ +#define _SYS_QUEUE_H_ + +/* + * This file defines tail queues. + * + * A tail queue is headed by a pair of pointers, one to the head of the + * list and the other to the tail of the list. The elements are doubly + * linked so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before or + * after an existing element, at the head of the list, or at the end of + * the list. A tail queue may be traversed in either direction. + * + * Below is a summary of implemented functions where: + * + means the macro is available + * - means the macro is not available + * s means the macro is available but is slow (runs in O(n) time) + * + * TAILQ + * _HEAD + + * _CLASS_HEAD + + * _HEAD_INITIALIZER + + * _ENTRY + + * _CLASS_ENTRY + + * _INIT + + * _EMPTY + + * _FIRST + + * _NEXT + + * _PREV + + * _LAST + + * _LAST_FAST + + * _FOREACH + + * _FOREACH_FROM + + * _FOREACH_SAFE + + * _FOREACH_FROM_SAFE + + * _FOREACH_REVERSE + + * _FOREACH_REVERSE_FROM + + * _FOREACH_REVERSE_SAFE + + * _FOREACH_REVERSE_FROM_SAFE + + * _INSERT_HEAD + + * _INSERT_BEFORE + + * _INSERT_AFTER + + * _INSERT_TAIL + + * _CONCAT + + * _REMOVE_AFTER - + * _REMOVE_HEAD - + * _REMOVE + + * _SWAP + + * + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * List definitions. 
+ */ +#define LIST_HEAD(name, type) \ +struct name { \ + struct type *lh_first; /* first element */ \ +} + +#define QMD_TRACE_ELEM(elem) +#define QMD_TRACE_HEAD(head) +#define TRACEBUF +#define TRACEBUF_INITIALIZER + +#define TRASHIT(x) +#define QMD_IS_TRASHED(x) 0 + +#define QMD_SAVELINK(name, link) + +#ifdef __cplusplus +/* + * In C++ there can be structure lists and class lists: + */ +#define QUEUE_TYPEOF(type) type +#else +#define QUEUE_TYPEOF(type) struct type +#endif + +/* + * Tail queue declarations. + */ +#define TAILQ_HEAD(name, type) \ +struct name { \ + struct type *tqh_first; /* first element */ \ + struct type **tqh_last; /* addr of last next element */ \ + TRACEBUF \ +} + +#define TAILQ_CLASS_HEAD(name, type) \ +struct name { \ + class type *tqh_first; /* first element */ \ + class type **tqh_last; /* addr of last next element */ \ + TRACEBUF \ +} + +#define TAILQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).tqh_first, TRACEBUF_INITIALIZER } + +#define TAILQ_ENTRY(type) \ +struct { \ + struct type *tqe_next; /* next element */ \ + struct type **tqe_prev; /* address of previous next element */ \ + TRACEBUF \ +} + +#define TAILQ_CLASS_ENTRY(type) \ +struct { \ + class type *tqe_next; /* next element */ \ + class type **tqe_prev; /* address of previous next element */ \ + TRACEBUF \ +} + +/* + * Tail queue functions. + */ +#define QMD_TAILQ_CHECK_HEAD(head, field) +#define QMD_TAILQ_CHECK_TAIL(head, headname) +#define QMD_TAILQ_CHECK_NEXT(elm, field) +#define QMD_TAILQ_CHECK_PREV(elm, field) + +#define TAILQ_CONCAT(head1, head2, field) do { \ + if (!TAILQ_EMPTY(head2)) { \ + *(head1)->tqh_last = (head2)->tqh_first; \ + (head2)->tqh_first->field.tqe_prev = (head1)->tqh_last; \ + (head1)->tqh_last = (head2)->tqh_last; \ + TAILQ_INIT((head2)); \ + QMD_TRACE_HEAD(head1); \ + QMD_TRACE_HEAD(head2); \ + } \ +} while (0) + +#define TAILQ_EMPTY(head) ((head)->tqh_first == NULL) + +#define TAILQ_FIRST(head) ((head)->tqh_first) + +#define TAILQ_FOREACH(var, head, field) \ + for ((var) = TAILQ_FIRST((head)); \ + (var); \ + (var) = TAILQ_NEXT((var), field)) + +#define TAILQ_FOREACH_FROM(var, head, field) \ + for ((var) = ((var) ? (var) : TAILQ_FIRST((head))); \ + (var); \ + (var) = TAILQ_NEXT((var), field)) + +#define TAILQ_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = TAILQ_FIRST((head)); \ + (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define TAILQ_FOREACH_FROM_SAFE(var, head, field, tvar) \ + for ((var) = ((var) ? (var) : TAILQ_FIRST((head))); \ + (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define TAILQ_FOREACH_REVERSE(var, head, headname, field) \ + for ((var) = TAILQ_LAST((head), headname); \ + (var); \ + (var) = TAILQ_PREV((var), headname, field)) + +#define TAILQ_FOREACH_REVERSE_FROM(var, head, headname, field) \ + for ((var) = ((var) ? (var) : TAILQ_LAST((head), headname)); \ + (var); \ + (var) = TAILQ_PREV((var), headname, field)) + +#define TAILQ_FOREACH_REVERSE_SAFE(var, head, headname, field, tvar) \ + for ((var) = TAILQ_LAST((head), headname); \ + (var) && ((tvar) = TAILQ_PREV((var), headname, field), 1); \ + (var) = (tvar)) + +#define TAILQ_FOREACH_REVERSE_FROM_SAFE(var, head, headname, field, tvar) \ + for ((var) = ((var) ? 
(var) : TAILQ_LAST((head), headname)); \ + (var) && ((tvar) = TAILQ_PREV((var), headname, field), 1); \ + (var) = (tvar)) + +#define TAILQ_INIT(head) do { \ + TAILQ_FIRST((head)) = NULL; \ + (head)->tqh_last = &TAILQ_FIRST((head)); \ + QMD_TRACE_HEAD(head); \ +} while (0) + +#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \ + QMD_TAILQ_CHECK_NEXT(listelm, field); \ + TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field); \ + if (TAILQ_NEXT((listelm), field) != NULL) \ + TAILQ_NEXT((elm), field)->field.tqe_prev = \ + &TAILQ_NEXT((elm), field); \ + else { \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_HEAD(head); \ + } \ + TAILQ_NEXT((listelm), field) = (elm); \ + (elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field); \ + QMD_TRACE_ELEM(&(elm)->field); \ + QMD_TRACE_ELEM(&(listelm)->field); \ +} while (0) + +#define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \ + QMD_TAILQ_CHECK_PREV(listelm, field); \ + (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \ + TAILQ_NEXT((elm), field) = (listelm); \ + *(listelm)->field.tqe_prev = (elm); \ + (listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_ELEM(&(elm)->field); \ + QMD_TRACE_ELEM(&(listelm)->field); \ +} while (0) + +#define TAILQ_INSERT_HEAD(head, elm, field) do { \ + QMD_TAILQ_CHECK_HEAD(head, field); \ + TAILQ_NEXT((elm), field) = TAILQ_FIRST((head)); \ + if (TAILQ_FIRST((head)) != NULL) \ + TAILQ_FIRST((head))->field.tqe_prev = \ + &TAILQ_NEXT((elm), field); \ + else \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + TAILQ_FIRST((head)) = (elm); \ + (elm)->field.tqe_prev = &TAILQ_FIRST((head)); \ + QMD_TRACE_HEAD(head); \ + QMD_TRACE_ELEM(&(elm)->field); \ +} while (0) + +#define TAILQ_INSERT_TAIL(head, elm, field) do { \ + QMD_TAILQ_CHECK_TAIL(head, field); \ + TAILQ_NEXT((elm), field) = NULL; \ + (elm)->field.tqe_prev = (head)->tqh_last; \ + *(head)->tqh_last = (elm); \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_HEAD(head); \ + QMD_TRACE_ELEM(&(elm)->field); \ +} while (0) + +#define TAILQ_LAST(head, headname) \ + (*(((struct headname *)((head)->tqh_last))->tqh_last)) + +/* + * The FAST function is fast in that it causes no data access other + * then the access to the head. The standard LAST function above + * will cause a data access of both the element you want and + * the previous element. FAST is very useful for instances when + * you may want to prefetch the last data element. + */ +#define TAILQ_LAST_FAST(head, type, field) \ + (TAILQ_EMPTY(head) ? 
NULL : __containerof((head)->tqh_last, \ + QUEUE_TYPEOF(type), field.tqe_next)) + +#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next) + +#define TAILQ_PREV(elm, headname, field) \ + (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last)) + +#define TAILQ_REMOVE(head, elm, field) do { \ + QMD_SAVELINK(oldnext, (elm)->field.tqe_next); \ + QMD_SAVELINK(oldprev, (elm)->field.tqe_prev); \ + QMD_TAILQ_CHECK_NEXT(elm, field); \ + QMD_TAILQ_CHECK_PREV(elm, field); \ + if ((TAILQ_NEXT((elm), field)) != NULL) \ + TAILQ_NEXT((elm), field)->field.tqe_prev = \ + (elm)->field.tqe_prev; \ + else { \ + (head)->tqh_last = (elm)->field.tqe_prev; \ + QMD_TRACE_HEAD(head); \ + } \ + *(elm)->field.tqe_prev = TAILQ_NEXT((elm), field); \ + TRASHIT(*oldnext); \ + TRASHIT(*oldprev); \ + QMD_TRACE_ELEM(&(elm)->field); \ +} while (0) + +#define TAILQ_SWAP(head1, head2, type, field) do { \ + QUEUE_TYPEOF(type) * swap_first = (head1)->tqh_first; \ + QUEUE_TYPEOF(type) * *swap_last = (head1)->tqh_last; \ + (head1)->tqh_first = (head2)->tqh_first; \ + (head1)->tqh_last = (head2)->tqh_last; \ + (head2)->tqh_first = swap_first; \ + (head2)->tqh_last = swap_last; \ + swap_first = (head1)->tqh_first; \ + if (swap_first != NULL) \ + swap_first->field.tqe_prev = &(head1)->tqh_first; \ + else \ + (head1)->tqh_last = &(head1)->tqh_first; \ + swap_first = (head2)->tqh_first; \ + if (swap_first != NULL) \ + swap_first->field.tqe_prev = &(head2)->tqh_first; \ + else \ + (head2)->tqh_last = &(head2)->tqh_first; \ +} while (0) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_QUEUE_H_ */ diff --git a/lib/librte_eal/windows/include/unistd.h b/lib/librte_eal/windows/include/unistd.h new file mode 100644 index 0000000000..757b7f3c57 --- /dev/null +++ b/lib/librte_eal/windows/include/unistd.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2019 Intel Corporation + */ + +#ifndef _UNISTD_H_ +#define _UNISTD_H_ +/** + * This file is added to support common code in eal_common_lcore.c + * as Microsoft libc does not contain unistd.h. This may be removed + * in future releases. 
+ */ +#endif /* _UNISTD_H_ */ diff --git a/lib/librte_eal/windows/meson.build b/lib/librte_eal/windows/meson.build new file mode 100644 index 0000000000..09dd4ab2fa --- /dev/null +++ b/lib/librte_eal/windows/meson.build @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright(c) 2019 Intel Corporation + +subdir('include') + +sources += files( + 'eal.c', + 'eal_debug.c', + 'eal_lcore.c', + 'eal_thread.c', + 'getopt.c', +) diff --git a/license/exceptions.txt b/license/exceptions.txt index 1e21a863f1..c984764a02 100644 --- a/license/exceptions.txt +++ b/license/exceptions.txt @@ -12,9 +12,9 @@ Note that following licenses are not exceptions:- --------------------------------------------------------------------------------------------------- SPDX Identifier TB Approval Date GB Approval Date File name --------------------------------------------------------------------------------------------------- -1.MIT 10/23/2019 02/10/2020 lib/librte_eal/windows/eal/include/dirent.h -2.BSD-2-Clause 10/23/2019 12/18/2019 lib/librte_eal/windows/eal/include/getopt.h +1.MIT 10/23/2019 02/10/2020 lib/librte_eal/windows/include/dirent.h +2.BSD-2-Clause 10/23/2019 12/18/2019 lib/librte_eal/windows/include/getopt.h 3.ISC AND - BSD-2-Clause 10/23/2019 12/18/2019 lib/librte_eal/windows/eal/getopt.c + BSD-2-Clause 10/23/2019 12/18/2019 lib/librte_eal/windows/getopt.c 4.GPL-2.0 09/25/2019 12/18/2019 buildtools/pmdinfogen/pmdinfogen.* --------------------------------------------------------------------------------------------------- diff --git a/meson.build b/meson.build index ace4a0b8bf..d36580438e 100644 --- a/meson.build +++ b/meson.build @@ -29,7 +29,7 @@ abi_version_file = files('ABI_VERSION') # for passing to pmdinfogen scripts global_inc = include_directories('.', 'config', 'lib/librte_eal/include', - 'lib/librte_eal/@0@/eal/include'.format(host_machine.system()), + 'lib/librte_eal/@0@/include'.format(host_machine.system()), ) subdir('config') diff --git a/mk/exec-env/freebsd/rte.vars.mk b/mk/exec-env/freebsd/rte.vars.mk index 3608530d3f..630eb55f73 100644 --- a/mk/exec-env/freebsd/rte.vars.mk +++ b/mk/exec-env/freebsd/rte.vars.mk @@ -18,7 +18,7 @@ EXECENV_CFLAGS = -pthread endif # include in every library to build -EXECENV_CFLAGS += -I$(RTE_SDK)/lib/librte_eal/freebsd/eal/include +EXECENV_CFLAGS += -I$(RTE_SDK)/lib/librte_eal/freebsd/include EXECENV_LDFLAGS = EXECENV_LDLIBS = -lexecinfo diff --git a/mk/exec-env/linux/rte.vars.mk b/mk/exec-env/linux/rte.vars.mk index bea3f76577..41ef4195b0 100644 --- a/mk/exec-env/linux/rte.vars.mk +++ b/mk/exec-env/linux/rte.vars.mk @@ -18,7 +18,7 @@ EXECENV_CFLAGS = -pthread endif # include in every library to build -EXECENV_CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linux/eal/include +EXECENV_CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linux/include EXECENV_LDLIBS = EXECENV_ASFLAGS =
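
Below is a minimal usage sketch (illustrative only, not part of the patch) showing how
common EAL code is expected to consume the rte_cpuset_t macros provided by the relocated
windows/include/sched.h. It assumes RTE_MAX_LCORE comes from the generated rte_config.h,
as it does in a normal DPDK build.

/* Illustrative sketch, not part of this patch. */
#include <rte_config.h>   /* provides RTE_MAX_LCORE, used by CPU_SETSIZE */
#include <sched.h>        /* the Windows shim shown above */

/* Return the lowest lcore id enabled in the set, or -1 if none. */
static int
first_lcore_in_set(const rte_cpuset_t *set)
{
	unsigned int i;

	for (i = 0; i < CPU_SETSIZE; i++)
		if (CPU_ISSET(i, set))
			return (int)i;
	return -1;
}

static int
example(void)
{
	rte_cpuset_t set;

	CPU_ZERO(&set);   /* clear all lcore bits */
	CPU_SET(2, &set); /* enable lcore 2 */
	return first_lcore_in_set(&set); /* returns 2 */
}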