F: lib/librte_eal/common/eal_common_fbarray.c
F: lib/librte_eal/common/eal_common_mem*
F: lib/librte_eal/common/eal_hugepages.h
-F: lib/librte_eal/linux/eal/eal_mem*
-F: lib/librte_eal/freebsd/eal/eal_mem*
+F: lib/librte_eal/linux/eal_mem*
+F: lib/librte_eal/freebsd/eal_mem*
F: doc/guides/prog_guide/env_abstraction_layer.rst
F: app/test/test_external_mem.c
F: app/test/test_func_reentrancy.c
F: lib/librte_eal/x86/
Linux EAL (with overlaps)
-F: lib/librte_eal/linux/Makefile
-F: lib/librte_eal/linux/eal/
+F: lib/librte_eal/linux/
F: doc/guides/linux_gsg/
Linux UIO
Linux VFIO
M: Anatoly Burakov <anatoly.burakov@intel.com>
-F: lib/librte_eal/linux/eal/*vfio*
+F: lib/librte_eal/linux/*vfio*
F: drivers/bus/pci/linux/*vfio*
FreeBSD EAL (with overlaps)
M: Bruce Richardson <bruce.richardson@intel.com>
-F: lib/librte_eal/freebsd/Makefile
-F: lib/librte_eal/freebsd/eal/
+F: lib/librte_eal/freebsd/
F: doc/guides/freebsd_gsg/
FreeBSD contigmem
'src=' + meson.current_source_dir(),
'MODULE_CFLAGS=-include ' + meson.source_root() + '/config/rte_config.h' +
' -I' + meson.source_root() + '/lib/librte_eal/include' +
- ' -I' + meson.source_root() + '/lib/librte_eal/linux/eal/include' +
+ ' -I' + meson.source_root() + '/lib/librte_eal/linux/include' +
' -I' + meson.build_root() +
' -I' + meson.current_source_dir(),
'modules'],
include $(RTE_SDK)/mk/rte.vars.mk
DIRS-y += include
-DIRS-$(CONFIG_RTE_EXEC_ENV_LINUX) += linux/eal
-DEPDIRS-linux/eal := include
+DIRS-$(CONFIG_RTE_EXEC_ENV_LINUX) += linux
+DEPDIRS-linux := include
-DIRS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += freebsd/eal
-DEPDIRS-freebsd/eal := include
+DIRS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += freebsd
+DEPDIRS-freebsd := include
include $(RTE_SDK)/mk/rte.subdir.mk
--- /dev/null
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2010-2019 Intel Corporation
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+LIB = librte_eal.a
+
+ARCH_DIR ?= $(RTE_ARCH)
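+# common and arch-specific sources listed below are found via VPATH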
+VPATH += $(RTE_SDK)/lib/librte_eal/common
+VPATH += $(RTE_SDK)/lib/librte_eal/$(ARCH_DIR)
+
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+CFLAGS += -I$(SRCDIR)/include
+CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common
+CFLAGS += -I$(RTE_SDK)/lib/librte_eal/include
+CFLAGS += $(WERROR_FLAGS) -O3
+
+LDLIBS += -lexecinfo
+LDLIBS += -lpthread
+LDLIBS += -lgcc_s
+LDLIBS += -lrte_kvargs
+
+EXPORT_MAP := ../rte_eal_version.map
+
+# specific to freebsd exec-env
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) := eal.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_cpuflags.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_memory.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_hugepage_info.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_thread.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_debug.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_memalloc.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_lcore.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_timer.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_interrupts.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_alarm.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_dev.c
+
+# from common dir
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_lcore.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_timer.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_memzone.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_log.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_launch.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_mcfg.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_memalloc.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_memory.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_tailqs.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_errno.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_cpuflags.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_hypervisor.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_string_fns.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_hexdump.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_devargs.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_class.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_bus.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_dev.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_options.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_thread.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_proc.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_fbarray.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_uuid.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_malloc.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += hotplug_mp.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += malloc_elem.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += malloc_heap.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += malloc_mp.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_keepalive.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_option.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_service.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_random.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_reciprocal.c
+
+# from arch dir
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_cpuflags.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_hypervisor.c
+SRCS-$(CONFIG_RTE_ARCH_X86) += rte_spinlock.c
+SRCS-y += rte_cycles.c
+
+CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST)
+
+# workaround for a gcc bug with noreturn attribute
+# http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603
+ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y)
+CFLAGS_eal_thread.o += -Wno-return-type
+CFLAGS_eal_hpet.o += -Wno-return-type
+endif
+
+INC := rte_os.h
+
+SYMLINK-$(CONFIG_RTE_EXEC_ENV_FREEBSD)-include := $(addprefix include/,$(INC))
+
+include $(RTE_SDK)/mk/rte.lib.mk
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2018 Intel Corporation.
+ * Copyright(c) 2014 6WIND S.A.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <syslog.h>
+#include <getopt.h>
+#include <sys/file.h>
+#include <stddef.h>
+#include <errno.h>
+#include <limits.h>
+#include <sys/mman.h>
+#include <sys/queue.h>
+#include <sys/stat.h>
+
+#include <rte_compat.h>
+#include <rte_common.h>
+#include <rte_debug.h>
+#include <rte_memory.h>
+#include <rte_launch.h>
+#include <rte_eal.h>
+#include <rte_errno.h>
+#include <rte_per_lcore.h>
+#include <rte_lcore.h>
+#include <rte_service_component.h>
+#include <rte_log.h>
+#include <rte_random.h>
+#include <rte_cycles.h>
+#include <rte_string_fns.h>
+#include <rte_cpuflags.h>
+#include <rte_interrupts.h>
+#include <rte_bus.h>
+#include <rte_dev.h>
+#include <rte_devargs.h>
+#include <rte_version.h>
+#include <rte_vfio.h>
+#include <rte_option.h>
+#include <rte_atomic.h>
+#include <malloc_heap.h>
+
+#include "eal_private.h"
+#include "eal_thread.h"
+#include "eal_internal_cfg.h"
+#include "eal_filesystem.h"
+#include "eal_hugepages.h"
+#include "eal_options.h"
+#include "eal_memcfg.h"
+
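+/* default amount of memory (64MB) used when hugepages are not enabled */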
+#define MEMSIZE_IF_NO_HUGE_PAGE (64ULL * 1024ULL * 1024ULL)
+
+/* Allow the application to print its usage message too if set */
+static rte_usage_hook_t rte_application_usage_hook = NULL;
+/* early configuration structure, when memory config is not mmapped */
+static struct rte_mem_config early_mem_config;
+
+/* define fd variable here, because the file needs to be kept open for the
+ * duration of the program, as we hold a write lock on it in the primary proc */
+static int mem_cfg_fd = -1;
+
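+/* write lock covering the memsegs region of the shared config; holding it
+ * is what marks this process as primary (see eal_proc_type_detect())
+ */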
+static struct flock wr_lock = {
+ .l_type = F_WRLCK,
+ .l_whence = SEEK_SET,
+ .l_start = offsetof(struct rte_mem_config, memsegs),
+ .l_len = sizeof(early_mem_config.memsegs),
+};
+
+/* Address of global and public configuration */
+static struct rte_config rte_config = {
+ .mem_config = &early_mem_config,
+};
+
+/* internal configuration (per-core) */
+struct lcore_config lcore_config[RTE_MAX_LCORE];
+
+/* internal configuration */
+struct internal_config internal_config;
+
+/* used by rte_rdtsc() */
+int rte_cycles_vmware_tsc_map;
+
+/* platform-specific runtime dir */
+static char runtime_dir[PATH_MAX];
+
+static const char *default_runtime_dir = "/var/run";
+
+int
+eal_create_runtime_dir(void)
+{
+ const char *directory = default_runtime_dir;
+ const char *xdg_runtime_dir = getenv("XDG_RUNTIME_DIR");
+ const char *fallback = "/tmp";
+ char tmp[PATH_MAX];
+ int ret;
+
+ if (getuid() != 0) {
+ /* try XDG path first, fall back to /tmp */
+ if (xdg_runtime_dir != NULL)
+ directory = xdg_runtime_dir;
+ else
+ directory = fallback;
+ }
+ /* create DPDK subdirectory under runtime dir */
+ ret = snprintf(tmp, sizeof(tmp), "%s/dpdk", directory);
+	if (ret < 0 || ret >= sizeof(tmp)) {
+ RTE_LOG(ERR, EAL, "Error creating DPDK runtime path name\n");
+ return -1;
+ }
+
+ /* create prefix-specific subdirectory under DPDK runtime dir */
+ ret = snprintf(runtime_dir, sizeof(runtime_dir), "%s/%s",
+ tmp, eal_get_hugefile_prefix());
+	if (ret < 0 || ret >= sizeof(runtime_dir)) {
+ RTE_LOG(ERR, EAL, "Error creating prefix-specific runtime path name\n");
+ return -1;
+ }
+
+ /* create the path if it doesn't exist. no "mkdir -p" here, so do it
+ * step by step.
+ */
+ ret = mkdir(tmp, 0700);
+ if (ret < 0 && errno != EEXIST) {
+ RTE_LOG(ERR, EAL, "Error creating '%s': %s\n",
+ tmp, strerror(errno));
+ return -1;
+ }
+
+ ret = mkdir(runtime_dir, 0700);
+ if (ret < 0 && errno != EEXIST) {
+ RTE_LOG(ERR, EAL, "Error creating '%s': %s\n",
+ runtime_dir, strerror(errno));
+ return -1;
+ }
+
+ return 0;
+}
+
+int
+eal_clean_runtime_dir(void)
+{
+	/* Nothing to do on FreeBSD for now: unlike Linux, it does not create
+	 * per-process runtime files, so there is nothing to clean up.
+	 */
+ return 0;
+}
+
+
+const char *
+rte_eal_get_runtime_dir(void)
+{
+ return runtime_dir;
+}
+
+/* Return user provided mbuf pool ops name */
+const char *
+rte_eal_mbuf_user_pool_ops(void)
+{
+ return internal_config.user_mbuf_pool_ops_name;
+}
+
+/* Return a pointer to the configuration structure */
+struct rte_config *
+rte_eal_get_configuration(void)
+{
+ return &rte_config;
+}
+
+enum rte_iova_mode
+rte_eal_iova_mode(void)
+{
+ return rte_eal_get_configuration()->iova_mode;
+}
+
+/* parse a sysfs (or other) file containing one integer value */
+int
+eal_parse_sysfs_value(const char *filename, unsigned long *val)
+{
+ FILE *f;
+ char buf[BUFSIZ];
+ char *end = NULL;
+
+ if ((f = fopen(filename, "r")) == NULL) {
+ RTE_LOG(ERR, EAL, "%s(): cannot open sysfs value %s\n",
+ __func__, filename);
+ return -1;
+ }
+
+ if (fgets(buf, sizeof(buf), f) == NULL) {
+ RTE_LOG(ERR, EAL, "%s(): cannot read sysfs value %s\n",
+ __func__, filename);
+ fclose(f);
+ return -1;
+ }
+ *val = strtoul(buf, &end, 0);
+ if ((buf[0] == '\0') || (end == NULL) || (*end != '\n')) {
+ RTE_LOG(ERR, EAL, "%s(): cannot parse sysfs value %s\n",
+ __func__, filename);
+ fclose(f);
+ return -1;
+ }
+ fclose(f);
+ return 0;
+}
+
+
+/* create memory configuration in shared/mmap memory. Take out
+ * a write lock on the memsegs, so we can auto-detect primary/secondary.
+ * This means we never close the file while running (auto-close on exit).
+ * We also don't lock the whole file, so that in future we can use read-locks
+ * on other parts, e.g. memzones, to detect if there are running secondary
+ * processes. */
+static int
+rte_eal_config_create(void)
+{
+ size_t page_sz = sysconf(_SC_PAGE_SIZE);
+ size_t cfg_len = sizeof(*rte_config.mem_config);
+ size_t cfg_len_aligned = RTE_ALIGN(cfg_len, page_sz);
+ void *rte_mem_cfg_addr, *mapped_mem_cfg_addr;
+ int retval;
+
+ const char *pathname = eal_runtime_config_path();
+
+ if (internal_config.no_shconf)
+ return 0;
+
+	/* map the config before the base address so that we don't waste a page */
+ if (internal_config.base_virtaddr != 0)
+ rte_mem_cfg_addr = (void *)
+ RTE_ALIGN_FLOOR(internal_config.base_virtaddr -
+ sizeof(struct rte_mem_config), page_sz);
+ else
+ rte_mem_cfg_addr = NULL;
+
+ if (mem_cfg_fd < 0){
+ mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0600);
+ if (mem_cfg_fd < 0) {
+ RTE_LOG(ERR, EAL, "Cannot open '%s' for rte_mem_config\n",
+ pathname);
+ return -1;
+ }
+ }
+
+ retval = ftruncate(mem_cfg_fd, cfg_len);
+ if (retval < 0){
+ close(mem_cfg_fd);
+ mem_cfg_fd = -1;
+ RTE_LOG(ERR, EAL, "Cannot resize '%s' for rte_mem_config\n",
+ pathname);
+ return -1;
+ }
+
+ retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock);
+ if (retval < 0){
+ close(mem_cfg_fd);
+ mem_cfg_fd = -1;
+ RTE_LOG(ERR, EAL, "Cannot create lock on '%s'. Is another primary "
+ "process running?\n", pathname);
+ return -1;
+ }
+
+ /* reserve space for config */
+ rte_mem_cfg_addr = eal_get_virtual_area(rte_mem_cfg_addr,
+ &cfg_len_aligned, page_sz, 0, 0);
+ if (rte_mem_cfg_addr == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config\n");
+ close(mem_cfg_fd);
+ mem_cfg_fd = -1;
+ return -1;
+ }
+
+ /* remap the actual file into the space we've just reserved */
+ mapped_mem_cfg_addr = mmap(rte_mem_cfg_addr,
+ cfg_len_aligned, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_FIXED, mem_cfg_fd, 0);
+ if (mapped_mem_cfg_addr == MAP_FAILED) {
+ RTE_LOG(ERR, EAL, "Cannot remap memory for rte_config\n");
+ munmap(rte_mem_cfg_addr, cfg_len);
+ close(mem_cfg_fd);
+ mem_cfg_fd = -1;
+ return -1;
+ }
+
+ memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config));
+ rte_config.mem_config = rte_mem_cfg_addr;
+
+	/* store address of the config in the config itself so that secondary
+	 * processes can later map the config into this exact location
+	 */
+ rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr;
+
+ return 0;
+}
+
+/* attach to an existing shared memory config */
+static int
+rte_eal_config_attach(void)
+{
+ void *rte_mem_cfg_addr;
+ const char *pathname = eal_runtime_config_path();
+
+ if (internal_config.no_shconf)
+ return 0;
+
+ if (mem_cfg_fd < 0){
+ mem_cfg_fd = open(pathname, O_RDWR);
+ if (mem_cfg_fd < 0) {
+ RTE_LOG(ERR, EAL, "Cannot open '%s' for rte_mem_config\n",
+ pathname);
+ return -1;
+ }
+ }
+
+ rte_mem_cfg_addr = mmap(NULL, sizeof(*rte_config.mem_config),
+ PROT_READ, MAP_SHARED, mem_cfg_fd, 0);
+ /* don't close the fd here, it will be closed on reattach */
+ if (rte_mem_cfg_addr == MAP_FAILED) {
+ close(mem_cfg_fd);
+ mem_cfg_fd = -1;
+ RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config! error %i (%s)\n",
+ errno, strerror(errno));
+ return -1;
+ }
+
+ rte_config.mem_config = rte_mem_cfg_addr;
+
+ return 0;
+}
+
+/* reattach the shared config at the exact address where the primary process mapped it */
+static int
+rte_eal_config_reattach(void)
+{
+ struct rte_mem_config *mem_config;
+ void *rte_mem_cfg_addr;
+
+ if (internal_config.no_shconf)
+ return 0;
+
+	/* save the address the primary process has mapped the shared config to */
+ rte_mem_cfg_addr =
+ (void *)(uintptr_t)rte_config.mem_config->mem_cfg_addr;
+
+ /* unmap original config */
+ munmap(rte_config.mem_config, sizeof(struct rte_mem_config));
+
+ /* remap the config at proper address */
+ mem_config = (struct rte_mem_config *) mmap(rte_mem_cfg_addr,
+ sizeof(*mem_config), PROT_READ | PROT_WRITE, MAP_SHARED,
+ mem_cfg_fd, 0);
+ close(mem_cfg_fd);
+ mem_cfg_fd = -1;
+
+ if (mem_config == MAP_FAILED || mem_config != rte_mem_cfg_addr) {
+ if (mem_config != MAP_FAILED) {
+ /* errno is stale, don't use */
+ RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config at [%p], got [%p]"
+ " - please use '--" OPT_BASE_VIRTADDR
+ "' option\n",
+ rte_mem_cfg_addr, mem_config);
+ munmap(mem_config, sizeof(struct rte_mem_config));
+ return -1;
+ }
+ RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config! error %i (%s)\n",
+ errno, strerror(errno));
+ return -1;
+ }
+
+ rte_config.mem_config = mem_config;
+
+ return 0;
+}
+
+/* Detect if we are a primary or a secondary process */
+enum rte_proc_type_t
+eal_proc_type_detect(void)
+{
+ enum rte_proc_type_t ptype = RTE_PROC_PRIMARY;
+ const char *pathname = eal_runtime_config_path();
+
+	/* if there is no shared config, there can be no secondary processes */
+ if (!internal_config.no_shconf) {
+ /* if we can open the file but not get a write-lock we are a
+ * secondary process. NOTE: if we get a file handle back, we
+ * keep that open and don't close it to prevent a race condition
+ * between multiple opens.
+ */
+ if (((mem_cfg_fd = open(pathname, O_RDWR)) >= 0) &&
+ (fcntl(mem_cfg_fd, F_SETLK, &wr_lock) < 0))
+ ptype = RTE_PROC_SECONDARY;
+ }
+
+ RTE_LOG(INFO, EAL, "Auto-detected process type: %s\n",
+ ptype == RTE_PROC_PRIMARY ? "PRIMARY" : "SECONDARY");
+
+ return ptype;
+}
+
+/* Sets up rte_config structure with the pointer to shared memory config. */
+static int
+rte_config_init(void)
+{
+ rte_config.process_type = internal_config.process_type;
+
+ switch (rte_config.process_type){
+ case RTE_PROC_PRIMARY:
+ if (rte_eal_config_create() < 0)
+ return -1;
+ eal_mcfg_update_from_internal();
+ break;
+ case RTE_PROC_SECONDARY:
+ if (rte_eal_config_attach() < 0)
+ return -1;
+ eal_mcfg_wait_complete();
+ if (eal_mcfg_check_version() < 0) {
+ RTE_LOG(ERR, EAL, "Primary and secondary process DPDK version mismatch\n");
+ return -1;
+ }
+ if (rte_eal_config_reattach() < 0)
+ return -1;
+ eal_mcfg_update_internal();
+ break;
+ case RTE_PROC_AUTO:
+ case RTE_PROC_INVALID:
+ RTE_LOG(ERR, EAL, "Invalid process type %d\n",
+ rte_config.process_type);
+ return -1;
+ }
+
+ return 0;
+}
+
+/* display usage */
+static void
+eal_usage(const char *prgname)
+{
+ printf("\nUsage: %s ", prgname);
+ eal_common_usage();
+ /* Allow the application to print its usage message too if hook is set */
+	if (rte_application_usage_hook) {
+ printf("===== Application Usage =====\n\n");
+ rte_application_usage_hook(prgname);
+ }
+}
+
+/* Set a per-application usage message */
+rte_usage_hook_t
+rte_set_application_usage_hook(rte_usage_hook_t usage_func)
+{
+ rte_usage_hook_t old_func;
+
+	/* old_func will be NULL on the first call, when no hook has been set. */
+ old_func = rte_application_usage_hook;
+ rte_application_usage_hook = usage_func;
+
+ return old_func;
+}
+
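+/* total amount of hugepage memory: page size times page count, summed over
+ * all configured hugepage sizes and NUMA nodes, clamped to SIZE_MAX
+ */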
+static inline size_t
+eal_get_hugepage_mem_size(void)
+{
+ uint64_t size = 0;
+ unsigned i, j;
+
+ for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
+ struct hugepage_info *hpi = &internal_config.hugepage_info[i];
+ if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0) {
+ for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
+ size += hpi->hugepage_sz * hpi->num_pages[j];
+ }
+ }
+ }
+
+ return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX;
+}
+
+/* Parse the arguments for --log-level only */
+static void
+eal_log_level_parse(int argc, char **argv)
+{
+ int opt;
+ char **argvopt;
+ int option_index;
+ const int old_optind = optind;
+ const int old_optopt = optopt;
+ const int old_optreset = optreset;
+ char * const old_optarg = optarg;
+
+ argvopt = argv;
+ optind = 1;
+ optreset = 1;
+
+ while ((opt = getopt_long(argc, argvopt, eal_short_options,
+ eal_long_options, &option_index)) != EOF) {
+
+ int ret;
+
+ /* getopt is not happy, stop right now */
+ if (opt == '?')
+ break;
+
+ ret = (opt == OPT_LOG_LEVEL_NUM) ?
+ eal_parse_common_option(opt, optarg, &internal_config) : 0;
+
+ /* common parser is not happy */
+ if (ret < 0)
+ break;
+ }
+
+ /* restore getopt lib */
+ optind = old_optind;
+ optopt = old_optopt;
+ optreset = old_optreset;
+ optarg = old_optarg;
+}
+
+/* Parse the argument given in the command line of the application */
+static int
+eal_parse_args(int argc, char **argv)
+{
+ int opt, ret;
+ char **argvopt;
+ int option_index;
+ char *prgname = argv[0];
+ const int old_optind = optind;
+ const int old_optopt = optopt;
+ const int old_optreset = optreset;
+ char * const old_optarg = optarg;
+
+ argvopt = argv;
+ optind = 1;
+ optreset = 1;
+ opterr = 0;
+
+ while ((opt = getopt_long(argc, argvopt, eal_short_options,
+ eal_long_options, &option_index)) != EOF) {
+
+ /*
+		 * getopt didn't recognise the option, let's parse the
+ * registered options to see if the flag is valid
+ */
+ if (opt == '?') {
+ ret = rte_option_parse(argv[optind-1]);
+ if (ret == 0)
+ continue;
+
+ eal_usage(prgname);
+ ret = -1;
+ goto out;
+ }
+
+ ret = eal_parse_common_option(opt, optarg, &internal_config);
+ /* common parser is not happy */
+ if (ret < 0) {
+ eal_usage(prgname);
+ ret = -1;
+ goto out;
+ }
+ /* common parser handled this option */
+ if (ret == 0)
+ continue;
+
+ switch (opt) {
+ case OPT_MBUF_POOL_OPS_NAME_NUM:
+ {
+ char *ops_name = strdup(optarg);
+ if (ops_name == NULL)
+ RTE_LOG(ERR, EAL, "Could not store mbuf pool ops name\n");
+ else {
+ /* free old ops name */
+ if (internal_config.user_mbuf_pool_ops_name !=
+ NULL)
+ free(internal_config.user_mbuf_pool_ops_name);
+
+ internal_config.user_mbuf_pool_ops_name =
+ ops_name;
+ }
+ break;
+ }
+ case 'h':
+ eal_usage(prgname);
+ exit(EXIT_SUCCESS);
+ default:
+ if (opt < OPT_LONG_MIN_NUM && isprint(opt)) {
+ RTE_LOG(ERR, EAL, "Option %c is not supported "
+ "on FreeBSD\n", opt);
+ } else if (opt >= OPT_LONG_MIN_NUM &&
+ opt < OPT_LONG_MAX_NUM) {
+ RTE_LOG(ERR, EAL, "Option %s is not supported "
+ "on FreeBSD\n",
+ eal_long_options[option_index].name);
+ } else {
+ RTE_LOG(ERR, EAL, "Option %d is not supported "
+ "on FreeBSD\n", opt);
+ }
+ eal_usage(prgname);
+ ret = -1;
+ goto out;
+ }
+ }
+
+ /* create runtime data directory */
+ if (internal_config.no_shconf == 0 &&
+ eal_create_runtime_dir() < 0) {
+ RTE_LOG(ERR, EAL, "Cannot create runtime directory\n");
+ ret = -1;
+ goto out;
+ }
+
+ if (eal_adjust_config(&internal_config) != 0) {
+ ret = -1;
+ goto out;
+ }
+
+ /* sanity checks */
+ if (eal_check_common_options(&internal_config) != 0) {
+ eal_usage(prgname);
+ ret = -1;
+ goto out;
+ }
+
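+	/* put the program name back in front of the remaining arguments, so
+	 * the application sees a conventional argv after the EAL options
+	 */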
+ if (optind >= 0)
+ argv[optind-1] = prgname;
+ ret = optind-1;
+
+out:
+ /* restore getopt lib */
+ optind = old_optind;
+ optopt = old_optopt;
+ optreset = old_optreset;
+ optarg = old_optarg;
+
+ return ret;
+}
+
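+/* callback for rte_memseg_list_walk(): return 1 (stop the walk) as soon as
+ * an internal memseg list with memory on the target socket is found
+ */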
+static int
+check_socket(const struct rte_memseg_list *msl, void *arg)
+{
+ int *socket_id = arg;
+
+ if (msl->external)
+ return 0;
+
+ if (msl->socket_id == *socket_id && msl->memseg_arr.count != 0)
+ return 1;
+
+ return 0;
+}
+
+static void
+eal_check_mem_on_local_socket(void)
+{
+ int socket_id;
+
+ socket_id = rte_lcore_to_socket_id(rte_config.master_lcore);
+
+ if (rte_memseg_list_walk(check_socket, &socket_id) == 0)
+ RTE_LOG(WARNING, EAL, "WARNING: Master core has no memory on local socket!\n");
+}
+
+
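+/* no-op lcore function, launched only so the master lcore can tell when all
+ * slave lcores are up and ready
+ */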
+static int
+sync_func(__attribute__((unused)) void *arg)
+{
+ return 0;
+}
+
+/* return non-zero if hugepages are enabled. */
+int rte_eal_has_hugepages(void)
+{
+ return !internal_config.no_hugetlbfs;
+}
+
+/* Abstraction for port I/O privilege */
+int
+rte_eal_iopl_init(void)
+{
+ static int fd = -1;
+
+ if (fd < 0)
+ fd = open("/dev/io", O_RDWR);
+
+ if (fd < 0)
+ return -1;
+ /* keep fd open for iopl */
+ return 0;
+}
+
+static void rte_eal_init_alert(const char *msg)
+{
+ fprintf(stderr, "EAL: FATAL: %s\n", msg);
+ RTE_LOG(ERR, EAL, "%s\n", msg);
+}
+
+/* Launch threads, called at application init(). */
+int
+rte_eal_init(int argc, char **argv)
+{
+ int i, fctret, ret;
+ pthread_t thread_id;
+ static rte_atomic32_t run_once = RTE_ATOMIC32_INIT(0);
+ char cpuset[RTE_CPU_AFFINITY_STR_LEN];
+ char thread_name[RTE_MAX_THREAD_NAME_LEN];
+
+ /* checks if the machine is adequate */
+ if (!rte_cpu_is_supported()) {
+ rte_eal_init_alert("unsupported cpu type.");
+ rte_errno = ENOTSUP;
+ return -1;
+ }
+
+ if (!rte_atomic32_test_and_set(&run_once)) {
+ rte_eal_init_alert("already called initialization.");
+ rte_errno = EALREADY;
+ return -1;
+ }
+
+ thread_id = pthread_self();
+
+ eal_reset_internal_config(&internal_config);
+
+ /* set log level as early as possible */
+ eal_log_level_parse(argc, argv);
+
+ if (rte_eal_cpu_init() < 0) {
+ rte_eal_init_alert("Cannot detect lcores.");
+ rte_errno = ENOTSUP;
+ return -1;
+ }
+
+ fctret = eal_parse_args(argc, argv);
+ if (fctret < 0) {
+ rte_eal_init_alert("Invalid 'command line' arguments.");
+ rte_errno = EINVAL;
+ rte_atomic32_clear(&run_once);
+ return -1;
+ }
+
+ /* FreeBSD always uses legacy memory model */
+ internal_config.legacy_mem = true;
+
+ if (eal_plugins_init() < 0) {
+ rte_eal_init_alert("Cannot init plugins");
+ rte_errno = EINVAL;
+ rte_atomic32_clear(&run_once);
+ return -1;
+ }
+
+ if (eal_option_device_parse()) {
+ rte_errno = ENODEV;
+ rte_atomic32_clear(&run_once);
+ return -1;
+ }
+
+ if (rte_config_init() < 0) {
+ rte_eal_init_alert("Cannot init config");
+ return -1;
+ }
+
+ if (rte_eal_intr_init() < 0) {
+ rte_eal_init_alert("Cannot init interrupt-handling thread");
+ return -1;
+ }
+
+ if (rte_eal_alarm_init() < 0) {
+ rte_eal_init_alert("Cannot init alarm");
+ /* rte_eal_alarm_init sets rte_errno on failure. */
+ return -1;
+ }
+
+ /* Put mp channel init before bus scan so that we can init the vdev
+ * bus through mp channel in the secondary process before the bus scan.
+ */
+ if (rte_mp_channel_init() < 0 && rte_errno != ENOTSUP) {
+ rte_eal_init_alert("failed to init mp channel");
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ rte_errno = EFAULT;
+ return -1;
+ }
+ }
+
+ if (rte_bus_scan()) {
+ rte_eal_init_alert("Cannot scan the buses for devices");
+ rte_errno = ENODEV;
+ rte_atomic32_clear(&run_once);
+ return -1;
+ }
+
+ /* if no EAL option "--iova-mode=<pa|va>", use bus IOVA scheme */
+ if (internal_config.iova_mode == RTE_IOVA_DC) {
+ /* autodetect the IOVA mapping mode (default is RTE_IOVA_PA) */
+ enum rte_iova_mode iova_mode = rte_bus_get_iommu_class();
+
+ if (iova_mode == RTE_IOVA_DC)
+ iova_mode = RTE_IOVA_PA;
+ rte_eal_get_configuration()->iova_mode = iova_mode;
+ } else {
+ rte_eal_get_configuration()->iova_mode =
+ internal_config.iova_mode;
+ }
+
+ RTE_LOG(INFO, EAL, "Selected IOVA mode '%s'\n",
+ rte_eal_iova_mode() == RTE_IOVA_PA ? "PA" : "VA");
+
+ if (internal_config.no_hugetlbfs == 0) {
+ /* rte_config isn't initialized yet */
+ ret = internal_config.process_type == RTE_PROC_PRIMARY ?
+ eal_hugepage_info_init() :
+ eal_hugepage_info_read();
+ if (ret < 0) {
+ rte_eal_init_alert("Cannot get hugepage information.");
+ rte_errno = EACCES;
+ rte_atomic32_clear(&run_once);
+ return -1;
+ }
+ }
+
+ if (internal_config.memory == 0 && internal_config.force_sockets == 0) {
+ if (internal_config.no_hugetlbfs)
+ internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE;
+ else
+ internal_config.memory = eal_get_hugepage_mem_size();
+ }
+
+ if (internal_config.vmware_tsc_map == 1) {
+#ifdef RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT
+ rte_cycles_vmware_tsc_map = 1;
+ RTE_LOG (DEBUG, EAL, "Using VMWARE TSC MAP, "
+ "you must have monitor_control.pseudo_perfctr = TRUE\n");
+#else
+ RTE_LOG (WARNING, EAL, "Ignoring --vmware-tsc-map because "
+ "RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT is not set\n");
+#endif
+ }
+
+ /* in secondary processes, memory init may allocate additional fbarrays
+ * not present in primary processes, so to avoid any potential issues,
+ * initialize memzones first.
+ */
+ if (rte_eal_memzone_init() < 0) {
+ rte_eal_init_alert("Cannot init memzone");
+ rte_errno = ENODEV;
+ return -1;
+ }
+
+ if (rte_eal_memory_init() < 0) {
+ rte_eal_init_alert("Cannot init memory");
+ rte_errno = ENOMEM;
+ return -1;
+ }
+
+ if (rte_eal_malloc_heap_init() < 0) {
+ rte_eal_init_alert("Cannot init malloc heap");
+ rte_errno = ENODEV;
+ return -1;
+ }
+
+ if (rte_eal_tailqs_init() < 0) {
+ rte_eal_init_alert("Cannot init tail queues for objects");
+ rte_errno = EFAULT;
+ return -1;
+ }
+
+ if (rte_eal_timer_init() < 0) {
+ rte_eal_init_alert("Cannot init HPET or TSC timers");
+ rte_errno = ENOTSUP;
+ return -1;
+ }
+
+ eal_check_mem_on_local_socket();
+
+ eal_thread_init_master(rte_config.master_lcore);
+
+ ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset));
+
+ RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%p;cpuset=[%s%s])\n",
+ rte_config.master_lcore, thread_id, cpuset,
+ ret == 0 ? "" : "...");
+
+ RTE_LCORE_FOREACH_SLAVE(i) {
+
+ /*
+ * create communication pipes between master thread
+ * and children
+ */
+ if (pipe(lcore_config[i].pipe_master2slave) < 0)
+ rte_panic("Cannot create pipe\n");
+ if (pipe(lcore_config[i].pipe_slave2master) < 0)
+ rte_panic("Cannot create pipe\n");
+
+ lcore_config[i].state = WAIT;
+
+ /* create a thread for each lcore */
+ ret = pthread_create(&lcore_config[i].thread_id, NULL,
+ eal_thread_loop, NULL);
+ if (ret != 0)
+ rte_panic("Cannot create thread\n");
+
+		/* Set thread_name to aid in debugging. */
+ snprintf(thread_name, sizeof(thread_name),
+ "lcore-slave-%d", i);
+ rte_thread_setname(lcore_config[i].thread_id, thread_name);
+ }
+
+ /*
+ * Launch a dummy function on all slave lcores, so that master lcore
+ * knows they are all ready when this function returns.
+ */
+ rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER);
+ rte_eal_mp_wait_lcore();
+
+ /* initialize services so vdevs register service during bus_probe. */
+ ret = rte_service_init();
+ if (ret) {
+ rte_eal_init_alert("rte_service_init() failed");
+ rte_errno = ENOEXEC;
+ return -1;
+ }
+
+ /* Probe all the buses and devices/drivers on them */
+ if (rte_bus_probe()) {
+ rte_eal_init_alert("Cannot probe devices");
+ rte_errno = ENOTSUP;
+ return -1;
+ }
+
+ /* initialize default service/lcore mappings and start running. Ignore
+ * -ENOTSUP, as it indicates no service coremask passed to EAL.
+ */
+ ret = rte_service_start_with_defaults();
+ if (ret < 0 && ret != -ENOTSUP) {
+ rte_errno = ENOEXEC;
+ return -1;
+ }
+
+ /*
+ * Clean up unused files in runtime directory. We do this at the end of
+ * init and not at the beginning because we want to clean stuff up
+ * whether we are primary or secondary process, but we cannot remove
+ * primary process' files because secondary should be able to run even
+ * if primary process is dead.
+ *
+ * In no_shconf mode, no runtime directory is created in the first
+ * place, so no cleanup needed.
+ */
+ if (!internal_config.no_shconf && eal_clean_runtime_dir() < 0) {
+		rte_eal_init_alert("Cannot clear runtime directory");
+ return -1;
+ }
+
+ eal_mcfg_complete();
+
+ /* Call each registered callback, if enabled */
+ rte_option_init();
+
+ return fctret;
+}
+
+int
+rte_eal_cleanup(void)
+{
+ rte_service_finalize();
+ rte_mp_channel_cleanup();
+ eal_cleanup_config(&internal_config);
+ return 0;
+}
+
+enum rte_proc_type_t
+rte_eal_process_type(void)
+{
+ return rte_config.process_type;
+}
+
+int rte_eal_has_pci(void)
+{
+ return !internal_config.no_pci;
+}
+
+int rte_eal_create_uio_dev(void)
+{
+ return internal_config.create_uio_dev;
+}
+
+enum rte_intr_mode
+rte_eal_vfio_intr_mode(void)
+{
+ return RTE_INTR_MODE_NONE;
+}
+
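+/* VFIO is Linux-only; the stubs below let code shared with Linux link
+ * against the same rte_vfio_* API on FreeBSD
+ */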
+int rte_vfio_setup_device(__rte_unused const char *sysfs_base,
+ __rte_unused const char *dev_addr,
+ __rte_unused int *vfio_dev_fd,
+ __rte_unused struct vfio_device_info *device_info)
+{
+ return -1;
+}
+
+int rte_vfio_release_device(__rte_unused const char *sysfs_base,
+ __rte_unused const char *dev_addr,
+ __rte_unused int fd)
+{
+ return -1;
+}
+
+int rte_vfio_enable(__rte_unused const char *modname)
+{
+ return -1;
+}
+
+int rte_vfio_is_enabled(__rte_unused const char *modname)
+{
+ return 0;
+}
+
+int rte_vfio_noiommu_is_enabled(void)
+{
+ return 0;
+}
+
+int rte_vfio_clear_group(__rte_unused int vfio_group_fd)
+{
+ return 0;
+}
+
+int
+rte_vfio_get_group_num(__rte_unused const char *sysfs_base,
+ __rte_unused const char *dev_addr,
+ __rte_unused int *iommu_group_num)
+{
+ return -1;
+}
+
+int
+rte_vfio_get_container_fd(void)
+{
+ return -1;
+}
+
+int
+rte_vfio_get_group_fd(__rte_unused int iommu_group_num)
+{
+ return -1;
+}
+
+int
+rte_vfio_container_create(void)
+{
+ return -1;
+}
+
+int
+rte_vfio_container_destroy(__rte_unused int container_fd)
+{
+ return -1;
+}
+
+int
+rte_vfio_container_group_bind(__rte_unused int container_fd,
+ __rte_unused int iommu_group_num)
+{
+ return -1;
+}
+
+int
+rte_vfio_container_group_unbind(__rte_unused int container_fd,
+ __rte_unused int iommu_group_num)
+{
+ return -1;
+}
+
+int
+rte_vfio_container_dma_map(__rte_unused int container_fd,
+ __rte_unused uint64_t vaddr,
+ __rte_unused uint64_t iova,
+ __rte_unused uint64_t len)
+{
+ return -1;
+}
+
+int
+rte_vfio_container_dma_unmap(__rte_unused int container_fd,
+ __rte_unused uint64_t vaddr,
+ __rte_unused uint64_t iova,
+ __rte_unused uint64_t len)
+{
+ return -1;
+}
+++ /dev/null
-# SPDX-License-Identifier: BSD-3-Clause
-# Copyright(c) 2010-2019 Intel Corporation
-
-include $(RTE_SDK)/mk/rte.vars.mk
-
-LIB = librte_eal.a
-
-ARCH_DIR ?= $(RTE_ARCH)
-VPATH += $(RTE_SDK)/lib/librte_eal/common
-VPATH += $(RTE_SDK)/lib/librte_eal/$(ARCH_DIR)
-
-CFLAGS += -DALLOW_EXPERIMENTAL_API
-CFLAGS += -I$(SRCDIR)/include
-CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common
-CFLAGS += -I$(RTE_SDK)/lib/librte_eal/include
-CFLAGS += $(WERROR_FLAGS) -O3
-
-LDLIBS += -lexecinfo
-LDLIBS += -lpthread
-LDLIBS += -lgcc_s
-LDLIBS += -lrte_kvargs
-
-EXPORT_MAP := ../../rte_eal_version.map
-
-# specific to freebsd exec-env
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) := eal.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_cpuflags.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_memory.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_hugepage_info.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_thread.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_debug.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_memalloc.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_lcore.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_timer.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_interrupts.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_alarm.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_dev.c
-
-# from common dir
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_lcore.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_timer.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_memzone.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_log.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_launch.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_mcfg.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_memalloc.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_memory.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_tailqs.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_errno.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_cpuflags.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_hypervisor.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_string_fns.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_hexdump.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_devargs.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_class.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_bus.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_dev.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_options.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_thread.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_proc.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_fbarray.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += eal_common_uuid.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_malloc.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += hotplug_mp.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += malloc_elem.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += malloc_heap.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += malloc_mp.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_keepalive.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_option.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_service.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_random.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_reciprocal.c
-
-# from arch dir
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_cpuflags.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_FREEBSD) += rte_hypervisor.c
-SRCS-$(CONFIG_RTE_ARCH_X86) += rte_spinlock.c
-SRCS-y += rte_cycles.c
-
-CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST)
-
-# workaround for a gcc bug with noreturn attribute
-# http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603
-ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y)
-CFLAGS_eal_thread.o += -Wno-return-type
-CFLAGS_eal_hpet.o += -Wno-return-type
-endif
-
-INC := rte_os.h
-
-SYMLINK-$(CONFIG_RTE_EXEC_ENV_FREEBSD)-include := $(addprefix include/,$(INC))
-
-include $(RTE_SDK)/mk/rte.lib.mk
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2018 Intel Corporation.
- * Copyright(c) 2014 6WIND S.A.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <string.h>
-#include <stdarg.h>
-#include <unistd.h>
-#include <pthread.h>
-#include <syslog.h>
-#include <getopt.h>
-#include <sys/file.h>
-#include <stddef.h>
-#include <errno.h>
-#include <limits.h>
-#include <sys/mman.h>
-#include <sys/queue.h>
-#include <sys/stat.h>
-
-#include <rte_compat.h>
-#include <rte_common.h>
-#include <rte_debug.h>
-#include <rte_memory.h>
-#include <rte_launch.h>
-#include <rte_eal.h>
-#include <rte_errno.h>
-#include <rte_per_lcore.h>
-#include <rte_lcore.h>
-#include <rte_service_component.h>
-#include <rte_log.h>
-#include <rte_random.h>
-#include <rte_cycles.h>
-#include <rte_string_fns.h>
-#include <rte_cpuflags.h>
-#include <rte_interrupts.h>
-#include <rte_bus.h>
-#include <rte_dev.h>
-#include <rte_devargs.h>
-#include <rte_version.h>
-#include <rte_vfio.h>
-#include <rte_option.h>
-#include <rte_atomic.h>
-#include <malloc_heap.h>
-
-#include "eal_private.h"
-#include "eal_thread.h"
-#include "eal_internal_cfg.h"
-#include "eal_filesystem.h"
-#include "eal_hugepages.h"
-#include "eal_options.h"
-#include "eal_memcfg.h"
-
-#define MEMSIZE_IF_NO_HUGE_PAGE (64ULL * 1024ULL * 1024ULL)
-
-/* Allow the application to print its usage message too if set */
-static rte_usage_hook_t rte_application_usage_hook = NULL;
-/* early configuration structure, when memory config is not mmapped */
-static struct rte_mem_config early_mem_config;
-
-/* define fd variable here, because file needs to be kept open for the
- * duration of the program, as we hold a write lock on it in the primary proc */
-static int mem_cfg_fd = -1;
-
-static struct flock wr_lock = {
- .l_type = F_WRLCK,
- .l_whence = SEEK_SET,
- .l_start = offsetof(struct rte_mem_config, memsegs),
- .l_len = sizeof(early_mem_config.memsegs),
-};
-
-/* Address of global and public configuration */
-static struct rte_config rte_config = {
- .mem_config = &early_mem_config,
-};
-
-/* internal configuration (per-core) */
-struct lcore_config lcore_config[RTE_MAX_LCORE];
-
-/* internal configuration */
-struct internal_config internal_config;
-
-/* used by rte_rdtsc() */
-int rte_cycles_vmware_tsc_map;
-
-/* platform-specific runtime dir */
-static char runtime_dir[PATH_MAX];
-
-static const char *default_runtime_dir = "/var/run";
-
-int
-eal_create_runtime_dir(void)
-{
- const char *directory = default_runtime_dir;
- const char *xdg_runtime_dir = getenv("XDG_RUNTIME_DIR");
- const char *fallback = "/tmp";
- char tmp[PATH_MAX];
- int ret;
-
- if (getuid() != 0) {
- /* try XDG path first, fall back to /tmp */
- if (xdg_runtime_dir != NULL)
- directory = xdg_runtime_dir;
- else
- directory = fallback;
- }
- /* create DPDK subdirectory under runtime dir */
- ret = snprintf(tmp, sizeof(tmp), "%s/dpdk", directory);
- if (ret < 0 || ret == sizeof(tmp)) {
- RTE_LOG(ERR, EAL, "Error creating DPDK runtime path name\n");
- return -1;
- }
-
- /* create prefix-specific subdirectory under DPDK runtime dir */
- ret = snprintf(runtime_dir, sizeof(runtime_dir), "%s/%s",
- tmp, eal_get_hugefile_prefix());
- if (ret < 0 || ret == sizeof(runtime_dir)) {
- RTE_LOG(ERR, EAL, "Error creating prefix-specific runtime path name\n");
- return -1;
- }
-
- /* create the path if it doesn't exist. no "mkdir -p" here, so do it
- * step by step.
- */
- ret = mkdir(tmp, 0700);
- if (ret < 0 && errno != EEXIST) {
- RTE_LOG(ERR, EAL, "Error creating '%s': %s\n",
- tmp, strerror(errno));
- return -1;
- }
-
- ret = mkdir(runtime_dir, 0700);
- if (ret < 0 && errno != EEXIST) {
- RTE_LOG(ERR, EAL, "Error creating '%s': %s\n",
- runtime_dir, strerror(errno));
- return -1;
- }
-
- return 0;
-}
-
-int
-eal_clean_runtime_dir(void)
-{
- /* FreeBSD doesn't need this implemented for now, because, unlike Linux,
- * FreeBSD doesn't create per-process files, so no need to clean up.
- */
- return 0;
-}
-
-
-const char *
-rte_eal_get_runtime_dir(void)
-{
- return runtime_dir;
-}
-
-/* Return user provided mbuf pool ops name */
-const char *
-rte_eal_mbuf_user_pool_ops(void)
-{
- return internal_config.user_mbuf_pool_ops_name;
-}
-
-/* Return a pointer to the configuration structure */
-struct rte_config *
-rte_eal_get_configuration(void)
-{
- return &rte_config;
-}
-
-enum rte_iova_mode
-rte_eal_iova_mode(void)
-{
- return rte_eal_get_configuration()->iova_mode;
-}
-
-/* parse a sysfs (or other) file containing one integer value */
-int
-eal_parse_sysfs_value(const char *filename, unsigned long *val)
-{
- FILE *f;
- char buf[BUFSIZ];
- char *end = NULL;
-
- if ((f = fopen(filename, "r")) == NULL) {
- RTE_LOG(ERR, EAL, "%s(): cannot open sysfs value %s\n",
- __func__, filename);
- return -1;
- }
-
- if (fgets(buf, sizeof(buf), f) == NULL) {
- RTE_LOG(ERR, EAL, "%s(): cannot read sysfs value %s\n",
- __func__, filename);
- fclose(f);
- return -1;
- }
- *val = strtoul(buf, &end, 0);
- if ((buf[0] == '\0') || (end == NULL) || (*end != '\n')) {
- RTE_LOG(ERR, EAL, "%s(): cannot parse sysfs value %s\n",
- __func__, filename);
- fclose(f);
- return -1;
- }
- fclose(f);
- return 0;
-}
-
-
-/* create memory configuration in shared/mmap memory. Take out
- * a write lock on the memsegs, so we can auto-detect primary/secondary.
- * This means we never close the file while running (auto-close on exit).
- * We also don't lock the whole file, so that in future we can use read-locks
- * on other parts, e.g. memzones, to detect if there are running secondary
- * processes. */
-static int
-rte_eal_config_create(void)
-{
- size_t page_sz = sysconf(_SC_PAGE_SIZE);
- size_t cfg_len = sizeof(*rte_config.mem_config);
- size_t cfg_len_aligned = RTE_ALIGN(cfg_len, page_sz);
- void *rte_mem_cfg_addr, *mapped_mem_cfg_addr;
- int retval;
-
- const char *pathname = eal_runtime_config_path();
-
- if (internal_config.no_shconf)
- return 0;
-
- /* map the config before base address so that we don't waste a page */
- if (internal_config.base_virtaddr != 0)
- rte_mem_cfg_addr = (void *)
- RTE_ALIGN_FLOOR(internal_config.base_virtaddr -
- sizeof(struct rte_mem_config), page_sz);
- else
- rte_mem_cfg_addr = NULL;
-
- if (mem_cfg_fd < 0){
- mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0600);
- if (mem_cfg_fd < 0) {
- RTE_LOG(ERR, EAL, "Cannot open '%s' for rte_mem_config\n",
- pathname);
- return -1;
- }
- }
-
- retval = ftruncate(mem_cfg_fd, cfg_len);
- if (retval < 0){
- close(mem_cfg_fd);
- mem_cfg_fd = -1;
- RTE_LOG(ERR, EAL, "Cannot resize '%s' for rte_mem_config\n",
- pathname);
- return -1;
- }
-
- retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock);
- if (retval < 0){
- close(mem_cfg_fd);
- mem_cfg_fd = -1;
- RTE_LOG(ERR, EAL, "Cannot create lock on '%s'. Is another primary "
- "process running?\n", pathname);
- return -1;
- }
-
- /* reserve space for config */
- rte_mem_cfg_addr = eal_get_virtual_area(rte_mem_cfg_addr,
- &cfg_len_aligned, page_sz, 0, 0);
- if (rte_mem_cfg_addr == NULL) {
- RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config\n");
- close(mem_cfg_fd);
- mem_cfg_fd = -1;
- return -1;
- }
-
- /* remap the actual file into the space we've just reserved */
- mapped_mem_cfg_addr = mmap(rte_mem_cfg_addr,
- cfg_len_aligned, PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_FIXED, mem_cfg_fd, 0);
- if (mapped_mem_cfg_addr == MAP_FAILED) {
- RTE_LOG(ERR, EAL, "Cannot remap memory for rte_config\n");
- munmap(rte_mem_cfg_addr, cfg_len);
- close(mem_cfg_fd);
- mem_cfg_fd = -1;
- return -1;
- }
-
- memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config));
- rte_config.mem_config = rte_mem_cfg_addr;
-
- /* store address of the config in the config itself so that secondary
- * processes could later map the config into this exact location
- */
- rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr;
-
- return 0;
-}
-
-/* attach to an existing shared memory config */
-static int
-rte_eal_config_attach(void)
-{
- void *rte_mem_cfg_addr;
- const char *pathname = eal_runtime_config_path();
-
- if (internal_config.no_shconf)
- return 0;
-
- if (mem_cfg_fd < 0){
- mem_cfg_fd = open(pathname, O_RDWR);
- if (mem_cfg_fd < 0) {
- RTE_LOG(ERR, EAL, "Cannot open '%s' for rte_mem_config\n",
- pathname);
- return -1;
- }
- }
-
- rte_mem_cfg_addr = mmap(NULL, sizeof(*rte_config.mem_config),
- PROT_READ, MAP_SHARED, mem_cfg_fd, 0);
- /* don't close the fd here, it will be closed on reattach */
- if (rte_mem_cfg_addr == MAP_FAILED) {
- close(mem_cfg_fd);
- mem_cfg_fd = -1;
- RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config! error %i (%s)\n",
- errno, strerror(errno));
- return -1;
- }
-
- rte_config.mem_config = rte_mem_cfg_addr;
-
- return 0;
-}
-
-/* reattach the shared config at exact memory location primary process has it */
-static int
-rte_eal_config_reattach(void)
-{
- struct rte_mem_config *mem_config;
- void *rte_mem_cfg_addr;
-
- if (internal_config.no_shconf)
- return 0;
-
- /* save the address primary process has mapped shared config to */
- rte_mem_cfg_addr =
- (void *)(uintptr_t)rte_config.mem_config->mem_cfg_addr;
-
- /* unmap original config */
- munmap(rte_config.mem_config, sizeof(struct rte_mem_config));
-
- /* remap the config at proper address */
- mem_config = (struct rte_mem_config *) mmap(rte_mem_cfg_addr,
- sizeof(*mem_config), PROT_READ | PROT_WRITE, MAP_SHARED,
- mem_cfg_fd, 0);
- close(mem_cfg_fd);
- mem_cfg_fd = -1;
-
- if (mem_config == MAP_FAILED || mem_config != rte_mem_cfg_addr) {
- if (mem_config != MAP_FAILED) {
- /* errno is stale, don't use */
- RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config at [%p], got [%p]"
- " - please use '--" OPT_BASE_VIRTADDR
- "' option\n",
- rte_mem_cfg_addr, mem_config);
- munmap(mem_config, sizeof(struct rte_mem_config));
- return -1;
- }
- RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config! error %i (%s)\n",
- errno, strerror(errno));
- return -1;
- }
-
- rte_config.mem_config = mem_config;
-
- return 0;
-}
-
-/* Detect if we are a primary or a secondary process */
-enum rte_proc_type_t
-eal_proc_type_detect(void)
-{
- enum rte_proc_type_t ptype = RTE_PROC_PRIMARY;
- const char *pathname = eal_runtime_config_path();
-
- /* if there no shared config, there can be no secondary processes */
- if (!internal_config.no_shconf) {
- /* if we can open the file but not get a write-lock we are a
- * secondary process. NOTE: if we get a file handle back, we
- * keep that open and don't close it to prevent a race condition
- * between multiple opens.
- */
- if (((mem_cfg_fd = open(pathname, O_RDWR)) >= 0) &&
- (fcntl(mem_cfg_fd, F_SETLK, &wr_lock) < 0))
- ptype = RTE_PROC_SECONDARY;
- }
-
- RTE_LOG(INFO, EAL, "Auto-detected process type: %s\n",
- ptype == RTE_PROC_PRIMARY ? "PRIMARY" : "SECONDARY");
-
- return ptype;
-}
-
-/* Sets up rte_config structure with the pointer to shared memory config.*/
-static int
-rte_config_init(void)
-{
- rte_config.process_type = internal_config.process_type;
-
- switch (rte_config.process_type){
- case RTE_PROC_PRIMARY:
- if (rte_eal_config_create() < 0)
- return -1;
- eal_mcfg_update_from_internal();
- break;
- case RTE_PROC_SECONDARY:
- if (rte_eal_config_attach() < 0)
- return -1;
- eal_mcfg_wait_complete();
- if (eal_mcfg_check_version() < 0) {
- RTE_LOG(ERR, EAL, "Primary and secondary process DPDK version mismatch\n");
- return -1;
- }
- if (rte_eal_config_reattach() < 0)
- return -1;
- eal_mcfg_update_internal();
- break;
- case RTE_PROC_AUTO:
- case RTE_PROC_INVALID:
- RTE_LOG(ERR, EAL, "Invalid process type %d\n",
- rte_config.process_type);
- return -1;
- }
-
- return 0;
-}
-
-/* display usage */
-static void
-eal_usage(const char *prgname)
-{
- printf("\nUsage: %s ", prgname);
- eal_common_usage();
- /* Allow the application to print its usage message too if hook is set */
- if ( rte_application_usage_hook ) {
- printf("===== Application Usage =====\n\n");
- rte_application_usage_hook(prgname);
- }
-}
-
-/* Set a per-application usage message */
-rte_usage_hook_t
-rte_set_application_usage_hook( rte_usage_hook_t usage_func )
-{
- rte_usage_hook_t old_func;
-
- /* Will be NULL on the first call to denote the last usage routine. */
- old_func = rte_application_usage_hook;
- rte_application_usage_hook = usage_func;
-
- return old_func;
-}
-
-static inline size_t
-eal_get_hugepage_mem_size(void)
-{
- uint64_t size = 0;
- unsigned i, j;
-
- for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
- struct hugepage_info *hpi = &internal_config.hugepage_info[i];
- if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0) {
- for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
- size += hpi->hugepage_sz * hpi->num_pages[j];
- }
- }
- }
-
- return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX;
-}
-
-/* Parse the arguments for --log-level only */
-static void
-eal_log_level_parse(int argc, char **argv)
-{
- int opt;
- char **argvopt;
- int option_index;
- const int old_optind = optind;
- const int old_optopt = optopt;
- const int old_optreset = optreset;
- char * const old_optarg = optarg;
-
- argvopt = argv;
- optind = 1;
- optreset = 1;
-
- while ((opt = getopt_long(argc, argvopt, eal_short_options,
- eal_long_options, &option_index)) != EOF) {
-
- int ret;
-
- /* getopt is not happy, stop right now */
- if (opt == '?')
- break;
-
- ret = (opt == OPT_LOG_LEVEL_NUM) ?
- eal_parse_common_option(opt, optarg, &internal_config) : 0;
-
- /* common parser is not happy */
- if (ret < 0)
- break;
- }
-
- /* restore getopt lib */
- optind = old_optind;
- optopt = old_optopt;
- optreset = old_optreset;
- optarg = old_optarg;
-}
-
-/* Parse the argument given in the command line of the application */
-static int
-eal_parse_args(int argc, char **argv)
-{
- int opt, ret;
- char **argvopt;
- int option_index;
- char *prgname = argv[0];
- const int old_optind = optind;
- const int old_optopt = optopt;
- const int old_optreset = optreset;
- char * const old_optarg = optarg;
-
- argvopt = argv;
- optind = 1;
- optreset = 1;
- opterr = 0;
-
- while ((opt = getopt_long(argc, argvopt, eal_short_options,
- eal_long_options, &option_index)) != EOF) {
-
- /*
- * getopt didn't recognise the option, lets parse the
- * registered options to see if the flag is valid
- */
- if (opt == '?') {
- ret = rte_option_parse(argv[optind-1]);
- if (ret == 0)
- continue;
-
- eal_usage(prgname);
- ret = -1;
- goto out;
- }
-
- ret = eal_parse_common_option(opt, optarg, &internal_config);
- /* common parser is not happy */
- if (ret < 0) {
- eal_usage(prgname);
- ret = -1;
- goto out;
- }
- /* common parser handled this option */
- if (ret == 0)
- continue;
-
- switch (opt) {
- case OPT_MBUF_POOL_OPS_NAME_NUM:
- {
- char *ops_name = strdup(optarg);
- if (ops_name == NULL)
- RTE_LOG(ERR, EAL, "Could not store mbuf pool ops name\n");
- else {
- /* free old ops name */
- if (internal_config.user_mbuf_pool_ops_name !=
- NULL)
- free(internal_config.user_mbuf_pool_ops_name);
-
- internal_config.user_mbuf_pool_ops_name =
- ops_name;
- }
- break;
- }
- case 'h':
- eal_usage(prgname);
- exit(EXIT_SUCCESS);
- default:
- if (opt < OPT_LONG_MIN_NUM && isprint(opt)) {
- RTE_LOG(ERR, EAL, "Option %c is not supported "
- "on FreeBSD\n", opt);
- } else if (opt >= OPT_LONG_MIN_NUM &&
- opt < OPT_LONG_MAX_NUM) {
- RTE_LOG(ERR, EAL, "Option %s is not supported "
- "on FreeBSD\n",
- eal_long_options[option_index].name);
- } else {
- RTE_LOG(ERR, EAL, "Option %d is not supported "
- "on FreeBSD\n", opt);
- }
- eal_usage(prgname);
- ret = -1;
- goto out;
- }
- }
-
- /* create runtime data directory */
- if (internal_config.no_shconf == 0 &&
- eal_create_runtime_dir() < 0) {
- RTE_LOG(ERR, EAL, "Cannot create runtime directory\n");
- ret = -1;
- goto out;
- }
-
- if (eal_adjust_config(&internal_config) != 0) {
- ret = -1;
- goto out;
- }
-
- /* sanity checks */
- if (eal_check_common_options(&internal_config) != 0) {
- eal_usage(prgname);
- ret = -1;
- goto out;
- }
-
- if (optind >= 0)
- argv[optind-1] = prgname;
- ret = optind-1;
-
-out:
- /* restore getopt lib */
- optind = old_optind;
- optopt = old_optopt;
- optreset = old_optreset;
- optarg = old_optarg;
-
- return ret;
-}
-
-static int
-check_socket(const struct rte_memseg_list *msl, void *arg)
-{
- int *socket_id = arg;
-
- if (msl->external)
- return 0;
-
- if (msl->socket_id == *socket_id && msl->memseg_arr.count != 0)
- return 1;
-
- return 0;
-}
-
-static void
-eal_check_mem_on_local_socket(void)
-{
- int socket_id;
-
- socket_id = rte_lcore_to_socket_id(rte_config.master_lcore);
-
- if (rte_memseg_list_walk(check_socket, &socket_id) == 0)
- RTE_LOG(WARNING, EAL, "WARNING: Master core has no memory on local socket!\n");
-}
-
-
-static int
-sync_func(__attribute__((unused)) void *arg)
-{
- return 0;
-}
-
-/* return non-zero if hugepages are enabled. */
-int rte_eal_has_hugepages(void)
-{
- return !internal_config.no_hugetlbfs;
-}
-
-/* Abstraction for port I/0 privilege */
-int
-rte_eal_iopl_init(void)
-{
- static int fd = -1;
-
- if (fd < 0)
- fd = open("/dev/io", O_RDWR);
-
- if (fd < 0)
- return -1;
- /* keep fd open for iopl */
- return 0;
-}
-
-static void rte_eal_init_alert(const char *msg)
-{
- fprintf(stderr, "EAL: FATAL: %s\n", msg);
- RTE_LOG(ERR, EAL, "%s\n", msg);
-}
-
-/* Launch threads, called at application init(). */
-int
-rte_eal_init(int argc, char **argv)
-{
- int i, fctret, ret;
- pthread_t thread_id;
- static rte_atomic32_t run_once = RTE_ATOMIC32_INIT(0);
- char cpuset[RTE_CPU_AFFINITY_STR_LEN];
- char thread_name[RTE_MAX_THREAD_NAME_LEN];
-
- /* checks if the machine is adequate */
- if (!rte_cpu_is_supported()) {
- rte_eal_init_alert("unsupported cpu type.");
- rte_errno = ENOTSUP;
- return -1;
- }
-
- if (!rte_atomic32_test_and_set(&run_once)) {
- rte_eal_init_alert("already called initialization.");
- rte_errno = EALREADY;
- return -1;
- }
-
- thread_id = pthread_self();
-
- eal_reset_internal_config(&internal_config);
-
- /* set log level as early as possible */
- eal_log_level_parse(argc, argv);
-
- if (rte_eal_cpu_init() < 0) {
- rte_eal_init_alert("Cannot detect lcores.");
- rte_errno = ENOTSUP;
- return -1;
- }
-
- fctret = eal_parse_args(argc, argv);
- if (fctret < 0) {
- rte_eal_init_alert("Invalid 'command line' arguments.");
- rte_errno = EINVAL;
- rte_atomic32_clear(&run_once);
- return -1;
- }
-
- /* FreeBSD always uses legacy memory model */
- internal_config.legacy_mem = true;
-
- if (eal_plugins_init() < 0) {
- rte_eal_init_alert("Cannot init plugins");
- rte_errno = EINVAL;
- rte_atomic32_clear(&run_once);
- return -1;
- }
-
- if (eal_option_device_parse()) {
- rte_errno = ENODEV;
- rte_atomic32_clear(&run_once);
- return -1;
- }
-
- if (rte_config_init() < 0) {
- rte_eal_init_alert("Cannot init config");
- return -1;
- }
-
- if (rte_eal_intr_init() < 0) {
- rte_eal_init_alert("Cannot init interrupt-handling thread");
- return -1;
- }
-
- if (rte_eal_alarm_init() < 0) {
- rte_eal_init_alert("Cannot init alarm");
- /* rte_eal_alarm_init sets rte_errno on failure. */
- return -1;
- }
-
- /* Put mp channel init before bus scan so that we can init the vdev
- * bus through mp channel in the secondary process before the bus scan.
- */
- if (rte_mp_channel_init() < 0 && rte_errno != ENOTSUP) {
- rte_eal_init_alert("failed to init mp channel");
- if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
- rte_errno = EFAULT;
- return -1;
- }
- }
-
- if (rte_bus_scan()) {
- rte_eal_init_alert("Cannot scan the buses for devices");
- rte_errno = ENODEV;
- rte_atomic32_clear(&run_once);
- return -1;
- }
-
- /* if no EAL option "--iova-mode=<pa|va>", use bus IOVA scheme */
- if (internal_config.iova_mode == RTE_IOVA_DC) {
- /* autodetect the IOVA mapping mode (default is RTE_IOVA_PA) */
- enum rte_iova_mode iova_mode = rte_bus_get_iommu_class();
-
- if (iova_mode == RTE_IOVA_DC)
- iova_mode = RTE_IOVA_PA;
- rte_eal_get_configuration()->iova_mode = iova_mode;
- } else {
- rte_eal_get_configuration()->iova_mode =
- internal_config.iova_mode;
- }
-
- RTE_LOG(INFO, EAL, "Selected IOVA mode '%s'\n",
- rte_eal_iova_mode() == RTE_IOVA_PA ? "PA" : "VA");
-
- if (internal_config.no_hugetlbfs == 0) {
- /* rte_config isn't initialized yet */
- ret = internal_config.process_type == RTE_PROC_PRIMARY ?
- eal_hugepage_info_init() :
- eal_hugepage_info_read();
- if (ret < 0) {
- rte_eal_init_alert("Cannot get hugepage information.");
- rte_errno = EACCES;
- rte_atomic32_clear(&run_once);
- return -1;
- }
- }
-
- if (internal_config.memory == 0 && internal_config.force_sockets == 0) {
- if (internal_config.no_hugetlbfs)
- internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE;
- else
- internal_config.memory = eal_get_hugepage_mem_size();
- }
-
- if (internal_config.vmware_tsc_map == 1) {
-#ifdef RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT
- rte_cycles_vmware_tsc_map = 1;
- RTE_LOG (DEBUG, EAL, "Using VMWARE TSC MAP, "
- "you must have monitor_control.pseudo_perfctr = TRUE\n");
-#else
- RTE_LOG (WARNING, EAL, "Ignoring --vmware-tsc-map because "
- "RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT is not set\n");
-#endif
- }
-
- /* in secondary processes, memory init may allocate additional fbarrays
- * not present in primary processes, so to avoid any potential issues,
- * initialize memzones first.
- */
- if (rte_eal_memzone_init() < 0) {
- rte_eal_init_alert("Cannot init memzone");
- rte_errno = ENODEV;
- return -1;
- }
-
- if (rte_eal_memory_init() < 0) {
- rte_eal_init_alert("Cannot init memory");
- rte_errno = ENOMEM;
- return -1;
- }
-
- if (rte_eal_malloc_heap_init() < 0) {
- rte_eal_init_alert("Cannot init malloc heap");
- rte_errno = ENODEV;
- return -1;
- }
-
- if (rte_eal_tailqs_init() < 0) {
- rte_eal_init_alert("Cannot init tail queues for objects");
- rte_errno = EFAULT;
- return -1;
- }
-
- if (rte_eal_timer_init() < 0) {
- rte_eal_init_alert("Cannot init HPET or TSC timers");
- rte_errno = ENOTSUP;
- return -1;
- }
-
- eal_check_mem_on_local_socket();
-
- eal_thread_init_master(rte_config.master_lcore);
-
- ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset));
-
- RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%p;cpuset=[%s%s])\n",
- rte_config.master_lcore, thread_id, cpuset,
- ret == 0 ? "" : "...");
-
- RTE_LCORE_FOREACH_SLAVE(i) {
-
- /*
- * create communication pipes between master thread
- * and children
- */
- if (pipe(lcore_config[i].pipe_master2slave) < 0)
- rte_panic("Cannot create pipe\n");
- if (pipe(lcore_config[i].pipe_slave2master) < 0)
- rte_panic("Cannot create pipe\n");
-
- lcore_config[i].state = WAIT;
-
- /* create a thread for each lcore */
- ret = pthread_create(&lcore_config[i].thread_id, NULL,
- eal_thread_loop, NULL);
- if (ret != 0)
- rte_panic("Cannot create thread\n");
-
- /* Set thread_name to aid in debugging. */
- snprintf(thread_name, sizeof(thread_name),
- "lcore-slave-%d", i);
- rte_thread_setname(lcore_config[i].thread_id, thread_name);
- }
-
- /*
- * Launch a dummy function on all slave lcores, so that master lcore
- * knows they are all ready when this function returns.
- */
- rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER);
- rte_eal_mp_wait_lcore();
-
- /* initialize services so vdevs register service during bus_probe. */
- ret = rte_service_init();
- if (ret) {
- rte_eal_init_alert("rte_service_init() failed");
- rte_errno = ENOEXEC;
- return -1;
- }
-
- /* Probe all the buses and devices/drivers on them */
- if (rte_bus_probe()) {
- rte_eal_init_alert("Cannot probe devices");
- rte_errno = ENOTSUP;
- return -1;
- }
-
- /* initialize default service/lcore mappings and start running. Ignore
- * -ENOTSUP, as it indicates no service coremask passed to EAL.
- */
- ret = rte_service_start_with_defaults();
- if (ret < 0 && ret != -ENOTSUP) {
- rte_errno = ENOEXEC;
- return -1;
- }
-
- /*
- * Clean up unused files in runtime directory. We do this at the end of
- * init and not at the beginning because we want to clean stuff up
- * whether we are primary or secondary process, but we cannot remove
- * primary process' files because secondary should be able to run even
- * if primary process is dead.
- *
- * In no_shconf mode, no runtime directory is created in the first
- * place, so no cleanup needed.
- */
- if (!internal_config.no_shconf && eal_clean_runtime_dir() < 0) {
- rte_eal_init_alert("Cannot clear runtime directory\n");
- return -1;
- }
-
- eal_mcfg_complete();
-
- /* Call each registered callback, if enabled */
- rte_option_init();
-
- return fctret;
-}
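
[For context, a minimal application init/teardown sequence against the API
above looks roughly as follows. This is a sketch, not part of the patch;
error handling beyond rte_exit() is omitted.]

#include <stdlib.h>
#include <rte_eal.h>
#include <rte_debug.h>

int
main(int argc, char **argv)
{
	/* rte_eal_init() returns the number of arguments it consumed */
	int ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Cannot init EAL\n");
	argc -= ret;
	argv += ret;

	/* ... application setup and lcore launches ... */

	rte_eal_cleanup();
	return 0;
}
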
-
-int
-rte_eal_cleanup(void)
-{
- rte_service_finalize();
- rte_mp_channel_cleanup();
- eal_cleanup_config(&internal_config);
- return 0;
-}
-
-enum rte_proc_type_t
-rte_eal_process_type(void)
-{
- return rte_config.process_type;
-}
-
-int rte_eal_has_pci(void)
-{
- return !internal_config.no_pci;
-}
-
-int rte_eal_create_uio_dev(void)
-{
- return internal_config.create_uio_dev;
-}
-
-enum rte_intr_mode
-rte_eal_vfio_intr_mode(void)
-{
- return RTE_INTR_MODE_NONE;
-}
-
-int rte_vfio_setup_device(__rte_unused const char *sysfs_base,
- __rte_unused const char *dev_addr,
- __rte_unused int *vfio_dev_fd,
- __rte_unused struct vfio_device_info *device_info)
-{
- return -1;
-}
-
-int rte_vfio_release_device(__rte_unused const char *sysfs_base,
- __rte_unused const char *dev_addr,
- __rte_unused int fd)
-{
- return -1;
-}
-
-int rte_vfio_enable(__rte_unused const char *modname)
-{
- return -1;
-}
-
-int rte_vfio_is_enabled(__rte_unused const char *modname)
-{
- return 0;
-}
-
-int rte_vfio_noiommu_is_enabled(void)
-{
- return 0;
-}
-
-int rte_vfio_clear_group(__rte_unused int vfio_group_fd)
-{
- return 0;
-}
-
-int
-rte_vfio_get_group_num(__rte_unused const char *sysfs_base,
- __rte_unused const char *dev_addr,
- __rte_unused int *iommu_group_num)
-{
- return -1;
-}
-
-int
-rte_vfio_get_container_fd(void)
-{
- return -1;
-}
-
-int
-rte_vfio_get_group_fd(__rte_unused int iommu_group_num)
-{
- return -1;
-}
-
-int
-rte_vfio_container_create(void)
-{
- return -1;
-}
-
-int
-rte_vfio_container_destroy(__rte_unused int container_fd)
-{
- return -1;
-}
-
-int
-rte_vfio_container_group_bind(__rte_unused int container_fd,
- __rte_unused int iommu_group_num)
-{
- return -1;
-}
-
-int
-rte_vfio_container_group_unbind(__rte_unused int container_fd,
- __rte_unused int iommu_group_num)
-{
- return -1;
-}
-
-int
-rte_vfio_container_dma_map(__rte_unused int container_fd,
- __rte_unused uint64_t vaddr,
- __rte_unused uint64_t iova,
- __rte_unused uint64_t len)
-{
- return -1;
-}
-
-int
-rte_vfio_container_dma_unmap(__rte_unused int container_fd,
- __rte_unused uint64_t vaddr,
- __rte_unused uint64_t iova,
- __rte_unused uint64_t len)
-{
- return -1;
-}
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2018 Intel Corporation
- */
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <time.h>
-#include <errno.h>
-
-#include <rte_alarm.h>
-#include <rte_cycles.h>
-#include <rte_common.h>
-#include <rte_errno.h>
-#include <rte_interrupts.h>
-#include <rte_spinlock.h>
-
-#include "eal_private.h"
-#include "eal_alarm_private.h"
-
-#define NS_PER_US 1000
-
-#ifdef CLOCK_MONOTONIC_RAW /* Defined in glibc bits/time.h */
-#define CLOCK_TYPE_ID CLOCK_MONOTONIC_RAW
-#else
-#define CLOCK_TYPE_ID CLOCK_MONOTONIC
-#endif
-
-struct alarm_entry {
- LIST_ENTRY(alarm_entry) next;
- struct rte_intr_handle handle;
- struct timespec time;
- rte_eal_alarm_callback cb_fn;
- void *cb_arg;
- volatile uint8_t executing;
- volatile pthread_t executing_id;
-};
-
-static LIST_HEAD(alarm_list, alarm_entry) alarm_list = LIST_HEAD_INITIALIZER();
-static rte_spinlock_t alarm_list_lk = RTE_SPINLOCK_INITIALIZER;
-
-static struct rte_intr_handle intr_handle = {.fd = -1 };
-static void eal_alarm_callback(void *arg);
-
-int
-rte_eal_alarm_init(void)
-{
- intr_handle.type = RTE_INTR_HANDLE_ALARM;
-
- /* on FreeBSD, timers don't use fds, and their identifiers are stored
- * in a separate namespace from fds, so using any value is OK. However,
- * the EAL interrupt handler expects fds to be unique, so use an actual
- * fd to guarantee a unique timer identifier.
- */
- intr_handle.fd = open("/dev/zero", O_RDONLY);
-
- return 0;
-}
-
-static inline int
-timespec_cmp(const struct timespec *now, const struct timespec *at)
-{
- if (now->tv_sec < at->tv_sec)
- return -1;
- if (now->tv_sec > at->tv_sec)
- return 1;
- if (now->tv_nsec < at->tv_nsec)
- return -1;
- if (now->tv_nsec > at->tv_nsec)
- return 1;
- return 0;
-}
-
-static inline uint64_t
-diff_ns(struct timespec *now, struct timespec *at)
-{
- uint64_t now_ns, at_ns;
-
- if (timespec_cmp(now, at) >= 0)
- return 0;
-
- now_ns = now->tv_sec * NS_PER_S + now->tv_nsec;
- at_ns = at->tv_sec * NS_PER_S + at->tv_nsec;
-
- return at_ns - now_ns;
-}
-
-int
-eal_alarm_get_timeout_ns(uint64_t *val)
-{
- struct alarm_entry *ap;
- struct timespec now;
-
- if (clock_gettime(CLOCK_TYPE_ID, &now) < 0)
- return -1;
-
- if (LIST_EMPTY(&alarm_list))
- return -1;
-
- ap = LIST_FIRST(&alarm_list);
-
- *val = diff_ns(&now, &ap->time);
-
- return 0;
-}
-
-static int
-unregister_current_callback(void)
-{
- struct alarm_entry *ap;
- int ret = 0;
-
- if (!LIST_EMPTY(&alarm_list)) {
- ap = LIST_FIRST(&alarm_list);
-
- do {
- ret = rte_intr_callback_unregister(&intr_handle,
- eal_alarm_callback, &ap->time);
- } while (ret == -EAGAIN);
- }
-
- return ret;
-}
-
-static int
-register_first_callback(void)
-{
- struct alarm_entry *ap;
- int ret = 0;
-
- if (!LIST_EMPTY(&alarm_list)) {
- ap = LIST_FIRST(&alarm_list);
-
- /* register a new callback */
- ret = rte_intr_callback_register(&intr_handle,
- eal_alarm_callback, &ap->time);
- }
- return ret;
-}
-
-static void
-eal_alarm_callback(void *arg __rte_unused)
-{
- struct timespec now;
- struct alarm_entry *ap;
-
- rte_spinlock_lock(&alarm_list_lk);
- ap = LIST_FIRST(&alarm_list);
-
- if (clock_gettime(CLOCK_TYPE_ID, &now) < 0) {
- rte_spinlock_unlock(&alarm_list_lk);
- return;
- }
-
- while (ap != NULL && timespec_cmp(&now, &ap->time) >= 0) {
- ap->executing = 1;
- ap->executing_id = pthread_self();
- rte_spinlock_unlock(&alarm_list_lk);
-
- ap->cb_fn(ap->cb_arg);
-
- rte_spinlock_lock(&alarm_list_lk);
-
- LIST_REMOVE(ap, next);
- free(ap);
-
- ap = LIST_FIRST(&alarm_list);
- }
-
- /* timer has been deleted from the kqueue, so recreate it if needed */
- register_first_callback();
-
- rte_spinlock_unlock(&alarm_list_lk);
-}
-
-
-int
-rte_eal_alarm_set(uint64_t us, rte_eal_alarm_callback cb_fn, void *cb_arg)
-{
- struct alarm_entry *ap, *new_alarm;
- struct timespec now;
- uint64_t ns;
- int ret = 0;
-
- /* check parameters, also ensure us won't cause a uint64_t overflow */
- if (us < 1 || us > (UINT64_MAX - US_PER_S) || cb_fn == NULL)
- return -EINVAL;
-
- new_alarm = calloc(1, sizeof(*new_alarm));
- if (new_alarm == NULL)
- return -ENOMEM;
-
- /* use current time to calculate absolute time of alarm */
- clock_gettime(CLOCK_TYPE_ID, &now);
-
- ns = us * NS_PER_US;
-
- new_alarm->cb_fn = cb_fn;
- new_alarm->cb_arg = cb_arg;
- new_alarm->time.tv_nsec = (now.tv_nsec + ns) % NS_PER_S;
- new_alarm->time.tv_sec = now.tv_sec + ((now.tv_nsec + ns) / NS_PER_S);
-
- rte_spinlock_lock(&alarm_list_lk);
-
- if (LIST_EMPTY(&alarm_list))
- LIST_INSERT_HEAD(&alarm_list, new_alarm, next);
- else {
- LIST_FOREACH(ap, &alarm_list, next) {
- if (timespec_cmp(&new_alarm->time, &ap->time) < 0) {
- LIST_INSERT_BEFORE(ap, new_alarm, next);
- break;
- }
- if (LIST_NEXT(ap, next) == NULL) {
- LIST_INSERT_AFTER(ap, new_alarm, next);
- break;
- }
- }
- }
-
- /* re-register first callback just in case */
- register_first_callback();
-
- rte_spinlock_unlock(&alarm_list_lk);
-
- return ret;
-}
-
-int
-rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn, void *cb_arg)
-{
- struct alarm_entry *ap, *ap_prev;
- int count = 0;
- int err = 0;
- int executing;
-
- if (!cb_fn) {
- rte_errno = EINVAL;
- return -1;
- }
-
- do {
- executing = 0;
- rte_spinlock_lock(&alarm_list_lk);
- /* remove any matches at the start of the list */
- while (1) {
- ap = LIST_FIRST(&alarm_list);
- if (ap == NULL)
- break;
- if (cb_fn != ap->cb_fn)
- break;
- if (cb_arg != ap->cb_arg && cb_arg != (void *) -1)
- break;
- if (ap->executing == 0) {
- LIST_REMOVE(ap, next);
- free(ap);
- count++;
- } else {
- /* If called from another context, mark the
- * alarm as executing so the loop can spin until
- * it finishes. Otherwise we are trying to cancel
- * ourselves - mark it with EINPROGRESS.
- */
- if (pthread_equal(ap->executing_id,
- pthread_self()) == 0)
- executing++;
- else
- err = EINPROGRESS;
-
- break;
- }
- }
- ap_prev = ap;
-
- /* now go through list, removing entries not at start */
- LIST_FOREACH(ap, &alarm_list, next) {
- /* this won't be true first time through */
- if (cb_fn == ap->cb_fn &&
- (cb_arg == (void *)-1 ||
- cb_arg == ap->cb_arg)) {
- if (ap->executing == 0) {
- LIST_REMOVE(ap, next);
- free(ap);
- count++;
- ap = ap_prev;
- } else if (pthread_equal(ap->executing_id,
- pthread_self()) == 0) {
- executing++;
- } else {
- err = EINPROGRESS;
- }
- }
- ap_prev = ap;
- }
- rte_spinlock_unlock(&alarm_list_lk);
- } while (executing != 0);
-
- if (count == 0 && err == 0)
- rte_errno = ENOENT;
- else if (err)
- rte_errno = err;
-
- rte_spinlock_lock(&alarm_list_lk);
-
- /* unregister if no alarms left, otherwise re-register first */
- if (LIST_EMPTY(&alarm_list))
- unregister_current_callback();
- else
- register_first_callback();
-
- rte_spinlock_unlock(&alarm_list_lk);
-
- return count;
-}
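
[As a usage illustration for the two entry points above (a hedged sketch;
tick_cb and the 500 ms period are invented for the example): alarms are
single-shot, so periodic behaviour is obtained by re-arming from the
callback, and cancelling with cb_arg == (void *)-1 matches any argument.]

#include <stdio.h>
#include <rte_alarm.h>

#define TICK_US 500000 /* 500 ms */

static void
tick_cb(void *arg)
{
	unsigned int *count = arg;

	printf("tick %u\n", (*count)++);
	/* single-shot: re-arm to get periodic ticks */
	rte_eal_alarm_set(TICK_US, tick_cb, count);
}

/* arm:  rte_eal_alarm_set(TICK_US, tick_cb, &counter);
 * stop: rte_eal_alarm_cancel(tick_cb, (void *)-1);
 */
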
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2018 Intel Corporation
- */
-
-#ifndef EAL_ALARM_PRIVATE_H
-#define EAL_ALARM_PRIVATE_H
-
-#include <inttypes.h>
-
-/*
- * FreeBSD needs a back-channel communication mechanism between the interrupt
- * and alarm threads, because on FreeBSD the timer period is set up inside the
- * interrupt API and not inside the alarm API as on Linux.
- */
-
-int
-eal_alarm_get_timeout_ns(uint64_t *val);
-
-#endif /* EAL_ALARM_PRIVATE_H */
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright 2018 Mellanox Technologies, Ltd
- */
-
-#include <rte_common.h>
-#include <rte_cpuflags.h>
-
-unsigned long
-rte_cpu_getauxval(unsigned long type __rte_unused)
-{
- /* not implemented */
- return 0;
-}
-
-int
-rte_cpu_strcmp_auxval(unsigned long type __rte_unused,
- const char *str __rte_unused)
-{
- /* not implemented */
- return -1;
-}
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
- */
-
-#ifdef RTE_BACKTRACE
-#include <execinfo.h>
-#endif
-#include <stdarg.h>
-#include <signal.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdint.h>
-
-#include <rte_log.h>
-#include <rte_debug.h>
-#include <rte_common.h>
-#include <rte_eal.h>
-
-#define BACKTRACE_SIZE 256
-
-/* dump the stack of the calling core */
-void rte_dump_stack(void)
-{
-#ifdef RTE_BACKTRACE
- void *func[BACKTRACE_SIZE];
- char **symb = NULL;
- int size;
-
- size = backtrace(func, BACKTRACE_SIZE);
- symb = backtrace_symbols(func, size);
-
- if (symb == NULL)
- return;
-
- while (size > 0) {
- rte_log(RTE_LOG_ERR, RTE_LOGTYPE_EAL,
- "%d: [%s]\n", size, symb[size - 1]);
- size--;
- }
-
- free(symb);
-#endif /* RTE_BACKTRACE */
-}
-
-/* not implemented in this environment */
-void rte_dump_registers(void)
-{
- return;
-}
-
-/* call abort(), it will generate a coredump if enabled */
-void __rte_panic(const char *funcname, const char *format, ...)
-{
- va_list ap;
-
- rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, "PANIC in %s():\n", funcname);
- va_start(ap, format);
- rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap);
- va_end(ap);
- rte_dump_stack();
- rte_dump_registers();
- abort();
-}
-
-/*
- * Like rte_panic, this terminates the application. However, no traceback is
- * provided and no core dump is generated.
- */
-void
-rte_exit(int exit_code, const char *format, ...)
-{
- va_list ap;
-
- if (exit_code != 0)
- RTE_LOG(CRIT, EAL, "Error - exiting with code: %d\n"
- " Cause: ", exit_code);
-
- va_start(ap, format);
- rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap);
- va_end(ap);
-
-#ifndef RTE_EAL_ALWAYS_PANIC_ON_ERROR
- if (rte_eal_cleanup() != 0)
- RTE_LOG(CRIT, EAL,
- "EAL could not release all resources\n");
- exit(exit_code);
-#else
- rte_dump_stack();
- rte_dump_registers();
- abort();
-#endif
-}
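
[To illustrate the intended division of labour between the two calls above
(a sketch; load_config is a hypothetical helper): rte_exit() is for operator
errors where a clean teardown is wanted, while rte_panic() is reserved for
broken invariants where a core dump is useful.]

#include <stdio.h>
#include <stdlib.h>
#include <rte_debug.h>

static void
load_config(const char *path)
{
	FILE *f = fopen(path, "r");

	if (f == NULL)
		/* operator error: log the cause, clean up EAL, exit */
		rte_exit(EXIT_FAILURE, "cannot open config %s\n", path);
	/* ... parse ... */
	fclose(f);
}
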
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2018 Intel Corporation
- */
-
-#include <rte_log.h>
-#include <rte_compat.h>
-#include <rte_dev.h>
-
-int
-rte_dev_event_monitor_start(void)
-{
- RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n");
- return -1;
-}
-
-int
-rte_dev_event_monitor_stop(void)
-{
- RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n");
- return -1;
-}
-
-int
-rte_dev_hotplug_handle_enable(void)
-{
- RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n");
- return -1;
-}
-
-int
-rte_dev_hotplug_handle_disable(void)
-{
- RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n");
- return -1;
-}
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
- */
-#include <sys/types.h>
-#include <sys/sysctl.h>
-#include <sys/mman.h>
-#include <string.h>
-
-#include <rte_log.h>
-#include <fcntl.h>
-#include "eal_hugepages.h"
-#include "eal_internal_cfg.h"
-#include "eal_filesystem.h"
-
-#define CONTIGMEM_DEV "/dev/contigmem"
-
-/*
- * Uses mmap to create a shared memory area for storage of data.
- * Used in this file to store the hugepage file map on disk.
- */
-static void *
-map_shared_memory(const char *filename, const size_t mem_size, int flags)
-{
- void *retval;
- int fd = open(filename, flags, 0600);
- if (fd < 0)
- return NULL;
- if (ftruncate(fd, mem_size) < 0) {
- close(fd);
- return NULL;
- }
- retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
- close(fd);
- return retval;
-}
-
-static void *
-open_shared_memory(const char *filename, const size_t mem_size)
-{
- return map_shared_memory(filename, mem_size, O_RDWR);
-}
-
-static void *
-create_shared_memory(const char *filename, const size_t mem_size)
-{
- return map_shared_memory(filename, mem_size, O_RDWR | O_CREAT);
-}
-
-/*
- * No hugepage support on FreeBSD, so we emulate it using the contigmem driver.
- */
-int
-eal_hugepage_info_init(void)
-{
- size_t sysctl_size;
- int num_buffers, fd, error;
- int64_t buffer_size;
- /* re-use the linux "internal config" structure for our memory data */
- struct hugepage_info *hpi = &internal_config.hugepage_info[0];
- struct hugepage_info *tmp_hpi;
- unsigned int i;
-
- internal_config.num_hugepage_sizes = 1;
-
- sysctl_size = sizeof(num_buffers);
- error = sysctlbyname("hw.contigmem.num_buffers", &num_buffers,
- &sysctl_size, NULL, 0);
-
- if (error != 0) {
- RTE_LOG(ERR, EAL, "could not read sysctl hw.contigmem.num_buffers\n");
- return -1;
- }
-
- sysctl_size = sizeof(buffer_size);
- error = sysctlbyname("hw.contigmem.buffer_size", &buffer_size,
- &sysctl_size, NULL, 0);
-
- if (error != 0) {
- RTE_LOG(ERR, EAL, "could not read sysctl hw.contigmem.buffer_size\n");
- return -1;
- }
-
- fd = open(CONTIGMEM_DEV, O_RDWR);
- if (fd < 0) {
- RTE_LOG(ERR, EAL, "could not open "CONTIGMEM_DEV"\n");
- return -1;
- }
-
- if (buffer_size >= 1<<30)
- RTE_LOG(INFO, EAL, "Contigmem driver has %d buffers, each of size %dGB\n",
- num_buffers, (int)(buffer_size>>30));
- else if (buffer_size >= 1<<20)
- RTE_LOG(INFO, EAL, "Contigmem driver has %d buffers, each of size %dMB\n",
- num_buffers, (int)(buffer_size>>20));
- else
- RTE_LOG(INFO, EAL, "Contigmem driver has %d buffers, each of size %dKB\n",
- num_buffers, (int)(buffer_size>>10));
-
- strlcpy(hpi->hugedir, CONTIGMEM_DEV, sizeof(hpi->hugedir));
- hpi->hugepage_sz = buffer_size;
- hpi->num_pages[0] = num_buffers;
- hpi->lock_descriptor = fd;
-
- /* for no shared files mode, do not create shared memory config */
- if (internal_config.no_shconf)
- return 0;
-
- tmp_hpi = create_shared_memory(eal_hugepage_info_path(),
- sizeof(internal_config.hugepage_info));
- if (tmp_hpi == NULL) {
- RTE_LOG(ERR, EAL, "Failed to create shared memory!\n");
- return -1;
- }
-
- memcpy(tmp_hpi, hpi, sizeof(internal_config.hugepage_info));
-
- /* we've copied file descriptors along with everything else, but they
- * will be invalid in secondary process, so overwrite them
- */
- for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) {
- struct hugepage_info *tmp = &tmp_hpi[i];
- tmp->lock_descriptor = -1;
- }
-
- if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) {
- RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n");
- return -1;
- }
-
- return 0;
-}
-
-/* copy stuff from shared info into internal config */
-int
-eal_hugepage_info_read(void)
-{
- struct hugepage_info *hpi = &internal_config.hugepage_info[0];
- struct hugepage_info *tmp_hpi;
-
- internal_config.num_hugepage_sizes = 1;
-
- tmp_hpi = open_shared_memory(eal_hugepage_info_path(),
- sizeof(internal_config.hugepage_info));
- if (tmp_hpi == NULL) {
- RTE_LOG(ERR, EAL, "Failed to open shared memory!\n");
- return -1;
- }
-
- memcpy(hpi, tmp_hpi, sizeof(internal_config.hugepage_info));
-
- if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) {
- RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n");
- return -1;
- }
- return 0;
-}
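
[The same two sysctls read above can be inspected from a standalone FreeBSD
program, which is handy when debugging contigmem setup. A sketch, assuming
the contigmem module is loaded:]

#include <stdio.h>
#include <stdint.h>
#include <sys/types.h>
#include <sys/sysctl.h>

int
main(void)
{
	int nbuf;
	int64_t bufsz;
	size_t len;

	len = sizeof(nbuf);
	if (sysctlbyname("hw.contigmem.num_buffers", &nbuf, &len, NULL, 0) != 0)
		return 1;
	len = sizeof(bufsz);
	if (sysctlbyname("hw.contigmem.buffer_size", &bufsz, &len, NULL, 0) != 0)
		return 1;
	printf("%d buffers of %jd bytes each\n", nbuf, (intmax_t)bufsz);
	return 0;
}
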
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2018 Intel Corporation
- */
-
-#include <string.h>
-#include <sys/types.h>
-#include <sys/event.h>
-#include <sys/queue.h>
-#include <unistd.h>
-
-#include <rte_errno.h>
-#include <rte_lcore.h>
-#include <rte_spinlock.h>
-#include <rte_common.h>
-#include <rte_interrupts.h>
-
-#include "eal_private.h"
-#include "eal_alarm_private.h"
-
-#define MAX_INTR_EVENTS 16
-
-/**
- * union buffer for reading on different devices
- */
-union rte_intr_read_buffer {
- char charbuf[16]; /* for others */
-};
-
-TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback);
-TAILQ_HEAD(rte_intr_source_list, rte_intr_source);
-
-struct rte_intr_callback {
- TAILQ_ENTRY(rte_intr_callback) next;
- rte_intr_callback_fn cb_fn; /**< callback address */
- void *cb_arg; /**< parameter for callback */
- uint8_t pending_delete; /**< delete after callback is called */
- rte_intr_unregister_callback_fn ucb_fn; /**< fn to call before cb is deleted */
-};
-
-struct rte_intr_source {
- TAILQ_ENTRY(rte_intr_source) next;
- struct rte_intr_handle intr_handle; /**< interrupt handle */
- struct rte_intr_cb_list callbacks; /**< user callbacks */
- uint32_t active;
-};
-
-/* global spinlock for interrupt data operation */
-static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER;
-
-/* interrupt sources list */
-static struct rte_intr_source_list intr_sources;
-
-/* interrupt handling thread */
-static pthread_t intr_thread;
-
-static volatile int kq = -1;
-
-static int
-intr_source_to_kevent(const struct rte_intr_handle *ih, struct kevent *ke)
-{
- /* alarm callbacks are special case */
- if (ih->type == RTE_INTR_HANDLE_ALARM) {
- uint64_t timeout_ns;
-
- /* get soonest alarm timeout */
- if (eal_alarm_get_timeout_ns(&timeout_ns) < 0)
- return -1;
-
- ke->filter = EVFILT_TIMER;
- /* timers are one shot */
- ke->flags |= EV_ONESHOT;
- ke->fflags = NOTE_NSECONDS;
- ke->data = timeout_ns;
- } else {
- ke->filter = EVFILT_READ;
- }
- ke->ident = ih->fd;
-
- return 0;
-}
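
[The EVFILT_TIMER / EV_ONESHOT / NOTE_NSECONDS combination built here can be
exercised in isolation. A minimal standalone sketch of the same kqueue
mechanics; the identifier 1 is arbitrary, standing in for the fd the EAL
uses, and NOTE_NSECONDS needs FreeBSD 11 or later:]

#include <sys/types.h>
#include <sys/event.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct kevent ke, ev;
	int kq = kqueue();

	if (kq < 0)
		return 1;
	/* arm a one-shot 100 ms timer, same flags as above */
	EV_SET(&ke, 1, EVFILT_TIMER, EV_ADD | EV_ONESHOT,
	       NOTE_NSECONDS, 100 * 1000 * 1000, NULL);
	if (kevent(kq, &ke, 1, NULL, 0, NULL) < 0)
		return 1;
	/* block until it fires; one-shot timers self-delete */
	if (kevent(kq, NULL, 0, &ev, 1, NULL) == 1)
		printf("timer %lu fired\n", (unsigned long)ev.ident);
	close(kq);
	return 0;
}
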
-
-int
-rte_intr_callback_register(const struct rte_intr_handle *intr_handle,
- rte_intr_callback_fn cb, void *cb_arg)
-{
- struct rte_intr_callback *callback;
- struct rte_intr_source *src;
- int ret, add_event = 0;
-
- /* first do parameter checking */
- if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) {
- RTE_LOG(ERR, EAL,
- "Registering with invalid input parameter\n");
- return -EINVAL;
- }
- if (kq < 0) {
- RTE_LOG(ERR, EAL, "Kqueue is not active: %d\n", kq);
- return -ENODEV;
- }
-
- rte_spinlock_lock(&intr_lock);
-
- /* find the source for this intr_handle */
- TAILQ_FOREACH(src, &intr_sources, next) {
- if (src->intr_handle.fd == intr_handle->fd)
- break;
- }
-
- /* if this is an alarm interrupt and it already has a callback,
- * then we don't want to create a new callback because the only
- * thing on the list should be eal_alarm_callback() and we may
- * be called just to reset the timer.
- */
- if (src != NULL && src->intr_handle.type == RTE_INTR_HANDLE_ALARM &&
- !TAILQ_EMPTY(&src->callbacks)) {
- callback = NULL;
- } else {
- /* allocate a new interrupt callback entity */
- callback = calloc(1, sizeof(*callback));
- if (callback == NULL) {
- RTE_LOG(ERR, EAL, "Can not allocate memory\n");
- ret = -ENOMEM;
- goto fail;
- }
- callback->cb_fn = cb;
- callback->cb_arg = cb_arg;
- callback->pending_delete = 0;
- callback->ucb_fn = NULL;
-
- if (src == NULL) {
- src = calloc(1, sizeof(*src));
- if (src == NULL) {
- RTE_LOG(ERR, EAL, "Can not allocate memory\n");
- ret = -ENOMEM;
- goto fail;
- } else {
- src->intr_handle = *intr_handle;
- TAILQ_INIT(&src->callbacks);
- TAILQ_INSERT_TAIL(&intr_sources, src, next);
- }
- }
-
- /* we had no interrupts for this */
- if (TAILQ_EMPTY(&src->callbacks))
- add_event = 1;
-
- TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
- }
-
- /* add events to the queue. timer events are special as we need to
- * re-set the timer.
- */
- if (add_event || src->intr_handle.type == RTE_INTR_HANDLE_ALARM) {
- struct kevent ke;
-
- memset(&ke, 0, sizeof(ke));
- ke.flags = EV_ADD; /* mark for addition to the queue */
-
- if (intr_source_to_kevent(intr_handle, &ke) < 0) {
- RTE_LOG(ERR, EAL, "Cannot convert interrupt handle to kevent\n");
- ret = -ENODEV;
- goto fail;
- }
-
- /**
- * add the intr file descriptor to the wait list.
- */
- if (kevent(kq, &ke, 1, NULL, 0, NULL) < 0) {
- /* currently, nic_uio does not support interrupts, so
- * this error will always be triggered and output to the
- * user; don't output it unless the debug log level is set.
- */
- if (errno == ENODEV)
- RTE_LOG(DEBUG, EAL, "Interrupt handle %d not supported\n",
- src->intr_handle.fd);
- else
- RTE_LOG(ERR, EAL, "Error adding fd %d "
- "kevent, %s\n",
- src->intr_handle.fd,
- strerror(errno));
- ret = -errno;
- goto fail;
- }
- }
- rte_spinlock_unlock(&intr_lock);
-
- return 0;
-fail:
- /* clean up */
- if (src != NULL) {
- if (callback != NULL)
- TAILQ_REMOVE(&(src->callbacks), callback, next);
- if (TAILQ_EMPTY(&(src->callbacks))) {
- TAILQ_REMOVE(&intr_sources, src, next);
- free(src);
- }
- }
- free(callback);
- rte_spinlock_unlock(&intr_lock);
- return ret;
-}
-
-int
-rte_intr_callback_unregister_pending(const struct rte_intr_handle *intr_handle,
- rte_intr_callback_fn cb_fn, void *cb_arg,
- rte_intr_unregister_callback_fn ucb_fn)
-{
- int ret;
- struct rte_intr_source *src;
- struct rte_intr_callback *cb, *next;
-
- /* do parameter checking first */
- if (intr_handle == NULL || intr_handle->fd < 0) {
- RTE_LOG(ERR, EAL,
- "Unregistering with invalid input parameter\n");
- return -EINVAL;
- }
-
- if (kq < 0) {
- RTE_LOG(ERR, EAL, "Kqueue is not active\n");
- return -ENODEV;
- }
-
- rte_spinlock_lock(&intr_lock);
-
- /* check if an interrupt source exists for this fd */
- TAILQ_FOREACH(src, &intr_sources, next)
- if (src->intr_handle.fd == intr_handle->fd)
- break;
-
- /* No interrupt source registered for the fd */
- if (src == NULL) {
- ret = -ENOENT;
-
- /* only usable if the source is active */
- } else if (src->active == 0) {
- ret = -EAGAIN;
-
- } else {
- ret = 0;
-
- /* walk through the callbacks and mark all that match. */
- for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
- next = TAILQ_NEXT(cb, next);
- if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
- cb->cb_arg == cb_arg)) {
- cb->pending_delete = 1;
- cb->ucb_fn = ucb_fn;
- ret++;
- }
- }
- }
-
- rte_spinlock_unlock(&intr_lock);
-
- return ret;
-}
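
[The pending-delete path exists so a callback can remove itself: while the
source is marked active, rte_intr_callback_unregister() returns -EAGAIN, so
a callback must queue its own removal instead. A hedged sketch; struct
cb_ctx and release_ctx are hypothetical names:]

#include <stdlib.h>
#include <rte_interrupts.h>

struct cb_ctx {
	struct rte_intr_handle *handle;
	/* ... per-device state ... */
};

static void
release_ctx(struct rte_intr_handle *handle, void *cb_arg)
{
	(void)handle;
	free(cb_arg); /* runs once the callback walk has finished */
}

static void
one_shot_cb(void *cb_arg)
{
	struct cb_ctx *ctx = cb_arg;

	/* direct unregister would return -EAGAIN here: the source is
	 * active while its callbacks run, so defer the removal
	 */
	rte_intr_callback_unregister_pending(ctx->handle, one_shot_cb,
		cb_arg, release_ctx);
}
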
-
-int
-rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle,
- rte_intr_callback_fn cb_fn, void *cb_arg)
-{
- int ret;
- struct rte_intr_source *src;
- struct rte_intr_callback *cb, *next;
-
- /* do parameter checking first */
- if (intr_handle == NULL || intr_handle->fd < 0) {
- RTE_LOG(ERR, EAL,
- "Unregistering with invalid input parameter\n");
- return -EINVAL;
- }
- if (kq < 0) {
- RTE_LOG(ERR, EAL, "Kqueue is not active\n");
- return -ENODEV;
- }
-
- rte_spinlock_lock(&intr_lock);
-
- /* check if an interrupt source exists for this fd */
- TAILQ_FOREACH(src, &intr_sources, next)
- if (src->intr_handle.fd == intr_handle->fd)
- break;
-
- /* No interrupt source registered for the fd */
- if (src == NULL) {
- ret = -ENOENT;
-
- /* interrupt source has some active callbacks right now. */
- } else if (src->active != 0) {
- ret = -EAGAIN;
-
- /* ok to remove. */
- } else {
- struct kevent ke;
-
- ret = 0;
-
- /* remove it from the kqueue */
- memset(&ke, 0, sizeof(ke));
- ke.flags = EV_DELETE; /* mark for deletion from the queue */
-
- if (intr_source_to_kevent(intr_handle, &ke) < 0) {
- RTE_LOG(ERR, EAL, "Cannot convert to kevent\n");
- ret = -ENODEV;
- goto out;
- }
-
- /**
- * remove the intr file descriptor from the wait list.
- */
- if (kevent(kq, &ke, 1, NULL, 0, NULL) < 0) {
- RTE_LOG(ERR, EAL, "Error removing fd %d kevent, %s\n",
- src->intr_handle.fd, strerror(errno));
- /* removing a non-existent event is an expected condition
- * in some circumstances (e.g. oneshot events).
- */
- }
-
- /* walk through the callbacks and remove all that match. */
- for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
- next = TAILQ_NEXT(cb, next);
- if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
- cb->cb_arg == cb_arg)) {
- TAILQ_REMOVE(&src->callbacks, cb, next);
- free(cb);
- ret++;
- }
- }
-
- /* all callbacks for that source are removed. */
- if (TAILQ_EMPTY(&src->callbacks)) {
- TAILQ_REMOVE(&intr_sources, src, next);
- free(src);
- }
- }
-out:
- rte_spinlock_unlock(&intr_lock);
-
- return ret;
-}
-
-int
-rte_intr_enable(const struct rte_intr_handle *intr_handle)
-{
- if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV)
- return 0;
-
- if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
- return -1;
-
- switch (intr_handle->type) {
- /* not used at this moment */
- case RTE_INTR_HANDLE_ALARM:
- return -1;
- /* not used at this moment */
- case RTE_INTR_HANDLE_DEV_EVENT:
- return -1;
- /* unknown handle type */
- default:
- RTE_LOG(ERR, EAL,
- "Unknown handle type of fd %d\n",
- intr_handle->fd);
- return -1;
- }
-
- return 0;
-}
-
-int
-rte_intr_disable(const struct rte_intr_handle *intr_handle)
-{
- if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV)
- return 0;
-
- if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
- return -1;
-
- switch (intr_handle->type) {
- /* not used at this moment */
- case RTE_INTR_HANDLE_ALARM:
- return -1;
- /* not used at this moment */
- case RTE_INTR_HANDLE_DEV_EVENT:
- return -1;
- /* unknown handle type */
- default:
- RTE_LOG(ERR, EAL,
- "Unknown handle type of fd %d\n",
- intr_handle->fd);
- return -1;
- }
-
- return 0;
-}
-
-int
-rte_intr_ack(const struct rte_intr_handle *intr_handle)
-{
- if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV)
- return 0;
-
- return -1;
-}
-
-static void
-eal_intr_process_interrupts(struct kevent *events, int nfds)
-{
- struct rte_intr_callback active_cb;
- union rte_intr_read_buffer buf;
- struct rte_intr_callback *cb, *next;
- struct rte_intr_source *src;
- bool call = false;
- int n, bytes_read;
- struct kevent ke;
-
- for (n = 0; n < nfds; n++) {
- int event_fd = events[n].ident;
-
- rte_spinlock_lock(&intr_lock);
- TAILQ_FOREACH(src, &intr_sources, next)
- if (src->intr_handle.fd == event_fd)
- break;
- if (src == NULL) {
- rte_spinlock_unlock(&intr_lock);
- continue;
- }
-
- /* mark this interrupt source as active and release the lock. */
- src->active = 1;
- rte_spinlock_unlock(&intr_lock);
-
- /* set the length to be read for each handle type */
- switch (src->intr_handle.type) {
- case RTE_INTR_HANDLE_ALARM:
- bytes_read = 0;
- call = true;
- break;
- case RTE_INTR_HANDLE_VDEV:
- case RTE_INTR_HANDLE_EXT:
- bytes_read = 0;
- call = true;
- break;
- case RTE_INTR_HANDLE_DEV_EVENT:
- bytes_read = 0;
- call = true;
- break;
- default:
- bytes_read = 1;
- break;
- }
-
- if (bytes_read > 0) {
- /**
- * read out to clear the ready-to-be-read flag
- * for the next event wait.
- */
- bytes_read = read(event_fd, &buf, bytes_read);
- if (bytes_read < 0) {
- if (errno == EINTR || errno == EWOULDBLOCK)
- continue;
-
- RTE_LOG(ERR, EAL, "Error reading from file "
- "descriptor %d: %s\n",
- event_fd,
- strerror(errno));
- } else if (bytes_read == 0)
- RTE_LOG(ERR, EAL, "Read nothing from file "
- "descriptor %d\n", event_fd);
- else
- call = true;
- }
-
- /* grab a lock, again to call callbacks and update status. */
- rte_spinlock_lock(&intr_lock);
-
- if (call) {
- /* Finally, call all callbacks. */
- TAILQ_FOREACH(cb, &src->callbacks, next) {
-
- /* make a copy and unlock. */
- active_cb = *cb;
- rte_spinlock_unlock(&intr_lock);
-
- /* call the actual callback */
- active_cb.cb_fn(active_cb.cb_arg);
-
- /* get the lock back. */
- rte_spinlock_lock(&intr_lock);
- }
- }
-
- /* we are done with this interrupt source, release it. */
- src->active = 0;
-
- /* check if any callbacks are supposed to be removed */
- for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
- next = TAILQ_NEXT(cb, next);
- if (cb->pending_delete) {
- /* remove it from the kqueue */
- memset(&ke, 0, sizeof(ke));
- /* mark for deletion from the queue */
- ke.flags = EV_DELETE;
-
- if (intr_source_to_kevent(&src->intr_handle, &ke) < 0) {
- RTE_LOG(ERR, EAL, "Cannot convert to kevent\n");
- rte_spinlock_unlock(&intr_lock);
- return;
- }
-
- /**
- * remove the intr file descriptor from the wait list.
- */
- if (kevent(kq, &ke, 1, NULL, 0, NULL) < 0) {
- RTE_LOG(ERR, EAL, "Error removing fd %d kevent, "
- "%s\n", src->intr_handle.fd,
- strerror(errno));
- /* removing a non-existent event is an expected
- * condition in some circumstances
- * (e.g. oneshot events).
- */
- }
-
- TAILQ_REMOVE(&src->callbacks, cb, next);
- if (cb->ucb_fn)
- cb->ucb_fn(&src->intr_handle, cb->cb_arg);
- free(cb);
- }
- }
-
- /* all callbacks for that source are removed. */
- if (TAILQ_EMPTY(&src->callbacks)) {
- TAILQ_REMOVE(&intr_sources, src, next);
- free(src);
- }
-
- rte_spinlock_unlock(&intr_lock);
- }
-}
-
-static void *
-eal_intr_thread_main(void *arg __rte_unused)
-{
- struct kevent events[MAX_INTR_EVENTS];
- int nfds;
-
- /* host thread, never break out */
- for (;;) {
- /* do not change anything, just wait */
- nfds = kevent(kq, NULL, 0, events, MAX_INTR_EVENTS, NULL);
-
- /* kevent failed */
- if (nfds < 0) {
- if (errno == EINTR)
- continue;
- RTE_LOG(ERR, EAL,
- "kevent returned an error\n");
- break;
- }
- /* kevent timeout, will never happen here */
- else if (nfds == 0)
- continue;
-
- /* kevent has at least one fd ready to read */
- eal_intr_process_interrupts(events, nfds);
- }
- close(kq);
- kq = -1;
- return NULL;
-}
-
-int
-rte_eal_intr_init(void)
-{
- int ret = 0;
-
- /* init the global interrupt source head */
- TAILQ_INIT(&intr_sources);
-
- kq = kqueue();
- if (kq < 0) {
- RTE_LOG(ERR, EAL, "Cannot create kqueue instance\n");
- return -1;
- }
-
- /* create the host thread to wait/handle the interrupt */
- ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL,
- eal_intr_thread_main, NULL);
- if (ret != 0) {
- rte_errno = -ret;
- RTE_LOG(ERR, EAL,
- "Failed to create thread for interrupt handling\n");
- }
-
- return ret;
-}
-
-int
-rte_intr_rx_ctl(struct rte_intr_handle *intr_handle,
- int epfd, int op, unsigned int vec, void *data)
-{
- RTE_SET_USED(intr_handle);
- RTE_SET_USED(epfd);
- RTE_SET_USED(op);
- RTE_SET_USED(vec);
- RTE_SET_USED(data);
-
- return -ENOTSUP;
-}
-
-int
-rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd)
-{
- RTE_SET_USED(intr_handle);
- RTE_SET_USED(nb_efd);
-
- return 0;
-}
-
-void
-rte_intr_efd_disable(struct rte_intr_handle *intr_handle)
-{
- RTE_SET_USED(intr_handle);
-}
-
-int
-rte_intr_dp_is_en(struct rte_intr_handle *intr_handle)
-{
- RTE_SET_USED(intr_handle);
- return 0;
-}
-
-int
-rte_intr_allow_others(struct rte_intr_handle *intr_handle)
-{
- RTE_SET_USED(intr_handle);
- return 1;
-}
-
-int
-rte_intr_cap_multiple(struct rte_intr_handle *intr_handle)
-{
- RTE_SET_USED(intr_handle);
- return 0;
-}
-
-int
-rte_epoll_wait(int epfd, struct rte_epoll_event *events,
- int maxevents, int timeout)
-{
- RTE_SET_USED(epfd);
- RTE_SET_USED(events);
- RTE_SET_USED(maxevents);
- RTE_SET_USED(timeout);
-
- return -ENOTSUP;
-}
-
-int
-rte_epoll_ctl(int epfd, int op, int fd, struct rte_epoll_event *event)
-{
- RTE_SET_USED(epfd);
- RTE_SET_USED(op);
- RTE_SET_USED(fd);
- RTE_SET_USED(event);
-
- return -ENOTSUP;
-}
-
-int
-rte_intr_tls_epfd(void)
-{
- return -ENOTSUP;
-}
-
-void
-rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle)
-{
- RTE_SET_USED(intr_handle);
-}
-
-int rte_thread_is_intr(void)
-{
- return pthread_equal(intr_thread, pthread_self());
-}
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
- */
-
-#include <unistd.h>
-#include <sys/sysctl.h>
-
-#include <rte_log.h>
-#include <rte_eal.h>
-#include <rte_lcore.h>
-#include <rte_common.h>
-#include <rte_debug.h>
-
-#include "eal_private.h"
-#include "eal_thread.h"
-
-/* No topology information is available on FreeBSD, including NUMA info */
-unsigned
-eal_cpu_core_id(__rte_unused unsigned lcore_id)
-{
- return 0;
-}
-
-static int
-eal_get_ncpus(void)
-{
- static int ncpu = -1;
- int mib[2] = {CTL_HW, HW_NCPU};
- size_t len = sizeof(ncpu);
-
- if (ncpu < 0) {
- sysctl(mib, 2, &ncpu, &len, NULL, 0);
- RTE_LOG(INFO, EAL, "Sysctl reports %d cpus\n", ncpu);
- }
- return ncpu;
-}
-
-unsigned
-eal_cpu_socket_id(__rte_unused unsigned cpu_id)
-{
- return 0;
-}
-
-/* Check if a cpu is present by looking for the
- * cpu information for it.
- */
-int
-eal_cpu_detected(unsigned lcore_id)
-{
- const unsigned ncpus = eal_get_ncpus();
- return lcore_id < ncpus;
-}
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2017-2018 Intel Corporation
- */
-
-#include <inttypes.h>
-
-#include <rte_errno.h>
-#include <rte_log.h>
-#include <rte_memory.h>
-
-#include "eal_memalloc.h"
-
-int
-eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms __rte_unused,
- int __rte_unused n_segs, size_t __rte_unused page_sz,
- int __rte_unused socket, bool __rte_unused exact)
-{
- RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n");
- return -1;
-}
-
-struct rte_memseg *
-eal_memalloc_alloc_seg(size_t __rte_unused page_sz, int __rte_unused socket)
-{
- RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n");
- return NULL;
-}
-
-int
-eal_memalloc_free_seg(struct rte_memseg *ms __rte_unused)
-{
- RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n");
- return -1;
-}
-
-int
-eal_memalloc_free_seg_bulk(struct rte_memseg **ms __rte_unused,
- int n_segs __rte_unused)
-{
- RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n");
- return -1;
-}
-
-int
-eal_memalloc_sync_with_primary(void)
-{
- RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n");
- return -1;
-}
-
-int
-eal_memalloc_get_seg_fd(int list_idx __rte_unused, int seg_idx __rte_unused)
-{
- return -ENOTSUP;
-}
-
-int
-eal_memalloc_set_seg_fd(int list_idx __rte_unused, int seg_idx __rte_unused,
- int fd __rte_unused)
-{
- return -ENOTSUP;
-}
-
-int
-eal_memalloc_set_seg_list_fd(int list_idx __rte_unused, int fd __rte_unused)
-{
- return -ENOTSUP;
-}
-
-int
-eal_memalloc_get_seg_fd_offset(int list_idx __rte_unused,
- int seg_idx __rte_unused, size_t *offset __rte_unused)
-{
- return -ENOTSUP;
-}
-
-int
-eal_memalloc_init(void)
-{
- return 0;
-}
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
- */
-#include <sys/mman.h>
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/sysctl.h>
-#include <inttypes.h>
-#include <errno.h>
-#include <string.h>
-#include <fcntl.h>
-
-#include <rte_eal.h>
-#include <rte_errno.h>
-#include <rte_log.h>
-#include <rte_string_fns.h>
-
-#include "eal_private.h"
-#include "eal_internal_cfg.h"
-#include "eal_filesystem.h"
-#include "eal_memcfg.h"
-#include "eal_options.h"
-
-#define EAL_PAGE_SIZE (sysconf(_SC_PAGESIZE))
-
-uint64_t eal_get_baseaddr(void)
-{
- /*
- * FreeBSD may allocate something in the space we will be mapping things
- * into before we get a chance to do that, so use a base address that's
- * far away from where malloc() et al. usually map things.
- */
- return 0x1000000000ULL;
-}
-
-/*
- * Get physical address of any mapped virtual address in the current process.
- */
-phys_addr_t
-rte_mem_virt2phy(const void *virtaddr)
-{
- /* XXX not implemented. This function is only used by
- * rte_mempool_virt2iova() when hugepages are disabled. */
- (void)virtaddr;
- return RTE_BAD_IOVA;
-}
-
-rte_iova_t
-rte_mem_virt2iova(const void *virtaddr)
-{
- return rte_mem_virt2phy(virtaddr);
-}
-
-int
-rte_eal_hugepage_init(void)
-{
- struct rte_mem_config *mcfg;
- uint64_t total_mem = 0;
- void *addr;
- unsigned int i, j, seg_idx = 0;
-
- /* get pointer to global configuration */
- mcfg = rte_eal_get_configuration()->mem_config;
-
- /* for debug purposes, hugetlbfs can be disabled */
- if (internal_config.no_hugetlbfs) {
- struct rte_memseg_list *msl;
- struct rte_fbarray *arr;
- struct rte_memseg *ms;
- uint64_t page_sz;
- int n_segs, cur_seg;
-
- /* create a memseg list */
- msl = &mcfg->memsegs[0];
-
- page_sz = RTE_PGSIZE_4K;
- n_segs = internal_config.memory / page_sz;
-
- if (rte_fbarray_init(&msl->memseg_arr, "nohugemem", n_segs,
- sizeof(struct rte_memseg))) {
- RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n");
- return -1;
- }
-
- addr = mmap(NULL, internal_config.memory,
- PROT_READ | PROT_WRITE,
- MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
- if (addr == MAP_FAILED) {
- RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__,
- strerror(errno));
- return -1;
- }
- msl->base_va = addr;
- msl->page_sz = page_sz;
- msl->len = internal_config.memory;
- msl->socket_id = 0;
- msl->heap = 1;
-
- /* populate memsegs. each memseg is 1 page long */
- for (cur_seg = 0; cur_seg < n_segs; cur_seg++) {
- arr = &msl->memseg_arr;
-
- ms = rte_fbarray_get(arr, cur_seg);
- if (rte_eal_iova_mode() == RTE_IOVA_VA)
- ms->iova = (uintptr_t)addr;
- else
- ms->iova = RTE_BAD_IOVA;
- ms->addr = addr;
- ms->hugepage_sz = page_sz;
- ms->len = page_sz;
- ms->socket_id = 0;
-
- rte_fbarray_set_used(arr, cur_seg);
-
- addr = RTE_PTR_ADD(addr, page_sz);
- }
- return 0;
- }
-
- /* map all hugepages and sort them */
- for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
- struct hugepage_info *hpi;
- rte_iova_t prev_end = 0;
- int prev_ms_idx = -1;
- uint64_t page_sz, mem_needed;
- unsigned int n_pages, max_pages;
-
- hpi = &internal_config.hugepage_info[i];
- page_sz = hpi->hugepage_sz;
- max_pages = hpi->num_pages[0];
- mem_needed = RTE_ALIGN_CEIL(internal_config.memory - total_mem,
- page_sz);
-
- n_pages = RTE_MIN(mem_needed / page_sz, max_pages);
-
- for (j = 0; j < n_pages; j++) {
- struct rte_memseg_list *msl;
- struct rte_fbarray *arr;
- struct rte_memseg *seg;
- int msl_idx, ms_idx;
- rte_iova_t physaddr;
- int error;
- size_t sysctl_size = sizeof(physaddr);
- char physaddr_str[64];
- bool is_adjacent;
-
- /* first, check if this segment is IOVA-adjacent to
- * the previous one.
- */
- snprintf(physaddr_str, sizeof(physaddr_str),
- "hw.contigmem.physaddr.%d", j);
- error = sysctlbyname(physaddr_str, &physaddr,
- &sysctl_size, NULL, 0);
- if (error < 0) {
- RTE_LOG(ERR, EAL, "Failed to get physical addr for buffer %u "
- "from %s\n", j, hpi->hugedir);
- return -1;
- }
-
- is_adjacent = prev_end != 0 && physaddr == prev_end;
- prev_end = physaddr + hpi->hugepage_sz;
-
- for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS;
- msl_idx++) {
- bool empty, need_hole;
- msl = &mcfg->memsegs[msl_idx];
- arr = &msl->memseg_arr;
-
- if (msl->page_sz != page_sz)
- continue;
-
- empty = arr->count == 0;
-
- /* we need a hole if this isn't an empty memseg
- * list, and if previous segment was not
- * adjacent to current one.
- */
- need_hole = !empty && !is_adjacent;
-
- /* we need 1, plus hole if not adjacent */
- ms_idx = rte_fbarray_find_next_n_free(arr,
- 0, 1 + (need_hole ? 1 : 0));
-
- /* memseg list is full? */
- if (ms_idx < 0)
- continue;
-
- if (need_hole && prev_ms_idx == ms_idx - 1)
- ms_idx++;
- prev_ms_idx = ms_idx;
-
- break;
- }
- if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
- RTE_LOG(ERR, EAL, "Could not find space for memseg. Please increase %s and/or %s in configuration.\n",
- RTE_STR(CONFIG_RTE_MAX_MEMSEG_PER_TYPE),
- RTE_STR(CONFIG_RTE_MAX_MEM_PER_TYPE));
- return -1;
- }
- arr = &msl->memseg_arr;
- seg = rte_fbarray_get(arr, ms_idx);
-
- addr = RTE_PTR_ADD(msl->base_va,
- (size_t)msl->page_sz * ms_idx);
-
- /* address is already mapped in memseg list, so using
- * MAP_FIXED here is safe.
- */
- addr = mmap(addr, page_sz, PROT_READ|PROT_WRITE,
- MAP_SHARED | MAP_FIXED,
- hpi->lock_descriptor,
- j * EAL_PAGE_SIZE);
- if (addr == MAP_FAILED) {
- RTE_LOG(ERR, EAL, "Failed to mmap buffer %u from %s\n",
- j, hpi->hugedir);
- return -1;
- }
-
- seg->addr = addr;
- seg->iova = physaddr;
- seg->hugepage_sz = page_sz;
- seg->len = page_sz;
- seg->nchannel = mcfg->nchannel;
- seg->nrank = mcfg->nrank;
- seg->socket_id = 0;
-
- rte_fbarray_set_used(arr, ms_idx);
-
- RTE_LOG(INFO, EAL, "Mapped memory segment %u @ %p: physaddr:0x%"
- PRIx64", len %zu\n",
- seg_idx++, addr, physaddr, page_sz);
-
- total_mem += seg->len;
- }
- if (total_mem >= internal_config.memory)
- break;
- }
- if (total_mem < internal_config.memory) {
- RTE_LOG(ERR, EAL, "Couldn't reserve requested memory, "
- "requested: %" PRIu64 "M "
- "available: %" PRIu64 "M\n",
- internal_config.memory >> 20, total_mem >> 20);
- return -1;
- }
- return 0;
-}
-
-struct attach_walk_args {
- int fd_hugepage;
- int seg_idx;
-};
-static int
-attach_segment(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
- void *arg)
-{
- struct attach_walk_args *wa = arg;
- void *addr;
-
- if (msl->external)
- return 0;
-
- addr = mmap(ms->addr, ms->len, PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_FIXED, wa->fd_hugepage,
- wa->seg_idx * EAL_PAGE_SIZE);
- if (addr == MAP_FAILED || addr != ms->addr)
- return -1;
- wa->seg_idx++;
-
- return 0;
-}
-
-int
-rte_eal_hugepage_attach(void)
-{
- const struct hugepage_info *hpi;
- int fd_hugepage = -1;
- unsigned int i;
-
- hpi = &internal_config.hugepage_info[0];
-
- for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
- const struct hugepage_info *cur_hpi = &hpi[i];
- struct attach_walk_args wa;
-
- memset(&wa, 0, sizeof(wa));
-
- /* Obtain a file descriptor for contiguous memory */
- fd_hugepage = open(cur_hpi->hugedir, O_RDWR);
- if (fd_hugepage < 0) {
- RTE_LOG(ERR, EAL, "Could not open %s\n",
- cur_hpi->hugedir);
- goto error;
- }
- wa.fd_hugepage = fd_hugepage;
- wa.seg_idx = 0;
-
- /* Map the contiguous memory into each memory segment */
- if (rte_memseg_walk(attach_segment, &wa) < 0) {
- RTE_LOG(ERR, EAL, "Failed to mmap buffer %u from %s\n",
- wa.seg_idx, cur_hpi->hugedir);
- goto error;
- }
-
- close(fd_hugepage);
- fd_hugepage = -1;
- }
-
- /* hugepage_info is no longer required */
- return 0;
-
-error:
- if (fd_hugepage >= 0)
- close(fd_hugepage);
- return -1;
-}
-
-int
-rte_eal_using_phys_addrs(void)
-{
- return 0;
-}
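
[attach_segment() above is an instance of the generic memseg walker; the
same pattern works for ad-hoc introspection. A minimal sketch totalling the
memory mapped in internal segment lists; count_seg and total_mapped_mem are
illustrative names:]

#include <stddef.h>
#include <rte_memory.h>

static int
count_seg(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
		void *arg)
{
	size_t *total = arg;

	if (!msl->external)
		*total += ms->len;
	return 0; /* non-zero would stop the walk early */
}

static size_t
total_mapped_mem(void)
{
	size_t total = 0;

	rte_memseg_walk(count_seg, &total);
	return total;
}
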
-
-static uint64_t
-get_mem_amount(uint64_t page_sz, uint64_t max_mem)
-{
- uint64_t area_sz, max_pages;
-
- /* limit to RTE_MAX_MEMSEG_PER_LIST pages or RTE_MAX_MEM_MB_PER_LIST */
- max_pages = RTE_MAX_MEMSEG_PER_LIST;
- max_mem = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20, max_mem);
-
- area_sz = RTE_MIN(page_sz * max_pages, max_mem);
-
- /* make sure the list isn't smaller than the page size */
- area_sz = RTE_MAX(area_sz, page_sz);
-
- return RTE_ALIGN(area_sz, page_sz);
-}
-
-#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i"
-static int
-alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz,
- int n_segs, int socket_id, int type_msl_idx)
-{
- char name[RTE_FBARRAY_NAME_LEN];
-
- snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id,
- type_msl_idx);
- if (rte_fbarray_init(&msl->memseg_arr, name, n_segs,
- sizeof(struct rte_memseg))) {
- RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n",
- rte_strerror(rte_errno));
- return -1;
- }
-
- msl->page_sz = page_sz;
- msl->socket_id = socket_id;
- msl->base_va = NULL;
-
- RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n",
- (size_t)page_sz >> 10, socket_id);
-
- return 0;
-}
-
-static int
-alloc_va_space(struct rte_memseg_list *msl)
-{
- uint64_t page_sz;
- size_t mem_sz;
- void *addr;
- int flags = 0;
-
-#ifdef RTE_ARCH_PPC_64
- flags |= MAP_HUGETLB;
-#endif
-
- page_sz = msl->page_sz;
- mem_sz = page_sz * msl->memseg_arr.len;
-
- addr = eal_get_virtual_area(msl->base_va, &mem_sz, page_sz, 0, flags);
- if (addr == NULL) {
- if (rte_errno == EADDRNOTAVAIL)
- RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - "
- "please use '--" OPT_BASE_VIRTADDR "' option\n",
- (unsigned long long)mem_sz, msl->base_va);
- else
- RTE_LOG(ERR, EAL, "Cannot reserve memory\n");
- return -1;
- }
- msl->base_va = addr;
- msl->len = mem_sz;
-
- return 0;
-}
-
-
-static int
-memseg_primary_init(void)
-{
- struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- int hpi_idx, msl_idx = 0;
- struct rte_memseg_list *msl;
- uint64_t max_mem, total_mem;
-
- /* no-huge does not need this at all */
- if (internal_config.no_hugetlbfs)
- return 0;
-
- /* FreeBSD has an issue where a core dump will dump the entire memory
- * contents, including anonymous zero-page memory. Therefore, while we
- * will be limiting the total amount of memory to RTE_MAX_MEM_MB, we will
- * also further limit the total memory amount to whatever memory is
- * available to us through the contigmem driver (plus spacing blocks).
- *
- * So, at each stage, we check how much memory we are preallocating
- * and adjust all the values accordingly.
- */
-
- max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
- total_mem = 0;
-
- /* create memseg lists */
- for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes;
- hpi_idx++) {
- uint64_t max_type_mem, total_type_mem = 0;
- uint64_t avail_mem;
- int type_msl_idx, max_segs, avail_segs, total_segs = 0;
- struct hugepage_info *hpi;
- uint64_t hugepage_sz;
-
- hpi = &internal_config.hugepage_info[hpi_idx];
- hugepage_sz = hpi->hugepage_sz;
-
- /* no NUMA support on FreeBSD */
-
- /* check if we've already exceeded total memory amount */
- if (total_mem >= max_mem)
- break;
-
- /* first, calculate theoretical limits according to config */
- max_type_mem = RTE_MIN(max_mem - total_mem,
- (uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20);
- max_segs = RTE_MAX_MEMSEG_PER_TYPE;
-
- /* now, limit all of that to whatever will actually be
- * available to us, because without dynamic allocation support,
- * all of that extra memory will be sitting there being useless
- * and slowing down core dumps in case of a crash.
- *
- * we need (N*2)-1 segments because we cannot guarantee that
- * each segment will be IOVA-contiguous with the previous one,
- * so we will allocate more and put spaces in between segments
- * that are non-contiguous.
- */
- avail_segs = (hpi->num_pages[0] * 2) - 1;
- avail_mem = avail_segs * hugepage_sz;
-
- max_type_mem = RTE_MIN(avail_mem, max_type_mem);
- max_segs = RTE_MIN(avail_segs, max_segs);
-
- type_msl_idx = 0;
- while (total_type_mem < max_type_mem &&
- total_segs < max_segs) {
- uint64_t cur_max_mem, cur_mem;
- unsigned int n_segs;
-
- if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
- RTE_LOG(ERR, EAL,
- "No more space in memseg lists, please increase %s\n",
- RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
- return -1;
- }
-
- msl = &mcfg->memsegs[msl_idx++];
-
- cur_max_mem = max_type_mem - total_type_mem;
-
- cur_mem = get_mem_amount(hugepage_sz,
- cur_max_mem);
- n_segs = cur_mem / hugepage_sz;
-
- if (alloc_memseg_list(msl, hugepage_sz, n_segs,
- 0, type_msl_idx))
- return -1;
-
- total_segs += msl->memseg_arr.len;
- total_type_mem = total_segs * hugepage_sz;
- type_msl_idx++;
-
- if (alloc_va_space(msl)) {
- RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
- return -1;
- }
- }
- total_mem += total_type_mem;
- }
- return 0;
-}
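
[To make the (N*2)-1 sizing above concrete, take a hypothetical contigmem
configuration of two 1 GB buffers (N = 2):

	avail_segs = (2 * 2) - 1 = 3    /* worst case: seg, hole, seg */
	avail_mem  = 3 * 1 GB    = 3 GB

Up to 3 GB of VA is reserved for the type, of which at most 2 GB can ever be
backed; the middle slot only turns into a hole if the two buffers are not
IOVA-contiguous.]
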
-
-static int
-memseg_secondary_init(void)
-{
- struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- int msl_idx = 0;
- struct rte_memseg_list *msl;
-
- for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
-
- msl = &mcfg->memsegs[msl_idx];
-
- /* skip empty memseg lists */
- if (msl->memseg_arr.len == 0)
- continue;
-
- if (rte_fbarray_attach(&msl->memseg_arr)) {
- RTE_LOG(ERR, EAL, "Cannot attach to primary process memseg lists\n");
- return -1;
- }
-
- /* preallocate VA space */
- if (alloc_va_space(msl)) {
- RTE_LOG(ERR, EAL, "Cannot preallocate VA space for hugepage memory\n");
- return -1;
- }
- }
-
- return 0;
-}
-
-int
-rte_eal_memseg_init(void)
-{
- return rte_eal_process_type() == RTE_PROC_PRIMARY ?
- memseg_primary_init() :
- memseg_secondary_init();
-}
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
- */
-
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <unistd.h>
-#include <sched.h>
-#include <pthread_np.h>
-#include <sys/queue.h>
-#include <sys/thr.h>
-
-#include <rte_debug.h>
-#include <rte_atomic.h>
-#include <rte_launch.h>
-#include <rte_log.h>
-#include <rte_memory.h>
-#include <rte_per_lcore.h>
-#include <rte_eal.h>
-#include <rte_lcore.h>
-
-#include "eal_private.h"
-#include "eal_thread.h"
-
-RTE_DEFINE_PER_LCORE(unsigned, _lcore_id) = LCORE_ID_ANY;
-RTE_DEFINE_PER_LCORE(unsigned, _socket_id) = (unsigned)SOCKET_ID_ANY;
-RTE_DEFINE_PER_LCORE(rte_cpuset_t, _cpuset);
-
-/*
- * Send a message to a slave lcore identified by slave_id to call a
- * function f with argument arg. Once the execution is done, the
- * remote lcore switches to the FINISHED state.
- */
-int
-rte_eal_remote_launch(int (*f)(void *), void *arg, unsigned slave_id)
-{
- int n;
- char c = 0;
- int m2s = lcore_config[slave_id].pipe_master2slave[1];
- int s2m = lcore_config[slave_id].pipe_slave2master[0];
-
- if (lcore_config[slave_id].state != WAIT)
- return -EBUSY;
-
- lcore_config[slave_id].f = f;
- lcore_config[slave_id].arg = arg;
-
- /* send message */
- n = 0;
- while (n == 0 || (n < 0 && errno == EINTR))
- n = write(m2s, &c, 1);
- if (n < 0)
- rte_panic("cannot write on configuration pipe\n");
-
- /* wait ack */
- do {
- n = read(s2m, &c, 1);
- } while (n < 0 && errno == EINTR);
-
- if (n <= 0)
- rte_panic("cannot read on configuration pipe\n");
-
- return 0;
-}
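
[The counterpart on the master side pairs each launch with a wait. A minimal
sketch using the public API; worker and run_workers are illustrative names:]

#include <rte_launch.h>
#include <rte_lcore.h>

static int
worker(void *arg)
{
	(void)arg;
	/* ... per-lcore work ... */
	return 0;
}

static void
run_workers(void)
{
	unsigned int lcore_id;

	RTE_LCORE_FOREACH_SLAVE(lcore_id)
		rte_eal_remote_launch(worker, NULL, lcore_id);

	RTE_LCORE_FOREACH_SLAVE(lcore_id)
		rte_eal_wait_lcore(lcore_id); /* collects each return value */
}
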
-
-/* set affinity for current thread */
-static int
-eal_thread_set_affinity(void)
-{
- unsigned lcore_id = rte_lcore_id();
-
- /* acquire system unique id */
- rte_gettid();
-
- /* update EAL thread core affinity */
- return rte_thread_set_affinity(&lcore_config[lcore_id].cpuset);
-}
-
-void eal_thread_init_master(unsigned lcore_id)
-{
- /* set the lcore ID in per-lcore memory area */
- RTE_PER_LCORE(_lcore_id) = lcore_id;
-
- /* set CPU affinity */
- if (eal_thread_set_affinity() < 0)
- rte_panic("cannot set affinity\n");
-}
-
-/* main loop of threads */
-__attribute__((noreturn)) void *
-eal_thread_loop(__attribute__((unused)) void *arg)
-{
- char c;
- int n, ret;
- unsigned lcore_id;
- pthread_t thread_id;
- int m2s, s2m;
- char cpuset[RTE_CPU_AFFINITY_STR_LEN];
-
- thread_id = pthread_self();
-
- /* retrieve our lcore_id from the configuration structure */
- RTE_LCORE_FOREACH_SLAVE(lcore_id) {
- if (thread_id == lcore_config[lcore_id].thread_id)
- break;
- }
- if (lcore_id == RTE_MAX_LCORE)
- rte_panic("cannot retrieve lcore id\n");
-
- m2s = lcore_config[lcore_id].pipe_master2slave[0];
- s2m = lcore_config[lcore_id].pipe_slave2master[1];
-
- /* set the lcore ID in per-lcore memory area */
- RTE_PER_LCORE(_lcore_id) = lcore_id;
-
- /* set CPU affinity */
- if (eal_thread_set_affinity() < 0)
- rte_panic("cannot set affinity\n");
-
- ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset));
-
- RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%p;cpuset=[%s%s])\n",
- lcore_id, thread_id, cpuset, ret == 0 ? "" : "...");
-
- /* read on our pipe to get commands */
- while (1) {
- void *fct_arg;
-
- /* wait command */
- do {
- n = read(m2s, &c, 1);
- } while (n < 0 && errno == EINTR);
-
- if (n <= 0)
- rte_panic("cannot read on configuration pipe\n");
-
- lcore_config[lcore_id].state = RUNNING;
-
- /* send ack */
- n = 0;
- while (n == 0 || (n < 0 && errno == EINTR))
- n = write(s2m, &c, 1);
- if (n < 0)
- rte_panic("cannot write on configuration pipe\n");
-
- if (lcore_config[lcore_id].f == NULL)
- rte_panic("NULL function pointer\n");
-
- /* call the function and store the return value */
- fct_arg = lcore_config[lcore_id].arg;
- ret = lcore_config[lcore_id].f(fct_arg);
- lcore_config[lcore_id].ret = ret;
- rte_wmb();
- lcore_config[lcore_id].state = FINISHED;
- }
-
- /* never reached */
- /* pthread_exit(NULL); */
- /* return NULL; */
-}
-
-/* retrieve the tid of the calling thread, as gettid() would */
-int rte_sys_gettid(void)
-{
- long lwpid;
- thr_self(&lwpid);
- return (int)lwpid;
-}
-
-int rte_thread_setname(pthread_t id, const char *name)
-{
- /* this BSD function returns no error */
- pthread_set_name_np(id, name);
- return 0;
-}
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
- */
-#include <string.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <inttypes.h>
-#include <sys/types.h>
-#include <sys/sysctl.h>
-#include <errno.h>
-
-#include <rte_common.h>
-#include <rte_log.h>
-#include <rte_cycles.h>
-#include <rte_memory.h>
-#include <rte_eal.h>
-#include <rte_debug.h>
-
-#include "eal_private.h"
-#include "eal_internal_cfg.h"
-
-#ifdef RTE_LIBEAL_USE_HPET
-#warning HPET is not supported in FreeBSD
-#endif
-
-enum timer_source eal_timer_source = EAL_TIMER_TSC;
-
-uint64_t
-get_tsc_freq(void)
-{
- size_t sz;
- int tmp;
- uint64_t tsc_hz;
-
- sz = sizeof(tmp);
- tmp = 0;
-
- if (sysctlbyname("kern.timecounter.smp_tsc", &tmp, &sz, NULL, 0))
- RTE_LOG(WARNING, EAL, "%s\n", strerror(errno));
- else if (tmp != 1)
- RTE_LOG(WARNING, EAL, "TSC is not safe to use in SMP mode\n");
-
- tmp = 0;
-
- if (sysctlbyname("kern.timecounter.invariant_tsc", &tmp, &sz, NULL, 0))
- RTE_LOG(WARNING, EAL, "%s\n", strerror(errno));
- else if (tmp != 1)
- RTE_LOG(WARNING, EAL, "TSC is not invariant\n");
-
- sz = sizeof(tsc_hz);
- if (sysctlbyname("machdep.tsc_freq", &tsc_hz, &sz, NULL, 0)) {
- RTE_LOG(WARNING, EAL, "%s\n", strerror(errno));
- return 0;
- }
-
- return tsc_hz;
-}
-
-int
-rte_eal_timer_init(void)
-{
- set_tsc_freq();
- return 0;
-}
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2019 Intel Corporation
- */
-
-#ifndef _RTE_OS_H_
-#define _RTE_OS_H_
-
-/**
- * This is header should contain any function/macro definition
- * which are not supported natively or named differently in the
- * freebsd OS. Functions will be added in future releases.
- */
-
-#include <pthread_np.h>
-
-typedef cpuset_t rte_cpuset_t;
-#define RTE_CPU_AND(dst, src1, src2) do \
-{ \
- cpuset_t tmp; \
- CPU_COPY(src1, &tmp); \
- CPU_AND(&tmp, src2); \
- CPU_COPY(&tmp, dst); \
-} while (0)
-#define RTE_CPU_OR(dst, src1, src2) do \
-{ \
- cpuset_t tmp; \
- CPU_COPY(src1, &tmp); \
- CPU_OR(&tmp, src2); \
- CPU_COPY(&tmp, dst); \
-} while (0)
-#define RTE_CPU_FILL(set) CPU_FILL(set)
-
-/* In FreeBSD 13 CPU_NAND macro is CPU_ANDNOT */
-#ifdef CPU_NAND
-#define RTE_CPU_NOT(dst, src) do \
-{ \
- cpuset_t tmp; \
- CPU_FILL(&tmp); \
- CPU_NAND(&tmp, src); \
- CPU_COPY(&tmp, dst); \
-} while (0)
-#else
-#define RTE_CPU_NOT(dst, src) do \
-{ \
- cpuset_t tmp; \
- CPU_FILL(&tmp); \
- CPU_ANDNOT(&tmp, src); \
- CPU_COPY(&tmp, dst); \
-} while (0)
-#endif
-
-#endif /* _RTE_OS_H_ */
+++ /dev/null
-# SPDX-License-Identifier: BSD-3-Clause
-# Copyright(c) 2017 Intel Corporation
-
-env_objs = []
-env_headers = files(
- 'include/rte_os.h',
-)
-env_sources = files('eal_alarm.c',
- 'eal_cpuflags.c',
- 'eal_debug.c',
- 'eal_hugepage_info.c',
- 'eal_interrupts.c',
- 'eal_lcore.c',
- 'eal_memalloc.c',
- 'eal_thread.c',
- 'eal_timer.c',
- 'eal.c',
- 'eal_memory.c',
- 'eal_dev.c'
-)
-
-deps += ['kvargs']
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2018 Intel Corporation
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <errno.h>
+
+#include <rte_alarm.h>
+#include <rte_cycles.h>
+#include <rte_common.h>
+#include <rte_errno.h>
+#include <rte_interrupts.h>
+#include <rte_spinlock.h>
+
+#include "eal_private.h"
+#include "eal_alarm_private.h"
+
+#define NS_PER_US 1000
+
+#ifdef CLOCK_MONOTONIC_RAW /* Defined in glibc bits/time.h */
+#define CLOCK_TYPE_ID CLOCK_MONOTONIC_RAW
+#else
+#define CLOCK_TYPE_ID CLOCK_MONOTONIC
+#endif
+
+struct alarm_entry {
+ LIST_ENTRY(alarm_entry) next;
+ struct rte_intr_handle handle;
+ struct timespec time;
+ rte_eal_alarm_callback cb_fn;
+ void *cb_arg;
+ volatile uint8_t executing;
+ volatile pthread_t executing_id;
+};
+
+static LIST_HEAD(alarm_list, alarm_entry) alarm_list = LIST_HEAD_INITIALIZER();
+static rte_spinlock_t alarm_list_lk = RTE_SPINLOCK_INITIALIZER;
+
+static struct rte_intr_handle intr_handle = { .fd = -1 };
+static void eal_alarm_callback(void *arg);
+
+int
+rte_eal_alarm_init(void)
+{
+ intr_handle.type = RTE_INTR_HANDLE_ALARM;
+
+	/* On FreeBSD, timers don't use fds, and their identifiers are stored
+	 * in a separate namespace from fds, so any value would do. However,
+	 * the EAL interrupt handler expects fds to be unique, so use an
+	 * actual fd to guarantee a unique timer identifier.
+	 */
+ intr_handle.fd = open("/dev/zero", O_RDONLY);
+
+ return 0;
+}
+
+static inline int
+timespec_cmp(const struct timespec *now, const struct timespec *at)
+{
+ if (now->tv_sec < at->tv_sec)
+ return -1;
+ if (now->tv_sec > at->tv_sec)
+ return 1;
+ if (now->tv_nsec < at->tv_nsec)
+ return -1;
+ if (now->tv_nsec > at->tv_nsec)
+ return 1;
+ return 0;
+}
+
+static inline uint64_t
+diff_ns(struct timespec *now, struct timespec *at)
+{
+ uint64_t now_ns, at_ns;
+
+ if (timespec_cmp(now, at) >= 0)
+ return 0;
+
+ now_ns = now->tv_sec * NS_PER_S + now->tv_nsec;
+ at_ns = at->tv_sec * NS_PER_S + at->tv_nsec;
+
+ return at_ns - now_ns;
+}
+
+int
+eal_alarm_get_timeout_ns(uint64_t *val)
+{
+ struct alarm_entry *ap;
+ struct timespec now;
+
+ if (clock_gettime(CLOCK_TYPE_ID, &now) < 0)
+ return -1;
+
+ if (LIST_EMPTY(&alarm_list))
+ return -1;
+
+ ap = LIST_FIRST(&alarm_list);
+
+ *val = diff_ns(&now, &ap->time);
+
+ return 0;
+}
+
+static int
+unregister_current_callback(void)
+{
+ struct alarm_entry *ap;
+ int ret = 0;
+
+ if (!LIST_EMPTY(&alarm_list)) {
+ ap = LIST_FIRST(&alarm_list);
+
+ do {
+ ret = rte_intr_callback_unregister(&intr_handle,
+ eal_alarm_callback, &ap->time);
+ } while (ret == -EAGAIN);
+ }
+
+ return ret;
+}
+
+static int
+register_first_callback(void)
+{
+ struct alarm_entry *ap;
+ int ret = 0;
+
+ if (!LIST_EMPTY(&alarm_list)) {
+ ap = LIST_FIRST(&alarm_list);
+
+ /* register a new callback */
+ ret = rte_intr_callback_register(&intr_handle,
+ eal_alarm_callback, &ap->time);
+ }
+ return ret;
+}
+
+static void
+eal_alarm_callback(void *arg __rte_unused)
+{
+ struct timespec now;
+ struct alarm_entry *ap;
+
+ rte_spinlock_lock(&alarm_list_lk);
+ ap = LIST_FIRST(&alarm_list);
+
+	if (clock_gettime(CLOCK_TYPE_ID, &now) < 0) {
+		/* don't return with the alarm list still locked */
+		rte_spinlock_unlock(&alarm_list_lk);
+		return;
+	}
+
+ while (ap != NULL && timespec_cmp(&now, &ap->time) >= 0) {
+ ap->executing = 1;
+ ap->executing_id = pthread_self();
+ rte_spinlock_unlock(&alarm_list_lk);
+
+ ap->cb_fn(ap->cb_arg);
+
+ rte_spinlock_lock(&alarm_list_lk);
+
+ LIST_REMOVE(ap, next);
+ free(ap);
+
+ ap = LIST_FIRST(&alarm_list);
+ }
+
+ /* timer has been deleted from the kqueue, so recreate it if needed */
+ register_first_callback();
+
+ rte_spinlock_unlock(&alarm_list_lk);
+}
+
+
+int
+rte_eal_alarm_set(uint64_t us, rte_eal_alarm_callback cb_fn, void *cb_arg)
+{
+ struct alarm_entry *ap, *new_alarm;
+ struct timespec now;
+ uint64_t ns;
+ int ret = 0;
+
+	/* check parameters; also ensure 'us' won't cause a uint64_t overflow */
+ if (us < 1 || us > (UINT64_MAX - US_PER_S) || cb_fn == NULL)
+ return -EINVAL;
+
+ new_alarm = calloc(1, sizeof(*new_alarm));
+ if (new_alarm == NULL)
+ return -ENOMEM;
+
+ /* use current time to calculate absolute time of alarm */
+ clock_gettime(CLOCK_TYPE_ID, &now);
+
+ ns = us * NS_PER_US;
+
+ new_alarm->cb_fn = cb_fn;
+ new_alarm->cb_arg = cb_arg;
+ new_alarm->time.tv_nsec = (now.tv_nsec + ns) % NS_PER_S;
+ new_alarm->time.tv_sec = now.tv_sec + ((now.tv_nsec + ns) / NS_PER_S);
+
+ rte_spinlock_lock(&alarm_list_lk);
+
+ if (LIST_EMPTY(&alarm_list))
+ LIST_INSERT_HEAD(&alarm_list, new_alarm, next);
+ else {
+ LIST_FOREACH(ap, &alarm_list, next) {
+ if (timespec_cmp(&new_alarm->time, &ap->time) < 0) {
+ LIST_INSERT_BEFORE(ap, new_alarm, next);
+ break;
+ }
+ if (LIST_NEXT(ap, next) == NULL) {
+ LIST_INSERT_AFTER(ap, new_alarm, next);
+ break;
+ }
+ }
+ }
+
+ /* re-register first callback just in case */
+ register_first_callback();
+
+ rte_spinlock_unlock(&alarm_list_lk);
+
+ return ret;
+}
+
+int
+rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn, void *cb_arg)
+{
+ struct alarm_entry *ap, *ap_prev;
+ int count = 0;
+ int err = 0;
+ int executing;
+
+ if (!cb_fn) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ do {
+ executing = 0;
+ rte_spinlock_lock(&alarm_list_lk);
+ /* remove any matches at the start of the list */
+ while (1) {
+ ap = LIST_FIRST(&alarm_list);
+ if (ap == NULL)
+ break;
+ if (cb_fn != ap->cb_fn)
+ break;
+ if (cb_arg != ap->cb_arg && cb_arg != (void *) -1)
+ break;
+ if (ap->executing == 0) {
+ LIST_REMOVE(ap, next);
+ free(ap);
+ count++;
+ } else {
+				/* If called from another context, mark that
+				 * the alarm is executing so the loop can spin
+				 * until it finishes. Otherwise we are trying
+				 * to cancel ourselves - flag it with
+				 * EINPROGRESS.
+				 */
+ if (pthread_equal(ap->executing_id,
+ pthread_self()) == 0)
+ executing++;
+ else
+ err = EINPROGRESS;
+
+ break;
+ }
+ }
+ ap_prev = ap;
+
+ /* now go through list, removing entries not at start */
+ LIST_FOREACH(ap, &alarm_list, next) {
+ /* this won't be true first time through */
+ if (cb_fn == ap->cb_fn &&
+ (cb_arg == (void *)-1 ||
+ cb_arg == ap->cb_arg)) {
+ if (ap->executing == 0) {
+ LIST_REMOVE(ap, next);
+ free(ap);
+ count++;
+ ap = ap_prev;
+ } else if (pthread_equal(ap->executing_id,
+ pthread_self()) == 0) {
+ executing++;
+ } else {
+ err = EINPROGRESS;
+ }
+ }
+ ap_prev = ap;
+ }
+ rte_spinlock_unlock(&alarm_list_lk);
+ } while (executing != 0);
+
+ if (count == 0 && err == 0)
+ rte_errno = ENOENT;
+ else if (err)
+ rte_errno = err;
+
+ rte_spinlock_lock(&alarm_list_lk);
+
+ /* unregister if no alarms left, otherwise re-register first */
+ if (LIST_EMPTY(&alarm_list))
+ unregister_current_callback();
+ else
+ register_first_callback();
+
+ rte_spinlock_unlock(&alarm_list_lk);
+
+ return count;
+}
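
For context (not part of the patch): a minimal usage sketch of the alarm API implemented above. The callback name and the 100 ms timeout are illustrative; rte_eal_alarm_set() turns the relative delay into an absolute timespec and inserts it into the sorted list, and (void *)-1 as the cancel argument matches any cb_arg, as handled in rte_eal_alarm_cancel().

#include <stdio.h>

#include <rte_alarm.h>

static void
my_alarm_cb(void *arg)
{
	/* runs once in the interrupt thread when the timer expires */
	printf("alarm fired, arg=%p\n", arg);
}

static void
arm_and_cancel(void *ctx)
{
	/* one-shot alarm, 100 ms (100000 us) from now */
	if (rte_eal_alarm_set(100 * 1000, my_alarm_cb, ctx) < 0)
		printf("failed to set alarm\n");

	/* cancel all pending instances of this callback, any argument */
	rte_eal_alarm_cancel(my_alarm_cb, (void *)-1);
}
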
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef EAL_ALARM_PRIVATE_H
+#define EAL_ALARM_PRIVATE_H
+
+#include <inttypes.h>
+
+/*
+ * FreeBSD needs a back-channel communication mechanism between the interrupt
+ * and alarm threads, because on FreeBSD the timer period is set up inside the
+ * interrupt API rather than inside the alarm API as on Linux.
+ */
+
+int
+eal_alarm_get_timeout_ns(uint64_t *val);
+
+#endif /* EAL_ALARM_PRIVATE_H */
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2018 Mellanox Technologies, Ltd
+ */
+
+#include <rte_common.h>
+#include <rte_cpuflags.h>
+
+unsigned long
+rte_cpu_getauxval(unsigned long type __rte_unused)
+{
+ /* not implemented */
+ return 0;
+}
+
+int
+rte_cpu_strcmp_auxval(unsigned long type __rte_unused,
+ const char *str __rte_unused)
+{
+ /* not implemented */
+ return -1;
+}
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#ifdef RTE_BACKTRACE
+#include <execinfo.h>
+#endif
+#include <stdarg.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+
+#include <rte_log.h>
+#include <rte_debug.h>
+#include <rte_common.h>
+#include <rte_eal.h>
+
+#define BACKTRACE_SIZE 256
+
+/* dump the stack of the calling core */
+void rte_dump_stack(void)
+{
+#ifdef RTE_BACKTRACE
+ void *func[BACKTRACE_SIZE];
+ char **symb = NULL;
+ int size;
+
+ size = backtrace(func, BACKTRACE_SIZE);
+ symb = backtrace_symbols(func, size);
+
+ if (symb == NULL)
+ return;
+
+ while (size > 0) {
+ rte_log(RTE_LOG_ERR, RTE_LOGTYPE_EAL,
+ "%d: [%s]\n", size, symb[size - 1]);
+		size--;
+ }
+
+ free(symb);
+#endif /* RTE_BACKTRACE */
+}
+
+/* not implemented in this environment */
+void rte_dump_registers(void)
+{
+ return;
+}
+
+/* call abort(), it will generate a coredump if enabled */
+void __rte_panic(const char *funcname, const char *format, ...)
+{
+ va_list ap;
+
+ rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, "PANIC in %s():\n", funcname);
+ va_start(ap, format);
+ rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap);
+ va_end(ap);
+ rte_dump_stack();
+ rte_dump_registers();
+ abort();
+}
+
+/*
+ * Like rte_panic, this terminates the application. However, no traceback is
+ * provided and no core-dump is generated.
+ */
+void
+rte_exit(int exit_code, const char *format, ...)
+{
+ va_list ap;
+
+ if (exit_code != 0)
+ RTE_LOG(CRIT, EAL, "Error - exiting with code: %d\n"
+ " Cause: ", exit_code);
+
+ va_start(ap, format);
+ rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap);
+ va_end(ap);
+
+#ifndef RTE_EAL_ALWAYS_PANIC_ON_ERROR
+ if (rte_eal_cleanup() != 0)
+ RTE_LOG(CRIT, EAL,
+ "EAL could not release all resources\n");
+ exit(exit_code);
+#else
+ rte_dump_stack();
+ rte_dump_registers();
+ abort();
+#endif
+}
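
As an aside, a minimal sketch (names illustrative) of how the two exit paths above differ in practice: rte_exit() reports the cause and releases EAL resources through rte_eal_cleanup() before exiting, while rte_panic() dumps the stack and abort()s, producing a core dump if enabled.

#include <stdlib.h>

#include <rte_debug.h>
#include <rte_eal.h>

int
main(int argc, char **argv)
{
	/* recoverable startup error: log the cause and clean up EAL */
	if (rte_eal_init(argc, argv) < 0)
		rte_exit(EXIT_FAILURE, "Cannot init EAL\n");

	rte_eal_cleanup();
	return 0;
}
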
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <rte_log.h>
+#include <rte_compat.h>
+#include <rte_dev.h>
+
+int
+rte_dev_event_monitor_start(void)
+{
+ RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n");
+ return -1;
+}
+
+int
+rte_dev_event_monitor_stop(void)
+{
+ RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n");
+ return -1;
+}
+
+int
+rte_dev_hotplug_handle_enable(void)
+{
+ RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n");
+ return -1;
+}
+
+int
+rte_dev_hotplug_handle_disable(void)
+{
+ RTE_LOG(ERR, EAL, "Device event is not supported for FreeBSD\n");
+ return -1;
+}
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <sys/mman.h>
+#include <string.h>
+
+#include <rte_log.h>
+#include <fcntl.h>
+#include "eal_hugepages.h"
+#include "eal_internal_cfg.h"
+#include "eal_filesystem.h"
+
+#define CONTIGMEM_DEV "/dev/contigmem"
+
+/*
+ * Uses mmap to create a shared memory area for storage of data
+ * Used in this file to store the hugepage file map on disk
+ */
+static void *
+map_shared_memory(const char *filename, const size_t mem_size, int flags)
+{
+ void *retval;
+ int fd = open(filename, flags, 0600);
+ if (fd < 0)
+ return NULL;
+ if (ftruncate(fd, mem_size) < 0) {
+ close(fd);
+ return NULL;
+ }
+	retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED,
+			fd, 0);
+	close(fd);
+	/* callers check for NULL, so normalize mmap failure */
+	return retval == MAP_FAILED ? NULL : retval;
+}
+
+static void *
+open_shared_memory(const char *filename, const size_t mem_size)
+{
+ return map_shared_memory(filename, mem_size, O_RDWR);
+}
+
+static void *
+create_shared_memory(const char *filename, const size_t mem_size)
+{
+ return map_shared_memory(filename, mem_size, O_RDWR | O_CREAT);
+}
+
+/*
+ * No hugepage support on FreeBSD, so we fake it using the contigmem driver
+ */
+int
+eal_hugepage_info_init(void)
+{
+ size_t sysctl_size;
+ int num_buffers, fd, error;
+ int64_t buffer_size;
+	/* reuse the Linux "internal config" structure for our memory data */
+ struct hugepage_info *hpi = &internal_config.hugepage_info[0];
+ struct hugepage_info *tmp_hpi;
+ unsigned int i;
+
+ internal_config.num_hugepage_sizes = 1;
+
+ sysctl_size = sizeof(num_buffers);
+ error = sysctlbyname("hw.contigmem.num_buffers", &num_buffers,
+ &sysctl_size, NULL, 0);
+
+ if (error != 0) {
+ RTE_LOG(ERR, EAL, "could not read sysctl hw.contigmem.num_buffers\n");
+ return -1;
+ }
+
+ sysctl_size = sizeof(buffer_size);
+ error = sysctlbyname("hw.contigmem.buffer_size", &buffer_size,
+ &sysctl_size, NULL, 0);
+
+ if (error != 0) {
+ RTE_LOG(ERR, EAL, "could not read sysctl hw.contigmem.buffer_size\n");
+ return -1;
+ }
+
+ fd = open(CONTIGMEM_DEV, O_RDWR);
+ if (fd < 0) {
+ RTE_LOG(ERR, EAL, "could not open "CONTIGMEM_DEV"\n");
+ return -1;
+ }
+
+ if (buffer_size >= 1<<30)
+ RTE_LOG(INFO, EAL, "Contigmem driver has %d buffers, each of size %dGB\n",
+ num_buffers, (int)(buffer_size>>30));
+ else if (buffer_size >= 1<<20)
+ RTE_LOG(INFO, EAL, "Contigmem driver has %d buffers, each of size %dMB\n",
+ num_buffers, (int)(buffer_size>>20));
+ else
+ RTE_LOG(INFO, EAL, "Contigmem driver has %d buffers, each of size %dKB\n",
+ num_buffers, (int)(buffer_size>>10));
+
+ strlcpy(hpi->hugedir, CONTIGMEM_DEV, sizeof(hpi->hugedir));
+ hpi->hugepage_sz = buffer_size;
+ hpi->num_pages[0] = num_buffers;
+ hpi->lock_descriptor = fd;
+
+ /* for no shared files mode, do not create shared memory config */
+ if (internal_config.no_shconf)
+ return 0;
+
+ tmp_hpi = create_shared_memory(eal_hugepage_info_path(),
+ sizeof(internal_config.hugepage_info));
+	if (tmp_hpi == NULL) {
+ RTE_LOG(ERR, EAL, "Failed to create shared memory!\n");
+ return -1;
+ }
+
+ memcpy(tmp_hpi, hpi, sizeof(internal_config.hugepage_info));
+
+ /* we've copied file descriptors along with everything else, but they
+ * will be invalid in secondary process, so overwrite them
+ */
+ for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) {
+ struct hugepage_info *tmp = &tmp_hpi[i];
+ tmp->lock_descriptor = -1;
+ }
+
+ if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) {
+ RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+/* copy hugepage info from shared memory into internal config */
+int
+eal_hugepage_info_read(void)
+{
+ struct hugepage_info *hpi = &internal_config.hugepage_info[0];
+ struct hugepage_info *tmp_hpi;
+
+ internal_config.num_hugepage_sizes = 1;
+
+ tmp_hpi = open_shared_memory(eal_hugepage_info_path(),
+ sizeof(internal_config.hugepage_info));
+ if (tmp_hpi == NULL) {
+ RTE_LOG(ERR, EAL, "Failed to open shared memory!\n");
+ return -1;
+ }
+
+ memcpy(hpi, tmp_hpi, sizeof(internal_config.hugepage_info));
+
+ if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) {
+ RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n");
+ return -1;
+ }
+ return 0;
+}
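
For reference, the sysctl reads above can be exercised standalone. This sketch is illustrative and assumes the contigmem kernel module is loaded, so that the hw.contigmem.* OIDs exist:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int num_buffers;
	int64_t buffer_size;
	size_t sz = sizeof(num_buffers);

	if (sysctlbyname("hw.contigmem.num_buffers", &num_buffers,
			&sz, NULL, 0) != 0)
		return 1;

	sz = sizeof(buffer_size);
	if (sysctlbyname("hw.contigmem.buffer_size", &buffer_size,
			&sz, NULL, 0) != 0)
		return 1;

	printf("%d buffers of %jd bytes each\n", num_buffers,
			(intmax_t)buffer_size);
	return 0;
}
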
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2018 Intel Corporation
+ */
+
+#include <string.h>
+#include <sys/types.h>
+#include <sys/event.h>
+#include <sys/queue.h>
+#include <unistd.h>
+
+#include <rte_errno.h>
+#include <rte_lcore.h>
+#include <rte_spinlock.h>
+#include <rte_common.h>
+#include <rte_interrupts.h>
+
+#include "eal_private.h"
+#include "eal_alarm_private.h"
+
+#define MAX_INTR_EVENTS 16
+
+/**
+ * union buffer for reading on different devices
+ */
+union rte_intr_read_buffer {
+ char charbuf[16]; /* for others */
+};
+
+TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback);
+TAILQ_HEAD(rte_intr_source_list, rte_intr_source);
+
+struct rte_intr_callback {
+ TAILQ_ENTRY(rte_intr_callback) next;
+ rte_intr_callback_fn cb_fn; /**< callback address */
+ void *cb_arg; /**< parameter for callback */
+ uint8_t pending_delete; /**< delete after callback is called */
+ rte_intr_unregister_callback_fn ucb_fn; /**< fn to call before cb is deleted */
+};
+
+struct rte_intr_source {
+ TAILQ_ENTRY(rte_intr_source) next;
+ struct rte_intr_handle intr_handle; /**< interrupt handle */
+ struct rte_intr_cb_list callbacks; /**< user callbacks */
+ uint32_t active;
+};
+
+/* global spinlock for interrupt data operation */
+static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER;
+
+/* interrupt sources list */
+static struct rte_intr_source_list intr_sources;
+
+/* interrupt handling thread */
+static pthread_t intr_thread;
+
+static volatile int kq = -1;
+
+static int
+intr_source_to_kevent(const struct rte_intr_handle *ih, struct kevent *ke)
+{
+ /* alarm callbacks are special case */
+ if (ih->type == RTE_INTR_HANDLE_ALARM) {
+ uint64_t timeout_ns;
+
+ /* get soonest alarm timeout */
+ if (eal_alarm_get_timeout_ns(&timeout_ns) < 0)
+ return -1;
+
+ ke->filter = EVFILT_TIMER;
+ /* timers are one shot */
+ ke->flags |= EV_ONESHOT;
+ ke->fflags = NOTE_NSECONDS;
+ ke->data = timeout_ns;
+ } else {
+ ke->filter = EVFILT_READ;
+ }
+ ke->ident = ih->fd;
+
+ return 0;
+}
+
+int
+rte_intr_callback_register(const struct rte_intr_handle *intr_handle,
+ rte_intr_callback_fn cb, void *cb_arg)
+{
+ struct rte_intr_callback *callback;
+ struct rte_intr_source *src;
+ int ret, add_event = 0;
+
+ /* first do parameter checking */
+ if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) {
+ RTE_LOG(ERR, EAL,
+ "Registering with invalid input parameter\n");
+ return -EINVAL;
+ }
+ if (kq < 0) {
+ RTE_LOG(ERR, EAL, "Kqueue is not active: %d\n", kq);
+ return -ENODEV;
+ }
+
+ rte_spinlock_lock(&intr_lock);
+
+ /* find the source for this intr_handle */
+ TAILQ_FOREACH(src, &intr_sources, next) {
+ if (src->intr_handle.fd == intr_handle->fd)
+ break;
+ }
+
+ /* if this is an alarm interrupt and it already has a callback,
+ * then we don't want to create a new callback because the only
+ * thing on the list should be eal_alarm_callback() and we may
+ * be called just to reset the timer.
+ */
+ if (src != NULL && src->intr_handle.type == RTE_INTR_HANDLE_ALARM &&
+ !TAILQ_EMPTY(&src->callbacks)) {
+ callback = NULL;
+ } else {
+ /* allocate a new interrupt callback entity */
+ callback = calloc(1, sizeof(*callback));
+ if (callback == NULL) {
+ RTE_LOG(ERR, EAL, "Can not allocate memory\n");
+ ret = -ENOMEM;
+ goto fail;
+ }
+ callback->cb_fn = cb;
+ callback->cb_arg = cb_arg;
+ callback->pending_delete = 0;
+ callback->ucb_fn = NULL;
+
+ if (src == NULL) {
+ src = calloc(1, sizeof(*src));
+ if (src == NULL) {
+ RTE_LOG(ERR, EAL, "Can not allocate memory\n");
+ ret = -ENOMEM;
+ goto fail;
+ } else {
+ src->intr_handle = *intr_handle;
+ TAILQ_INIT(&src->callbacks);
+ TAILQ_INSERT_TAIL(&intr_sources, src, next);
+ }
+ }
+
+ /* we had no interrupts for this */
+ if (TAILQ_EMPTY(&src->callbacks))
+ add_event = 1;
+
+ TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
+ }
+
+ /* add events to the queue. timer events are special as we need to
+ * re-set the timer.
+ */
+ if (add_event || src->intr_handle.type == RTE_INTR_HANDLE_ALARM) {
+ struct kevent ke;
+
+ memset(&ke, 0, sizeof(ke));
+ ke.flags = EV_ADD; /* mark for addition to the queue */
+
+ if (intr_source_to_kevent(intr_handle, &ke) < 0) {
+ RTE_LOG(ERR, EAL, "Cannot convert interrupt handle to kevent\n");
+ ret = -ENODEV;
+ goto fail;
+ }
+
+ /**
+		 * add the intr file descriptor to the wait list.
+ */
+ if (kevent(kq, &ke, 1, NULL, 0, NULL) < 0) {
+ /* currently, nic_uio does not support interrupts, so
+			 * this error would always be triggered and shown to
+			 * the user. So, don't output it unless the debug log
+			 * level is set.
+ */
+ if (errno == ENODEV)
+ RTE_LOG(DEBUG, EAL, "Interrupt handle %d not supported\n",
+ src->intr_handle.fd);
+ else
+ RTE_LOG(ERR, EAL, "Error adding fd %d "
+ "kevent, %s\n",
+ src->intr_handle.fd,
+ strerror(errno));
+ ret = -errno;
+ goto fail;
+ }
+ }
+ rte_spinlock_unlock(&intr_lock);
+
+ return 0;
+fail:
+ /* clean up */
+ if (src != NULL) {
+ if (callback != NULL)
+ TAILQ_REMOVE(&(src->callbacks), callback, next);
+ if (TAILQ_EMPTY(&(src->callbacks))) {
+ TAILQ_REMOVE(&intr_sources, src, next);
+ free(src);
+ }
+ }
+ free(callback);
+ rte_spinlock_unlock(&intr_lock);
+ return ret;
+}
+
+int
+rte_intr_callback_unregister_pending(const struct rte_intr_handle *intr_handle,
+ rte_intr_callback_fn cb_fn, void *cb_arg,
+ rte_intr_unregister_callback_fn ucb_fn)
+{
+ int ret;
+ struct rte_intr_source *src;
+ struct rte_intr_callback *cb, *next;
+
+ /* do parameter checking first */
+ if (intr_handle == NULL || intr_handle->fd < 0) {
+ RTE_LOG(ERR, EAL,
+ "Unregistering with invalid input parameter\n");
+ return -EINVAL;
+ }
+
+ if (kq < 0) {
+ RTE_LOG(ERR, EAL, "Kqueue is not active\n");
+ return -ENODEV;
+ }
+
+ rte_spinlock_lock(&intr_lock);
+
+	/* check if an interrupt source exists for the fd */
+ TAILQ_FOREACH(src, &intr_sources, next)
+ if (src->intr_handle.fd == intr_handle->fd)
+ break;
+
+ /* No interrupt source registered for the fd */
+ if (src == NULL) {
+ ret = -ENOENT;
+
+ /* only usable if the source is active */
+ } else if (src->active == 0) {
+ ret = -EAGAIN;
+
+ } else {
+ ret = 0;
+
+ /* walk through the callbacks and mark all that match. */
+ for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
+ next = TAILQ_NEXT(cb, next);
+ if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
+ cb->cb_arg == cb_arg)) {
+ cb->pending_delete = 1;
+ cb->ucb_fn = ucb_fn;
+ ret++;
+ }
+ }
+ }
+
+ rte_spinlock_unlock(&intr_lock);
+
+ return ret;
+}
+
+int
+rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle,
+ rte_intr_callback_fn cb_fn, void *cb_arg)
+{
+ int ret;
+ struct rte_intr_source *src;
+ struct rte_intr_callback *cb, *next;
+
+ /* do parameter checking first */
+ if (intr_handle == NULL || intr_handle->fd < 0) {
+ RTE_LOG(ERR, EAL,
+ "Unregistering with invalid input parameter\n");
+ return -EINVAL;
+ }
+ if (kq < 0) {
+ RTE_LOG(ERR, EAL, "Kqueue is not active\n");
+ return -ENODEV;
+ }
+
+ rte_spinlock_lock(&intr_lock);
+
+	/* check if an interrupt source exists for the fd */
+ TAILQ_FOREACH(src, &intr_sources, next)
+ if (src->intr_handle.fd == intr_handle->fd)
+ break;
+
+ /* No interrupt source registered for the fd */
+ if (src == NULL) {
+ ret = -ENOENT;
+
+ /* interrupt source has some active callbacks right now. */
+ } else if (src->active != 0) {
+ ret = -EAGAIN;
+
+ /* ok to remove. */
+ } else {
+ struct kevent ke;
+
+ ret = 0;
+
+ /* remove it from the kqueue */
+ memset(&ke, 0, sizeof(ke));
+ ke.flags = EV_DELETE; /* mark for deletion from the queue */
+
+ if (intr_source_to_kevent(intr_handle, &ke) < 0) {
+ RTE_LOG(ERR, EAL, "Cannot convert to kevent\n");
+ ret = -ENODEV;
+ goto out;
+ }
+
+ /**
+		 * remove the intr file descriptor from the wait list.
+ */
+ if (kevent(kq, &ke, 1, NULL, 0, NULL) < 0) {
+ RTE_LOG(ERR, EAL, "Error removing fd %d kevent, %s\n",
+ src->intr_handle.fd, strerror(errno));
+			/* removing a non-existent event is an expected
+			 * condition in some circumstances (e.g. one-shot
+			 * events).
+			 */
+ }
+
+		/* walk through the callbacks and remove all that match. */
+ for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
+ next = TAILQ_NEXT(cb, next);
+ if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
+ cb->cb_arg == cb_arg)) {
+ TAILQ_REMOVE(&src->callbacks, cb, next);
+ free(cb);
+ ret++;
+ }
+ }
+
+ /* all callbacks for that source are removed. */
+ if (TAILQ_EMPTY(&src->callbacks)) {
+ TAILQ_REMOVE(&intr_sources, src, next);
+ free(src);
+ }
+ }
+out:
+ rte_spinlock_unlock(&intr_lock);
+
+ return ret;
+}
+
+int
+rte_intr_enable(const struct rte_intr_handle *intr_handle)
+{
+ if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV)
+ return 0;
+
+ if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
+ return -1;
+
+ switch (intr_handle->type) {
+ /* not used at this moment */
+ case RTE_INTR_HANDLE_ALARM:
+ return -1;
+ /* not used at this moment */
+ case RTE_INTR_HANDLE_DEV_EVENT:
+ return -1;
+ /* unknown handle type */
+ default:
+ RTE_LOG(ERR, EAL,
+ "Unknown handle type of fd %d\n",
+ intr_handle->fd);
+ return -1;
+ }
+
+ return 0;
+}
+
+int
+rte_intr_disable(const struct rte_intr_handle *intr_handle)
+{
+ if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV)
+ return 0;
+
+ if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
+ return -1;
+
+ switch (intr_handle->type) {
+ /* not used at this moment */
+ case RTE_INTR_HANDLE_ALARM:
+ return -1;
+ /* not used at this moment */
+ case RTE_INTR_HANDLE_DEV_EVENT:
+ return -1;
+ /* unknown handle type */
+ default:
+ RTE_LOG(ERR, EAL,
+ "Unknown handle type of fd %d\n",
+ intr_handle->fd);
+ return -1;
+ }
+
+ return 0;
+}
+
+int
+rte_intr_ack(const struct rte_intr_handle *intr_handle)
+{
+ if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV)
+ return 0;
+
+ return -1;
+}
+
+static void
+eal_intr_process_interrupts(struct kevent *events, int nfds)
+{
+ struct rte_intr_callback active_cb;
+ union rte_intr_read_buffer buf;
+ struct rte_intr_callback *cb, *next;
+ struct rte_intr_source *src;
+ bool call = false;
+ int n, bytes_read;
+ struct kevent ke;
+
+ for (n = 0; n < nfds; n++) {
+ int event_fd = events[n].ident;
+
+ rte_spinlock_lock(&intr_lock);
+ TAILQ_FOREACH(src, &intr_sources, next)
+ if (src->intr_handle.fd == event_fd)
+ break;
+ if (src == NULL) {
+ rte_spinlock_unlock(&intr_lock);
+ continue;
+ }
+
+ /* mark this interrupt source as active and release the lock. */
+ src->active = 1;
+ rte_spinlock_unlock(&intr_lock);
+
+		/* set the length to be read for each handle type */
+ switch (src->intr_handle.type) {
+ case RTE_INTR_HANDLE_ALARM:
+ bytes_read = 0;
+ call = true;
+ break;
+ case RTE_INTR_HANDLE_VDEV:
+ case RTE_INTR_HANDLE_EXT:
+ bytes_read = 0;
+ call = true;
+ break;
+ case RTE_INTR_HANDLE_DEV_EVENT:
+ bytes_read = 0;
+ call = true;
+ break;
+ default:
+ bytes_read = 1;
+ break;
+ }
+
+ if (bytes_read > 0) {
+ /**
+ * read out to clear the ready-to-be-read flag
+			 * before the next event wait.
+ */
+ bytes_read = read(event_fd, &buf, bytes_read);
+ if (bytes_read < 0) {
+ if (errno == EINTR || errno == EWOULDBLOCK)
+ continue;
+
+ RTE_LOG(ERR, EAL, "Error reading from file "
+ "descriptor %d: %s\n",
+ event_fd,
+ strerror(errno));
+ } else if (bytes_read == 0)
+ RTE_LOG(ERR, EAL, "Read nothing from file "
+ "descriptor %d\n", event_fd);
+ else
+ call = true;
+ }
+
+		/* take the lock again to call callbacks and update status. */
+ rte_spinlock_lock(&intr_lock);
+
+ if (call) {
+ /* Finally, call all callbacks. */
+ TAILQ_FOREACH(cb, &src->callbacks, next) {
+
+ /* make a copy and unlock. */
+ active_cb = *cb;
+ rte_spinlock_unlock(&intr_lock);
+
+ /* call the actual callback */
+ active_cb.cb_fn(active_cb.cb_arg);
+
+				/* get the lock back. */
+ rte_spinlock_lock(&intr_lock);
+ }
+ }
+
+		/* we are done with that interrupt source, release it. */
+ src->active = 0;
+
+		/* check if any callbacks are supposed to be removed */
+ for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
+ next = TAILQ_NEXT(cb, next);
+ if (cb->pending_delete) {
+ /* remove it from the kqueue */
+ memset(&ke, 0, sizeof(ke));
+ /* mark for deletion from the queue */
+ ke.flags = EV_DELETE;
+
+ if (intr_source_to_kevent(&src->intr_handle, &ke) < 0) {
+ RTE_LOG(ERR, EAL, "Cannot convert to kevent\n");
+ rte_spinlock_unlock(&intr_lock);
+ return;
+ }
+
+ /**
+			 * remove the intr file descriptor from the wait list.
+ */
+ if (kevent(kq, &ke, 1, NULL, 0, NULL) < 0) {
+ RTE_LOG(ERR, EAL, "Error removing fd %d kevent, "
+ "%s\n", src->intr_handle.fd,
+ strerror(errno));
+				/* removing a non-existent event is an
+				 * expected condition in some circumstances
+				 * (e.g. one-shot events).
+ */
+ }
+
+ TAILQ_REMOVE(&src->callbacks, cb, next);
+ if (cb->ucb_fn)
+ cb->ucb_fn(&src->intr_handle, cb->cb_arg);
+ free(cb);
+ }
+ }
+
+ /* all callbacks for that source are removed. */
+ if (TAILQ_EMPTY(&src->callbacks)) {
+ TAILQ_REMOVE(&intr_sources, src, next);
+ free(src);
+ }
+
+ rte_spinlock_unlock(&intr_lock);
+ }
+}
+
+static void *
+eal_intr_thread_main(void *arg __rte_unused)
+{
+ struct kevent events[MAX_INTR_EVENTS];
+ int nfds;
+
+ /* host thread, never break out */
+ for (;;) {
+ /* do not change anything, just wait */
+ nfds = kevent(kq, NULL, 0, events, MAX_INTR_EVENTS, NULL);
+
+		/* kevent failed */
+ if (nfds < 0) {
+ if (errno == EINTR)
+ continue;
+ RTE_LOG(ERR, EAL,
+				"kevent wait failed\n");
+ break;
+ }
+ /* kevent timeout, will never happen here */
+ else if (nfds == 0)
+ continue;
+
+ /* kevent has at least one fd ready to read */
+ eal_intr_process_interrupts(events, nfds);
+ }
+ close(kq);
+ kq = -1;
+ return NULL;
+}
+
+int
+rte_eal_intr_init(void)
+{
+ int ret = 0;
+
+ /* init the global interrupt source head */
+ TAILQ_INIT(&intr_sources);
+
+ kq = kqueue();
+ if (kq < 0) {
+ RTE_LOG(ERR, EAL, "Cannot create kqueue instance\n");
+ return -1;
+ }
+
+ /* create the host thread to wait/handle the interrupt */
+ ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL,
+ eal_intr_thread_main, NULL);
+ if (ret != 0) {
+ rte_errno = -ret;
+ RTE_LOG(ERR, EAL,
+ "Failed to create thread for interrupt handling\n");
+ }
+
+ return ret;
+}
+
+int
+rte_intr_rx_ctl(struct rte_intr_handle *intr_handle,
+ int epfd, int op, unsigned int vec, void *data)
+{
+ RTE_SET_USED(intr_handle);
+ RTE_SET_USED(epfd);
+ RTE_SET_USED(op);
+ RTE_SET_USED(vec);
+ RTE_SET_USED(data);
+
+ return -ENOTSUP;
+}
+
+int
+rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd)
+{
+ RTE_SET_USED(intr_handle);
+ RTE_SET_USED(nb_efd);
+
+ return 0;
+}
+
+void
+rte_intr_efd_disable(struct rte_intr_handle *intr_handle)
+{
+ RTE_SET_USED(intr_handle);
+}
+
+int
+rte_intr_dp_is_en(struct rte_intr_handle *intr_handle)
+{
+ RTE_SET_USED(intr_handle);
+ return 0;
+}
+
+int
+rte_intr_allow_others(struct rte_intr_handle *intr_handle)
+{
+ RTE_SET_USED(intr_handle);
+ return 1;
+}
+
+int
+rte_intr_cap_multiple(struct rte_intr_handle *intr_handle)
+{
+ RTE_SET_USED(intr_handle);
+ return 0;
+}
+
+int
+rte_epoll_wait(int epfd, struct rte_epoll_event *events,
+ int maxevents, int timeout)
+{
+ RTE_SET_USED(epfd);
+ RTE_SET_USED(events);
+ RTE_SET_USED(maxevents);
+ RTE_SET_USED(timeout);
+
+ return -ENOTSUP;
+}
+
+int
+rte_epoll_ctl(int epfd, int op, int fd, struct rte_epoll_event *event)
+{
+ RTE_SET_USED(epfd);
+ RTE_SET_USED(op);
+ RTE_SET_USED(fd);
+ RTE_SET_USED(event);
+
+ return -ENOTSUP;
+}
+
+int
+rte_intr_tls_epfd(void)
+{
+ return -ENOTSUP;
+}
+
+void
+rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle)
+{
+ RTE_SET_USED(intr_handle);
+}
+
+int rte_thread_is_intr(void)
+{
+ return pthread_equal(intr_thread, pthread_self());
+}
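
For reference, a standalone sketch of the one-shot kqueue timer pattern used by intr_source_to_kevent() above: EVFILT_TIMER with EV_ONESHOT and NOTE_NSECONDS, which is why the alarm code must re-add the timer after every expiry. The ident value and the 50 ms period are illustrative, and NOTE_NSECONDS assumes a reasonably recent FreeBSD.

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	struct kevent ke, ev;
	int kq = kqueue();

	if (kq < 0)
		return 1;

	/* one-shot 50 ms timer; ident is an arbitrary unique number */
	EV_SET(&ke, 1, EVFILT_TIMER, EV_ADD | EV_ONESHOT,
			NOTE_NSECONDS, 50 * 1000 * 1000, NULL);
	if (kevent(kq, &ke, 1, NULL, 0, NULL) < 0)
		return 1;

	/* block until it fires; a real loop would re-arm it here */
	if (kevent(kq, NULL, 0, &ev, 1, NULL) == 1)
		printf("timer fired (ident=%ju)\n", (uintmax_t)ev.ident);
	return 0;
}
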
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#include <unistd.h>
+#include <sys/sysctl.h>
+
+#include <rte_log.h>
+#include <rte_eal.h>
+#include <rte_lcore.h>
+#include <rte_common.h>
+#include <rte_debug.h>
+
+#include "eal_private.h"
+#include "eal_thread.h"
+
+/* No topology information is available on FreeBSD, including NUMA info */
+unsigned
+eal_cpu_core_id(__rte_unused unsigned lcore_id)
+{
+ return 0;
+}
+
+static int
+eal_get_ncpus(void)
+{
+ static int ncpu = -1;
+ int mib[2] = {CTL_HW, HW_NCPU};
+ size_t len = sizeof(ncpu);
+
+ if (ncpu < 0) {
+ sysctl(mib, 2, &ncpu, &len, NULL, 0);
+ RTE_LOG(INFO, EAL, "Sysctl reports %d cpus\n", ncpu);
+ }
+ return ncpu;
+}
+
+unsigned
+eal_cpu_socket_id(__rte_unused unsigned cpu_id)
+{
+ return 0;
+}
+
+/* Check if a cpu is present by comparing the lcore id against
+ * the cpu count reported by sysctl.
+ */
+int
+eal_cpu_detected(unsigned lcore_id)
+{
+ const unsigned ncpus = eal_get_ncpus();
+ return lcore_id < ncpus;
+}
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation
+ */
+
+#include <inttypes.h>
+
+#include <rte_errno.h>
+#include <rte_log.h>
+#include <rte_memory.h>
+
+#include "eal_memalloc.h"
+
+int
+eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms __rte_unused,
+ int __rte_unused n_segs, size_t __rte_unused page_sz,
+ int __rte_unused socket, bool __rte_unused exact)
+{
+ RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n");
+ return -1;
+}
+
+struct rte_memseg *
+eal_memalloc_alloc_seg(size_t __rte_unused page_sz, int __rte_unused socket)
+{
+ RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n");
+ return NULL;
+}
+
+int
+eal_memalloc_free_seg(struct rte_memseg *ms __rte_unused)
+{
+ RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n");
+ return -1;
+}
+
+int
+eal_memalloc_free_seg_bulk(struct rte_memseg **ms __rte_unused,
+ int n_segs __rte_unused)
+{
+ RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n");
+ return -1;
+}
+
+int
+eal_memalloc_sync_with_primary(void)
+{
+ RTE_LOG(ERR, EAL, "Memory hotplug not supported on FreeBSD\n");
+ return -1;
+}
+
+int
+eal_memalloc_get_seg_fd(int list_idx __rte_unused, int seg_idx __rte_unused)
+{
+ return -ENOTSUP;
+}
+
+int
+eal_memalloc_set_seg_fd(int list_idx __rte_unused, int seg_idx __rte_unused,
+ int fd __rte_unused)
+{
+ return -ENOTSUP;
+}
+
+int
+eal_memalloc_set_seg_list_fd(int list_idx __rte_unused, int fd __rte_unused)
+{
+ return -ENOTSUP;
+}
+
+int
+eal_memalloc_get_seg_fd_offset(int list_idx __rte_unused,
+ int seg_idx __rte_unused, size_t *offset __rte_unused)
+{
+ return -ENOTSUP;
+}
+
+int
+eal_memalloc_init(void)
+{
+ return 0;
+}
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+#include <sys/mman.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+
+#include <rte_eal.h>
+#include <rte_errno.h>
+#include <rte_log.h>
+#include <rte_string_fns.h>
+
+#include "eal_private.h"
+#include "eal_internal_cfg.h"
+#include "eal_filesystem.h"
+#include "eal_memcfg.h"
+#include "eal_options.h"
+
+#define EAL_PAGE_SIZE (sysconf(_SC_PAGESIZE))
+
+uint64_t eal_get_baseaddr(void)
+{
+ /*
+	 * FreeBSD may allocate something in the address space we are going to
+	 * use before we get a chance to map it, so use a base address that's far
+ * away from where malloc() et al usually map things.
+ */
+ return 0x1000000000ULL;
+}
+
+/*
+ * Get physical address of any mapped virtual address in the current process.
+ */
+phys_addr_t
+rte_mem_virt2phy(const void *virtaddr)
+{
+ /* XXX not implemented. This function is only used by
+ * rte_mempool_virt2iova() when hugepages are disabled. */
+ (void)virtaddr;
+ return RTE_BAD_IOVA;
+}
+
+rte_iova_t
+rte_mem_virt2iova(const void *virtaddr)
+{
+ return rte_mem_virt2phy(virtaddr);
+}
+
+int
+rte_eal_hugepage_init(void)
+{
+ struct rte_mem_config *mcfg;
+ uint64_t total_mem = 0;
+ void *addr;
+ unsigned int i, j, seg_idx = 0;
+
+ /* get pointer to global configuration */
+ mcfg = rte_eal_get_configuration()->mem_config;
+
+ /* for debug purposes, hugetlbfs can be disabled */
+ if (internal_config.no_hugetlbfs) {
+ struct rte_memseg_list *msl;
+ struct rte_fbarray *arr;
+ struct rte_memseg *ms;
+ uint64_t page_sz;
+ int n_segs, cur_seg;
+
+ /* create a memseg list */
+ msl = &mcfg->memsegs[0];
+
+ page_sz = RTE_PGSIZE_4K;
+ n_segs = internal_config.memory / page_sz;
+
+ if (rte_fbarray_init(&msl->memseg_arr, "nohugemem", n_segs,
+ sizeof(struct rte_memseg))) {
+ RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n");
+ return -1;
+ }
+
+ addr = mmap(NULL, internal_config.memory,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (addr == MAP_FAILED) {
+ RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__,
+ strerror(errno));
+ return -1;
+ }
+ msl->base_va = addr;
+ msl->page_sz = page_sz;
+ msl->len = internal_config.memory;
+ msl->socket_id = 0;
+ msl->heap = 1;
+
+ /* populate memsegs. each memseg is 1 page long */
+ for (cur_seg = 0; cur_seg < n_segs; cur_seg++) {
+ arr = &msl->memseg_arr;
+
+ ms = rte_fbarray_get(arr, cur_seg);
+ if (rte_eal_iova_mode() == RTE_IOVA_VA)
+ ms->iova = (uintptr_t)addr;
+ else
+ ms->iova = RTE_BAD_IOVA;
+ ms->addr = addr;
+ ms->hugepage_sz = page_sz;
+ ms->len = page_sz;
+ ms->socket_id = 0;
+
+ rte_fbarray_set_used(arr, cur_seg);
+
+ addr = RTE_PTR_ADD(addr, page_sz);
+ }
+ return 0;
+ }
+
+ /* map all hugepages and sort them */
+	for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
+ struct hugepage_info *hpi;
+ rte_iova_t prev_end = 0;
+ int prev_ms_idx = -1;
+ uint64_t page_sz, mem_needed;
+ unsigned int n_pages, max_pages;
+
+ hpi = &internal_config.hugepage_info[i];
+ page_sz = hpi->hugepage_sz;
+ max_pages = hpi->num_pages[0];
+ mem_needed = RTE_ALIGN_CEIL(internal_config.memory - total_mem,
+ page_sz);
+
+ n_pages = RTE_MIN(mem_needed / page_sz, max_pages);
+
+ for (j = 0; j < n_pages; j++) {
+ struct rte_memseg_list *msl;
+ struct rte_fbarray *arr;
+ struct rte_memseg *seg;
+ int msl_idx, ms_idx;
+ rte_iova_t physaddr;
+ int error;
+ size_t sysctl_size = sizeof(physaddr);
+ char physaddr_str[64];
+ bool is_adjacent;
+
+ /* first, check if this segment is IOVA-adjacent to
+ * the previous one.
+ */
+ snprintf(physaddr_str, sizeof(physaddr_str),
+ "hw.contigmem.physaddr.%d", j);
+ error = sysctlbyname(physaddr_str, &physaddr,
+ &sysctl_size, NULL, 0);
+ if (error < 0) {
+ RTE_LOG(ERR, EAL, "Failed to get physical addr for buffer %u "
+ "from %s\n", j, hpi->hugedir);
+ return -1;
+ }
+
+ is_adjacent = prev_end != 0 && physaddr == prev_end;
+ prev_end = physaddr + hpi->hugepage_sz;
+
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS;
+ msl_idx++) {
+ bool empty, need_hole;
+ msl = &mcfg->memsegs[msl_idx];
+ arr = &msl->memseg_arr;
+
+ if (msl->page_sz != page_sz)
+ continue;
+
+ empty = arr->count == 0;
+
+ /* we need a hole if this isn't an empty memseg
+ * list, and if previous segment was not
+ * adjacent to current one.
+ */
+ need_hole = !empty && !is_adjacent;
+
+ /* we need 1, plus hole if not adjacent */
+ ms_idx = rte_fbarray_find_next_n_free(arr,
+ 0, 1 + (need_hole ? 1 : 0));
+
+ /* memseg list is full? */
+ if (ms_idx < 0)
+ continue;
+
+ if (need_hole && prev_ms_idx == ms_idx - 1)
+ ms_idx++;
+ prev_ms_idx = ms_idx;
+
+ break;
+ }
+ if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
+ RTE_LOG(ERR, EAL, "Could not find space for memseg. Please increase %s and/or %s in configuration.\n",
+ RTE_STR(CONFIG_RTE_MAX_MEMSEG_PER_TYPE),
+ RTE_STR(CONFIG_RTE_MAX_MEM_PER_TYPE));
+ return -1;
+ }
+ arr = &msl->memseg_arr;
+ seg = rte_fbarray_get(arr, ms_idx);
+
+ addr = RTE_PTR_ADD(msl->base_va,
+ (size_t)msl->page_sz * ms_idx);
+
+ /* address is already mapped in memseg list, so using
+ * MAP_FIXED here is safe.
+ */
+ addr = mmap(addr, page_sz, PROT_READ|PROT_WRITE,
+ MAP_SHARED | MAP_FIXED,
+ hpi->lock_descriptor,
+ j * EAL_PAGE_SIZE);
+ if (addr == MAP_FAILED) {
+ RTE_LOG(ERR, EAL, "Failed to mmap buffer %u from %s\n",
+ j, hpi->hugedir);
+ return -1;
+ }
+
+ seg->addr = addr;
+ seg->iova = physaddr;
+ seg->hugepage_sz = page_sz;
+ seg->len = page_sz;
+ seg->nchannel = mcfg->nchannel;
+ seg->nrank = mcfg->nrank;
+ seg->socket_id = 0;
+
+ rte_fbarray_set_used(arr, ms_idx);
+
+ RTE_LOG(INFO, EAL, "Mapped memory segment %u @ %p: physaddr:0x%"
+ PRIx64", len %zu\n",
+ seg_idx++, addr, physaddr, page_sz);
+
+ total_mem += seg->len;
+ }
+ if (total_mem >= internal_config.memory)
+ break;
+ }
+ if (total_mem < internal_config.memory) {
+ RTE_LOG(ERR, EAL, "Couldn't reserve requested memory, "
+ "requested: %" PRIu64 "M "
+ "available: %" PRIu64 "M\n",
+ internal_config.memory >> 20, total_mem >> 20);
+ return -1;
+ }
+ return 0;
+}
+
+struct attach_walk_args {
+ int fd_hugepage;
+ int seg_idx;
+};
+static int
+attach_segment(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
+ void *arg)
+{
+ struct attach_walk_args *wa = arg;
+ void *addr;
+
+ if (msl->external)
+ return 0;
+
+ addr = mmap(ms->addr, ms->len, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_FIXED, wa->fd_hugepage,
+ wa->seg_idx * EAL_PAGE_SIZE);
+ if (addr == MAP_FAILED || addr != ms->addr)
+ return -1;
+ wa->seg_idx++;
+
+ return 0;
+}
+
+int
+rte_eal_hugepage_attach(void)
+{
+ const struct hugepage_info *hpi;
+ int fd_hugepage = -1;
+ unsigned int i;
+
+ hpi = &internal_config.hugepage_info[0];
+
+ for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
+ const struct hugepage_info *cur_hpi = &hpi[i];
+ struct attach_walk_args wa;
+
+ memset(&wa, 0, sizeof(wa));
+
+ /* Obtain a file descriptor for contiguous memory */
+ fd_hugepage = open(cur_hpi->hugedir, O_RDWR);
+ if (fd_hugepage < 0) {
+ RTE_LOG(ERR, EAL, "Could not open %s\n",
+ cur_hpi->hugedir);
+ goto error;
+ }
+ wa.fd_hugepage = fd_hugepage;
+ wa.seg_idx = 0;
+
+ /* Map the contiguous memory into each memory segment */
+ if (rte_memseg_walk(attach_segment, &wa) < 0) {
+ RTE_LOG(ERR, EAL, "Failed to mmap buffer %u from %s\n",
+ wa.seg_idx, cur_hpi->hugedir);
+ goto error;
+ }
+
+ close(fd_hugepage);
+ fd_hugepage = -1;
+ }
+
+ /* hugepage_info is no longer required */
+ return 0;
+
+error:
+ if (fd_hugepage >= 0)
+ close(fd_hugepage);
+ return -1;
+}
+
+int
+rte_eal_using_phys_addrs(void)
+{
+ return 0;
+}
+
+static uint64_t
+get_mem_amount(uint64_t page_sz, uint64_t max_mem)
+{
+ uint64_t area_sz, max_pages;
+
+ /* limit to RTE_MAX_MEMSEG_PER_LIST pages or RTE_MAX_MEM_MB_PER_LIST */
+ max_pages = RTE_MAX_MEMSEG_PER_LIST;
+ max_mem = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20, max_mem);
+
+ area_sz = RTE_MIN(page_sz * max_pages, max_mem);
+
+ /* make sure the list isn't smaller than the page size */
+ area_sz = RTE_MAX(area_sz, page_sz);
+
+ return RTE_ALIGN(area_sz, page_sz);
+}
+
+#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i"
+static int
+alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz,
+ int n_segs, int socket_id, int type_msl_idx)
+{
+ char name[RTE_FBARRAY_NAME_LEN];
+
+ snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id,
+ type_msl_idx);
+ if (rte_fbarray_init(&msl->memseg_arr, name, n_segs,
+ sizeof(struct rte_memseg))) {
+ RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n",
+ rte_strerror(rte_errno));
+ return -1;
+ }
+
+ msl->page_sz = page_sz;
+ msl->socket_id = socket_id;
+ msl->base_va = NULL;
+
+ RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n",
+ (size_t)page_sz >> 10, socket_id);
+
+ return 0;
+}
+
+static int
+alloc_va_space(struct rte_memseg_list *msl)
+{
+ uint64_t page_sz;
+ size_t mem_sz;
+ void *addr;
+ int flags = 0;
+
+#ifdef RTE_ARCH_PPC_64
+ flags |= MAP_HUGETLB;
+#endif
+
+ page_sz = msl->page_sz;
+ mem_sz = page_sz * msl->memseg_arr.len;
+
+ addr = eal_get_virtual_area(msl->base_va, &mem_sz, page_sz, 0, flags);
+ if (addr == NULL) {
+ if (rte_errno == EADDRNOTAVAIL)
+ RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - "
+ "please use '--" OPT_BASE_VIRTADDR "' option\n",
+ (unsigned long long)mem_sz, msl->base_va);
+ else
+ RTE_LOG(ERR, EAL, "Cannot reserve memory\n");
+ return -1;
+ }
+ msl->base_va = addr;
+ msl->len = mem_sz;
+
+ return 0;
+}
+
+
+static int
+memseg_primary_init(void)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int hpi_idx, msl_idx = 0;
+ struct rte_memseg_list *msl;
+ uint64_t max_mem, total_mem;
+
+ /* no-huge does not need this at all */
+ if (internal_config.no_hugetlbfs)
+ return 0;
+
+	/* FreeBSD has an issue where a core dump will dump the entire memory
+	 * contents, including anonymous zero-page memory. Therefore, while we
+	 * limit the total amount of memory to RTE_MAX_MEM_MB, we also further
+	 * limit the total to whatever memory is available to us through the
+	 * contigmem driver (plus spacing blocks).
+	 *
+	 * So, at each stage, we check how much memory we are preallocating
+	 * and adjust all the values accordingly.
+ */
+
+ max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
+ total_mem = 0;
+
+ /* create memseg lists */
+ for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes;
+ hpi_idx++) {
+ uint64_t max_type_mem, total_type_mem = 0;
+ uint64_t avail_mem;
+ int type_msl_idx, max_segs, avail_segs, total_segs = 0;
+ struct hugepage_info *hpi;
+ uint64_t hugepage_sz;
+
+ hpi = &internal_config.hugepage_info[hpi_idx];
+ hugepage_sz = hpi->hugepage_sz;
+
+ /* no NUMA support on FreeBSD */
+
+ /* check if we've already exceeded total memory amount */
+ if (total_mem >= max_mem)
+ break;
+
+ /* first, calculate theoretical limits according to config */
+ max_type_mem = RTE_MIN(max_mem - total_mem,
+ (uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20);
+ max_segs = RTE_MAX_MEMSEG_PER_TYPE;
+
+ /* now, limit all of that to whatever will actually be
+ * available to us, because without dynamic allocation support,
+		 * all of that extra memory would just sit there unused,
+		 * slowing down core dumps in case of a crash.
+ *
+ * we need (N*2)-1 segments because we cannot guarantee that
+ * each segment will be IOVA-contiguous with the previous one,
+		 * so we allocate more and put spaces in between the segments
+		 * that are non-contiguous.
+ */
+ avail_segs = (hpi->num_pages[0] * 2) - 1;
+ avail_mem = avail_segs * hugepage_sz;
+
+ max_type_mem = RTE_MIN(avail_mem, max_type_mem);
+ max_segs = RTE_MIN(avail_segs, max_segs);
+
+ type_msl_idx = 0;
+ while (total_type_mem < max_type_mem &&
+ total_segs < max_segs) {
+ uint64_t cur_max_mem, cur_mem;
+ unsigned int n_segs;
+
+ if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
+ RTE_LOG(ERR, EAL,
+ "No more space in memseg lists, please increase %s\n",
+ RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
+ return -1;
+ }
+
+ msl = &mcfg->memsegs[msl_idx++];
+
+ cur_max_mem = max_type_mem - total_type_mem;
+
+ cur_mem = get_mem_amount(hugepage_sz,
+ cur_max_mem);
+ n_segs = cur_mem / hugepage_sz;
+
+ if (alloc_memseg_list(msl, hugepage_sz, n_segs,
+ 0, type_msl_idx))
+ return -1;
+
+ total_segs += msl->memseg_arr.len;
+ total_type_mem = total_segs * hugepage_sz;
+ type_msl_idx++;
+
+ if (alloc_va_space(msl)) {
+ RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
+ return -1;
+ }
+ }
+ total_mem += total_type_mem;
+ }
+ return 0;
+}
+
+static int
+memseg_secondary_init(void)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int msl_idx = 0;
+ struct rte_memseg_list *msl;
+
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+
+ msl = &mcfg->memsegs[msl_idx];
+
+ /* skip empty memseg lists */
+ if (msl->memseg_arr.len == 0)
+ continue;
+
+ if (rte_fbarray_attach(&msl->memseg_arr)) {
+ RTE_LOG(ERR, EAL, "Cannot attach to primary process memseg lists\n");
+ return -1;
+ }
+
+ /* preallocate VA space */
+ if (alloc_va_space(msl)) {
+ RTE_LOG(ERR, EAL, "Cannot preallocate VA space for hugepage memory\n");
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+int
+rte_eal_memseg_init(void)
+{
+ return rte_eal_process_type() == RTE_PROC_PRIMARY ?
+ memseg_primary_init() :
+ memseg_secondary_init();
+}
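
To make the "(N*2)-1" sizing above concrete, a small worked example with assumed numbers: with 4 contigmem buffers, up to 7 memseg slots may be needed, since a one-slot hole may separate each pair of non-IOVA-contiguous buffers.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	unsigned int num_pages = 4;            /* assumed hpi->num_pages[0] */
	uint64_t hugepage_sz = UINT64_C(1) << 30;  /* assumed 1 GB buffers */
	int avail_segs = (num_pages * 2) - 1;  /* 7 slots: 4 pages, 3 holes */
	uint64_t avail_mem = avail_segs * hugepage_sz;

	printf("segs=%d mem=%juMB\n", avail_segs,
			(uintmax_t)(avail_mem >> 20));
	return 0;
}
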
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <sched.h>
+#include <pthread_np.h>
+#include <sys/queue.h>
+#include <sys/thr.h>
+
+#include <rte_debug.h>
+#include <rte_atomic.h>
+#include <rte_launch.h>
+#include <rte_log.h>
+#include <rte_memory.h>
+#include <rte_per_lcore.h>
+#include <rte_eal.h>
+#include <rte_lcore.h>
+
+#include "eal_private.h"
+#include "eal_thread.h"
+
+RTE_DEFINE_PER_LCORE(unsigned, _lcore_id) = LCORE_ID_ANY;
+RTE_DEFINE_PER_LCORE(unsigned, _socket_id) = (unsigned)SOCKET_ID_ANY;
+RTE_DEFINE_PER_LCORE(rte_cpuset_t, _cpuset);
+
+/*
+ * Send a message to a slave lcore identified by slave_id to call a
+ * function f with argument arg. Once the execution is done, the
+ * remote lcore switches to the FINISHED state.
+ */
+int
+rte_eal_remote_launch(int (*f)(void *), void *arg, unsigned slave_id)
+{
+ int n;
+ char c = 0;
+ int m2s = lcore_config[slave_id].pipe_master2slave[1];
+ int s2m = lcore_config[slave_id].pipe_slave2master[0];
+
+ if (lcore_config[slave_id].state != WAIT)
+ return -EBUSY;
+
+ lcore_config[slave_id].f = f;
+ lcore_config[slave_id].arg = arg;
+
+ /* send message */
+ n = 0;
+ while (n == 0 || (n < 0 && errno == EINTR))
+ n = write(m2s, &c, 1);
+ if (n < 0)
+ rte_panic("cannot write on configuration pipe\n");
+
+ /* wait ack */
+ do {
+ n = read(s2m, &c, 1);
+ } while (n < 0 && errno == EINTR);
+
+ if (n <= 0)
+ rte_panic("cannot read on configuration pipe\n");
+
+ return 0;
+}
+
+/* set affinity for current thread */
+static int
+eal_thread_set_affinity(void)
+{
+ unsigned lcore_id = rte_lcore_id();
+
+ /* acquire system unique id */
+ rte_gettid();
+
+ /* update EAL thread core affinity */
+ return rte_thread_set_affinity(&lcore_config[lcore_id].cpuset);
+}
+
+void eal_thread_init_master(unsigned lcore_id)
+{
+ /* set the lcore ID in per-lcore memory area */
+ RTE_PER_LCORE(_lcore_id) = lcore_id;
+
+ /* set CPU affinity */
+ if (eal_thread_set_affinity() < 0)
+ rte_panic("cannot set affinity\n");
+}
+
+/* main loop of threads */
+__attribute__((noreturn)) void *
+eal_thread_loop(__attribute__((unused)) void *arg)
+{
+ char c;
+ int n, ret;
+ unsigned lcore_id;
+ pthread_t thread_id;
+ int m2s, s2m;
+ char cpuset[RTE_CPU_AFFINITY_STR_LEN];
+
+ thread_id = pthread_self();
+
+ /* retrieve our lcore_id from the configuration structure */
+ RTE_LCORE_FOREACH_SLAVE(lcore_id) {
+ if (thread_id == lcore_config[lcore_id].thread_id)
+ break;
+ }
+ if (lcore_id == RTE_MAX_LCORE)
+ rte_panic("cannot retrieve lcore id\n");
+
+ m2s = lcore_config[lcore_id].pipe_master2slave[0];
+ s2m = lcore_config[lcore_id].pipe_slave2master[1];
+
+ /* set the lcore ID in per-lcore memory area */
+ RTE_PER_LCORE(_lcore_id) = lcore_id;
+
+ /* set CPU affinity */
+ if (eal_thread_set_affinity() < 0)
+ rte_panic("cannot set affinity\n");
+
+ ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset));
+
+ RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%p;cpuset=[%s%s])\n",
+ lcore_id, thread_id, cpuset, ret == 0 ? "" : "...");
+
+ /* read on our pipe to get commands */
+ while (1) {
+ void *fct_arg;
+
+ /* wait command */
+ do {
+ n = read(m2s, &c, 1);
+ } while (n < 0 && errno == EINTR);
+
+ if (n <= 0)
+ rte_panic("cannot read on configuration pipe\n");
+
+ lcore_config[lcore_id].state = RUNNING;
+
+ /* send ack */
+ n = 0;
+ while (n == 0 || (n < 0 && errno == EINTR))
+ n = write(s2m, &c, 1);
+ if (n < 0)
+ rte_panic("cannot write on configuration pipe\n");
+
+ if (lcore_config[lcore_id].f == NULL)
+ rte_panic("NULL function pointer\n");
+
+ /* call the function and store the return value */
+ fct_arg = lcore_config[lcore_id].arg;
+ ret = lcore_config[lcore_id].f(fct_arg);
+ lcore_config[lcore_id].ret = ret;
+ rte_wmb();
+ lcore_config[lcore_id].state = FINISHED;
+ }
+
+ /* never reached */
+ /* pthread_exit(NULL); */
+ /* return NULL; */
+}
+
+/* return the tid of the calling thread, as used by rte_gettid() */
+int rte_sys_gettid(void)
+{
+ long lwpid;
+ thr_self(&lwpid);
+ return (int)lwpid;
+}
+
+int rte_thread_setname(pthread_t id, const char *name)
+{
+ /* this BSD function returns no error */
+ pthread_set_name_np(id, name);
+ return 0;
+}
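
For context, a hypothetical caller of the launch path implemented above (the worker name is illustrative): rte_eal_remote_launch() performs the pipe handshake shown in this file, and rte_eal_wait_lcore() collects the return value once the slave reaches the FINISHED state.

#include <stdio.h>

#include <rte_launch.h>
#include <rte_lcore.h>

static int
worker(void *arg)
{
	printf("hello from lcore %u, arg=%p\n", rte_lcore_id(), arg);
	return 0;
}

static void
run_on_first_slave(void)
{
	unsigned int lcore_id = rte_get_next_lcore(-1, 1, 0);

	if (lcore_id < RTE_MAX_LCORE &&
			rte_eal_remote_launch(worker, NULL, lcore_id) == 0)
		rte_eal_wait_lcore(lcore_id);
}
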
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+#include <string.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <errno.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_cycles.h>
+#include <rte_memory.h>
+#include <rte_eal.h>
+#include <rte_debug.h>
+
+#include "eal_private.h"
+#include "eal_internal_cfg.h"
+
+#ifdef RTE_LIBEAL_USE_HPET
+#warning HPET is not supported in FreeBSD
+#endif
+
+enum timer_source eal_timer_source = EAL_TIMER_TSC;
+
+uint64_t
+get_tsc_freq(void)
+{
+ size_t sz;
+ int tmp;
+ uint64_t tsc_hz;
+
+ sz = sizeof(tmp);
+ tmp = 0;
+
+ if (sysctlbyname("kern.timecounter.smp_tsc", &tmp, &sz, NULL, 0))
+ RTE_LOG(WARNING, EAL, "%s\n", strerror(errno));
+ else if (tmp != 1)
+ RTE_LOG(WARNING, EAL, "TSC is not safe to use in SMP mode\n");
+
+ tmp = 0;
+
+ if (sysctlbyname("kern.timecounter.invariant_tsc", &tmp, &sz, NULL, 0))
+ RTE_LOG(WARNING, EAL, "%s\n", strerror(errno));
+ else if (tmp != 1)
+ RTE_LOG(WARNING, EAL, "TSC is not invariant\n");
+
+ sz = sizeof(tsc_hz);
+ if (sysctlbyname("machdep.tsc_freq", &tsc_hz, &sz, NULL, 0)) {
+ RTE_LOG(WARNING, EAL, "%s\n", strerror(errno));
+ return 0;
+ }
+
+ return tsc_hz;
+}
+
+int
+rte_eal_timer_init(void)
+{
+ set_tsc_freq();
+ return 0;
+}
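
The machdep.tsc_freq read above can also be exercised standalone; a minimal sketch that prints the TSC frequency the kernel calibrated at boot:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t tsc_hz;
	size_t sz = sizeof(tsc_hz);

	if (sysctlbyname("machdep.tsc_freq", &tsc_hz, &sz, NULL, 0) != 0)
		return 1;

	printf("TSC frequency: %ju Hz\n", (uintmax_t)tsc_hz);
	return 0;
}
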
--- /dev/null
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright 2020 Mellanox Technologies, Ltd
+
+includes += include_directories('.')
+
+headers += files(
+ 'rte_os.h',
+)
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2019 Intel Corporation
+ */
+
+#ifndef _RTE_OS_H_
+#define _RTE_OS_H_
+
+/**
+ * This header should contain any function/macro definitions which
+ * are not supported natively, or are named differently, in FreeBSD.
+ * Functions will be added in future releases.
+ */
+
+#include <pthread_np.h>
+
+typedef cpuset_t rte_cpuset_t;
+#define RTE_CPU_AND(dst, src1, src2) do \
+{ \
+ cpuset_t tmp; \
+ CPU_COPY(src1, &tmp); \
+ CPU_AND(&tmp, src2); \
+ CPU_COPY(&tmp, dst); \
+} while (0)
+#define RTE_CPU_OR(dst, src1, src2) do \
+{ \
+ cpuset_t tmp; \
+ CPU_COPY(src1, &tmp); \
+ CPU_OR(&tmp, src2); \
+ CPU_COPY(&tmp, dst); \
+} while (0)
+#define RTE_CPU_FILL(set) CPU_FILL(set)
+
+/* In FreeBSD 13 CPU_NAND macro is CPU_ANDNOT */
+#ifdef CPU_NAND
+#define RTE_CPU_NOT(dst, src) do \
+{ \
+ cpuset_t tmp; \
+ CPU_FILL(&tmp); \
+ CPU_NAND(&tmp, src); \
+ CPU_COPY(&tmp, dst); \
+} while (0)
+#else
+#define RTE_CPU_NOT(dst, src) do \
+{ \
+ cpuset_t tmp; \
+ CPU_FILL(&tmp); \
+ CPU_ANDNOT(&tmp, src); \
+ CPU_COPY(&tmp, dst); \
+} while (0)
+#endif
+
+#endif /* _RTE_OS_H_ */
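
For illustration, how the wrappers above are intended to be used; the helper below is hypothetical and assumes rte_os.h is on the include path. The temporary cpuset inside the macro means the source set is left untouched, mirroring the Linux CPU_* semantics these wrappers emulate.

#include <pthread_np.h>

#include <rte_os.h>

static void
invert_set(rte_cpuset_t *dst, rte_cpuset_t *src)
{
	/* dst = ~src; RTE_CPU_NOT() builds the complement in a temporary */
	RTE_CPU_NOT(dst, src);
}
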
--- /dev/null
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2017 Intel Corporation
+
+subdir('include')
+
+sources += files('eal_alarm.c',
+ 'eal_cpuflags.c',
+ 'eal_debug.c',
+ 'eal_hugepage_info.c',
+ 'eal_interrupts.c',
+ 'eal_lcore.c',
+ 'eal_memalloc.c',
+ 'eal_thread.c',
+ 'eal_timer.c',
+ 'eal.c',
+ 'eal_memory.c',
+ 'eal_dev.c'
+)
+
+deps += ['kvargs']
--- /dev/null
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2010-2019 Intel Corporation
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+LIB = librte_eal.a
+
+ARCH_DIR ?= $(RTE_ARCH)
+
+EXPORT_MAP := ../rte_eal_version.map
+VPATH += $(RTE_SDK)/lib/librte_eal/$(ARCH_DIR)
+
+VPATH += $(RTE_SDK)/lib/librte_eal/common
+
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+CFLAGS += -I$(SRCDIR)/include
+CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common
+CFLAGS += -I$(RTE_SDK)/lib/librte_eal/include
+CFLAGS += $(WERROR_FLAGS) -O3
+
+LDLIBS += -ldl
+LDLIBS += -lpthread
+LDLIBS += -lgcc_s
+LDLIBS += -lrt
+LDLIBS += -lrte_kvargs
+ifeq ($(CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES),y)
+LDLIBS += -lnuma
+endif
+
+# specific to linux exec-env
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) := eal.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_cpuflags.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_hugepage_info.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_memory.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_thread.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_log.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_vfio.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_vfio_mp_sync.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_memalloc.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_debug.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_lcore.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_timer.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_interrupts.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_alarm.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_dev.c
+
+# from common dir
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_lcore.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_timer.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_memzone.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_log.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_launch.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_mcfg.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_memalloc.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_memory.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_tailqs.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_errno.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_cpuflags.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_hypervisor.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_string_fns.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_hexdump.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_devargs.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_class.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_bus.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_dev.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_options.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_thread.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_proc.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_fbarray.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_uuid.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_malloc.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += hotplug_mp.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += malloc_elem.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += malloc_heap.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += malloc_mp.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_keepalive.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_option.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_service.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_random.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_reciprocal.c
+
+# from arch dir
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_cpuflags.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_hypervisor.c
+SRCS-$(CONFIG_RTE_ARCH_X86) += rte_spinlock.c
+SRCS-y += rte_cycles.c
+
+CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST)
+
+# workaround for a gcc bug with noreturn attribute
+# http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603
+ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y)
+CFLAGS_eal_thread.o += -Wno-return-type
+endif
+
+INC := rte_kni_common.h
+INC += rte_os.h
+
+SYMLINK-$(CONFIG_RTE_EXEC_ENV_LINUX)-include := $(addprefix include/,$(INC))
+
+include $(RTE_SDK)/mk/rte.lib.mk
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2018 Intel Corporation.
+ * Copyright(c) 2012-2014 6WIND S.A.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <syslog.h>
+#include <getopt.h>
+#include <sys/file.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <fnmatch.h>
+#include <stddef.h>
+#include <errno.h>
+#include <limits.h>
+#include <sys/mman.h>
+#include <sys/queue.h>
+#include <sys/stat.h>
+#if defined(RTE_ARCH_X86)
+#include <sys/io.h>
+#endif
+#include <linux/version.h>
+
+#include <rte_compat.h>
+#include <rte_common.h>
+#include <rte_debug.h>
+#include <rte_memory.h>
+#include <rte_launch.h>
+#include <rte_eal.h>
+#include <rte_errno.h>
+#include <rte_per_lcore.h>
+#include <rte_lcore.h>
+#include <rte_service_component.h>
+#include <rte_log.h>
+#include <rte_random.h>
+#include <rte_cycles.h>
+#include <rte_string_fns.h>
+#include <rte_cpuflags.h>
+#include <rte_interrupts.h>
+#include <rte_bus.h>
+#include <rte_dev.h>
+#include <rte_devargs.h>
+#include <rte_version.h>
+#include <rte_atomic.h>
+#include <malloc_heap.h>
+#include <rte_vfio.h>
+#include <rte_option.h>
+
+#include "eal_private.h"
+#include "eal_thread.h"
+#include "eal_internal_cfg.h"
+#include "eal_filesystem.h"
+#include "eal_hugepages.h"
+#include "eal_memcfg.h"
+#include "eal_options.h"
+#include "eal_vfio.h"
+#include "hotplug_mp.h"
+
+#define MEMSIZE_IF_NO_HUGE_PAGE (64ULL * 1024ULL * 1024ULL)
+
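+/* room for roughly 10 characters (value plus separator) per NUMA node */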
+#define SOCKET_MEM_STRLEN (RTE_MAX_NUMA_NODES * 10)
+
+#define KERNEL_IOMMU_GROUPS_PATH "/sys/kernel/iommu_groups"
+
+/* Allow the application to print its usage message too if set */
+static rte_usage_hook_t rte_application_usage_hook = NULL;
+
+/* early configuration structure, when memory config is not mmapped */
+static struct rte_mem_config early_mem_config;
+
+/* define fd variable here, because the file needs to be kept open for the
+ * duration of the program, as we hold a write lock on it in the primary
+ * process */
+static int mem_cfg_fd = -1;
+
+static struct flock wr_lock = {
+ .l_type = F_WRLCK,
+ .l_whence = SEEK_SET,
+ .l_start = offsetof(struct rte_mem_config, memsegs),
+ .l_len = sizeof(early_mem_config.memsegs),
+};
+
+/* Address of global and public configuration */
+static struct rte_config rte_config = {
+ .mem_config = &early_mem_config,
+};
+
+/* internal configuration (per-core) */
+struct lcore_config lcore_config[RTE_MAX_LCORE];
+
+/* internal configuration */
+struct internal_config internal_config;
+
+/* used by rte_rdtsc() */
+int rte_cycles_vmware_tsc_map;
+
+/* platform-specific runtime dir */
+static char runtime_dir[PATH_MAX];
+
+static const char *default_runtime_dir = "/var/run";
+
+int
+eal_create_runtime_dir(void)
+{
+ const char *directory = default_runtime_dir;
+ const char *xdg_runtime_dir = getenv("XDG_RUNTIME_DIR");
+ const char *fallback = "/tmp";
+ char tmp[PATH_MAX];
+ int ret;
+
+ if (getuid() != 0) {
+ /* try XDG path first, fall back to /tmp */
+ if (xdg_runtime_dir != NULL)
+ directory = xdg_runtime_dir;
+ else
+ directory = fallback;
+ }
+ /* create DPDK subdirectory under runtime dir */
+ ret = snprintf(tmp, sizeof(tmp), "%s/dpdk", directory);
+ if (ret < 0 || ret >= (int)sizeof(tmp)) {
+ RTE_LOG(ERR, EAL, "Error creating DPDK runtime path name\n");
+ return -1;
+ }
+
+ /* create prefix-specific subdirectory under DPDK runtime dir */
+ ret = snprintf(runtime_dir, sizeof(runtime_dir), "%s/%s",
+ tmp, eal_get_hugefile_prefix());
+ if (ret < 0 || ret >= (int)sizeof(runtime_dir)) {
+ RTE_LOG(ERR, EAL, "Error creating prefix-specific runtime path name\n");
+ return -1;
+ }
+
+ /* create the path if it doesn't exist. no "mkdir -p" here, so do it
+ * step by step.
+ */
+ ret = mkdir(tmp, 0700);
+ if (ret < 0 && errno != EEXIST) {
+ RTE_LOG(ERR, EAL, "Error creating '%s': %s\n",
+ tmp, strerror(errno));
+ return -1;
+ }
+
+ ret = mkdir(runtime_dir, 0700);
+ if (ret < 0 && errno != EEXIST) {
+ RTE_LOG(ERR, EAL, "Error creating '%s': %s\n",
+ runtime_dir, strerror(errno));
+ return -1;
+ }
+
+ return 0;
+}
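+
+/*
+ * Resulting layout (illustrative, assuming the default "rte" file prefix):
+ * running as root creates /var/run/dpdk/rte, while a non-root user with
+ * XDG_RUNTIME_DIR=/run/user/1000 gets /run/user/1000/dpdk/rte.
+ */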
+
+int
+eal_clean_runtime_dir(void)
+{
+ DIR *dir;
+ struct dirent *dirent;
+ int dir_fd, fd, lck_result;
+ static const char * const filters[] = {
+ "fbarray_*",
+ "mp_socket_*"
+ };
+
+ /* open directory */
+ dir = opendir(runtime_dir);
+ if (!dir) {
+ RTE_LOG(ERR, EAL, "Unable to open runtime directory %s\n",
+ runtime_dir);
+ goto error;
+ }
+ dir_fd = dirfd(dir);
+
+ /* lock the directory before doing anything, to avoid races */
+ if (flock(dir_fd, LOCK_EX) < 0) {
+ RTE_LOG(ERR, EAL, "Unable to lock runtime directory %s\n",
+ runtime_dir);
+ goto error;
+ }
+
+ dirent = readdir(dir);
+ if (!dirent) {
+ RTE_LOG(ERR, EAL, "Unable to read runtime directory %s\n",
+ runtime_dir);
+ goto error;
+ }
+
+ while (dirent != NULL) {
+ unsigned int f_idx;
+ bool skip = true;
+
+ /* skip files that don't match the patterns */
+ for (f_idx = 0; f_idx < RTE_DIM(filters); f_idx++) {
+ const char *filter = filters[f_idx];
+
+ if (fnmatch(filter, dirent->d_name, 0) == 0) {
+ skip = false;
+ break;
+ }
+ }
+ if (skip) {
+ dirent = readdir(dir);
+ continue;
+ }
+
+ /* try and lock the file */
+ fd = openat(dir_fd, dirent->d_name, O_RDONLY);
+
+ /* skip to next file */
+ if (fd == -1) {
+ dirent = readdir(dir);
+ continue;
+ }
+
+ /* non-blocking lock */
+ lck_result = flock(fd, LOCK_EX | LOCK_NB);
+
+ /* if lock succeeds, remove the file */
+ if (lck_result != -1)
+ unlinkat(dir_fd, dirent->d_name, 0);
+ close(fd);
+ dirent = readdir(dir);
+ }
+
+ /* closedir closes dir_fd and drops the lock */
+ closedir(dir);
+ return 0;
+
+error:
+ if (dir)
+ closedir(dir);
+
+ RTE_LOG(ERR, EAL, "Error while clearing runtime dir: %s\n",
+ strerror(errno));
+
+ return -1;
+}
+
+const char *
+rte_eal_get_runtime_dir(void)
+{
+ return runtime_dir;
+}
+
+/* Return user provided mbuf pool ops name */
+const char *
+rte_eal_mbuf_user_pool_ops(void)
+{
+ return internal_config.user_mbuf_pool_ops_name;
+}
+
+/* Return a pointer to the configuration structure */
+struct rte_config *
+rte_eal_get_configuration(void)
+{
+ return &rte_config;
+}
+
+enum rte_iova_mode
+rte_eal_iova_mode(void)
+{
+ return rte_eal_get_configuration()->iova_mode;
+}
+
+/* parse a sysfs (or other) file containing one integer value */
+int
+eal_parse_sysfs_value(const char *filename, unsigned long *val)
+{
+ FILE *f;
+ char buf[BUFSIZ];
+ char *end = NULL;
+
+ if ((f = fopen(filename, "r")) == NULL) {
+ RTE_LOG(ERR, EAL, "%s(): cannot open sysfs value %s\n",
+ __func__, filename);
+ return -1;
+ }
+
+ if (fgets(buf, sizeof(buf), f) == NULL) {
+ RTE_LOG(ERR, EAL, "%s(): cannot read sysfs value %s\n",
+ __func__, filename);
+ fclose(f);
+ return -1;
+ }
+ *val = strtoul(buf, &end, 0);
+ if ((buf[0] == '\0') || (end == NULL) || (*end != '\n')) {
+ RTE_LOG(ERR, EAL, "%s(): cannot parse sysfs value %s\n",
+ __func__, filename);
+ fclose(f);
+ return -1;
+ }
+ fclose(f);
+ return 0;
+}
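+
+/*
+ * Typical use (illustrative): reading a single-value sysfs file, such as
+ * the number of reserved 2 MB hugepages:
+ *
+ *   unsigned long nr_pages;
+ *   if (eal_parse_sysfs_value(
+ *       "/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages",
+ *       &nr_pages) == 0)
+ *       printf("%lu hugepages reserved\n", nr_pages);
+ */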
+
+
+/* create memory configuration in shared/mmap memory. Take out
+ * a write lock on the memsegs, so we can auto-detect primary/secondary.
+ * This means we never close the file while running (auto-close on exit).
+ * We also don't lock the whole file, so that in future we can use read-locks
+ * on other parts, e.g. memzones, to detect if there are running secondary
+ * processes. */
+static int
+rte_eal_config_create(void)
+{
+ size_t page_sz = sysconf(_SC_PAGE_SIZE);
+ size_t cfg_len = sizeof(*rte_config.mem_config);
+ size_t cfg_len_aligned = RTE_ALIGN(cfg_len, page_sz);
+ void *rte_mem_cfg_addr, *mapped_mem_cfg_addr;
+ int retval;
+
+ const char *pathname = eal_runtime_config_path();
+
+ if (internal_config.no_shconf)
+ return 0;
+
+ /* map the config before hugepage address so that we don't waste a page */
+ if (internal_config.base_virtaddr != 0)
+ rte_mem_cfg_addr = (void *)
+ RTE_ALIGN_FLOOR(internal_config.base_virtaddr -
+ sizeof(struct rte_mem_config), page_sz);
+ else
+ rte_mem_cfg_addr = NULL;
+
+ if (mem_cfg_fd < 0) {
+ mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0600);
+ if (mem_cfg_fd < 0) {
+ RTE_LOG(ERR, EAL, "Cannot open '%s' for rte_mem_config\n",
+ pathname);
+ return -1;
+ }
+ }
+
+ retval = ftruncate(mem_cfg_fd, cfg_len);
+ if (retval < 0) {
+ close(mem_cfg_fd);
+ mem_cfg_fd = -1;
+ RTE_LOG(ERR, EAL, "Cannot resize '%s' for rte_mem_config\n",
+ pathname);
+ return -1;
+ }
+
+ retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock);
+ if (retval < 0) {
+ close(mem_cfg_fd);
+ mem_cfg_fd = -1;
+ RTE_LOG(ERR, EAL, "Cannot create lock on '%s'. Is another primary "
+ "process running?\n", pathname);
+ return -1;
+ }
+
+ /* reserve space for config */
+ rte_mem_cfg_addr = eal_get_virtual_area(rte_mem_cfg_addr,
+ &cfg_len_aligned, page_sz, 0, 0);
+ if (rte_mem_cfg_addr == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config\n");
+ close(mem_cfg_fd);
+ mem_cfg_fd = -1;
+ return -1;
+ }
+
+ /* remap the actual file into the space we've just reserved */
+ mapped_mem_cfg_addr = mmap(rte_mem_cfg_addr,
+ cfg_len_aligned, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_FIXED, mem_cfg_fd, 0);
+ if (mapped_mem_cfg_addr == MAP_FAILED) {
+ munmap(rte_mem_cfg_addr, cfg_len);
+ close(mem_cfg_fd);
+ mem_cfg_fd = -1;
+ RTE_LOG(ERR, EAL, "Cannot remap memory for rte_config\n");
+ return -1;
+ }
+
+ memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config));
+ rte_config.mem_config = rte_mem_cfg_addr;
+
+ /* store address of the config in the config itself so that secondary
+ * processes could later map the config into this exact location */
+ rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr;
+
+ rte_config.mem_config->dma_maskbits = 0;
+
+ return 0;
+}
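+
+/*
+ * Note that the F_WRLCK taken above covers only the memsegs region of the
+ * config file (see wr_lock). This is what lets eal_proc_type_detect()
+ * below tell primary and secondary apart: a process that can open the
+ * file but cannot take the write lock must be a secondary.
+ */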
+
+/* attach to an existing shared memory config */
+static int
+rte_eal_config_attach(void)
+{
+ struct rte_mem_config *mem_config;
+
+ const char *pathname = eal_runtime_config_path();
+
+ if (internal_config.no_shconf)
+ return 0;
+
+ if (mem_cfg_fd < 0) {
+ mem_cfg_fd = open(pathname, O_RDWR);
+ if (mem_cfg_fd < 0) {
+ RTE_LOG(ERR, EAL, "Cannot open '%s' for rte_mem_config\n",
+ pathname);
+ return -1;
+ }
+ }
+
+ /* map it as read-only first */
+ mem_config = (struct rte_mem_config *) mmap(NULL, sizeof(*mem_config),
+ PROT_READ, MAP_SHARED, mem_cfg_fd, 0);
+ if (mem_config == MAP_FAILED) {
+ close(mem_cfg_fd);
+ mem_cfg_fd = -1;
+ RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config! error %i (%s)\n",
+ errno, strerror(errno));
+ return -1;
+ }
+
+ rte_config.mem_config = mem_config;
+
+ return 0;
+}
+
+/* reattach the shared config at exact memory location primary process has it */
+static int
+rte_eal_config_reattach(void)
+{
+ struct rte_mem_config *mem_config;
+ void *rte_mem_cfg_addr;
+
+ if (internal_config.no_shconf)
+ return 0;
+
+ /* save the address primary process has mapped shared config to */
+ rte_mem_cfg_addr = (void *) (uintptr_t) rte_config.mem_config->mem_cfg_addr;
+
+ /* unmap original config */
+ munmap(rte_config.mem_config, sizeof(struct rte_mem_config));
+
+ /* remap the config at proper address */
+ mem_config = (struct rte_mem_config *) mmap(rte_mem_cfg_addr,
+ sizeof(*mem_config), PROT_READ | PROT_WRITE, MAP_SHARED,
+ mem_cfg_fd, 0);
+
+ close(mem_cfg_fd);
+ mem_cfg_fd = -1;
+
+ if (mem_config == MAP_FAILED || mem_config != rte_mem_cfg_addr) {
+ if (mem_config != MAP_FAILED) {
+ /* errno is stale, don't use */
+ RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config at [%p], got [%p]"
+ " - please use '--" OPT_BASE_VIRTADDR
+ "' option\n", rte_mem_cfg_addr, mem_config);
+ munmap(mem_config, sizeof(struct rte_mem_config));
+ return -1;
+ }
+ RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config! error %i (%s)\n",
+ errno, strerror(errno));
+ return -1;
+ }
+
+ rte_config.mem_config = mem_config;
+
+ return 0;
+}
+
+/* Detect if we are a primary or a secondary process */
+enum rte_proc_type_t
+eal_proc_type_detect(void)
+{
+ enum rte_proc_type_t ptype = RTE_PROC_PRIMARY;
+ const char *pathname = eal_runtime_config_path();
+
+ /* if there is no shared config, there can be no secondary processes */
+ if (!internal_config.no_shconf) {
+ /* if we can open the file but not get a write-lock we are a
+ * secondary process. NOTE: if we get a file handle back, we
+ * keep that open and don't close it to prevent a race condition
+ * between multiple opens.
+ */
+ if (((mem_cfg_fd = open(pathname, O_RDWR)) >= 0) &&
+ (fcntl(mem_cfg_fd, F_SETLK, &wr_lock) < 0))
+ ptype = RTE_PROC_SECONDARY;
+ }
+
+ RTE_LOG(INFO, EAL, "Auto-detected process type: %s\n",
+ ptype == RTE_PROC_PRIMARY ? "PRIMARY" : "SECONDARY");
+
+ return ptype;
+}
+
+/* Sets up rte_config structure with the pointer to shared memory config.*/
+static int
+rte_config_init(void)
+{
+ rte_config.process_type = internal_config.process_type;
+
+ switch (rte_config.process_type) {
+ case RTE_PROC_PRIMARY:
+ if (rte_eal_config_create() < 0)
+ return -1;
+ eal_mcfg_update_from_internal();
+ break;
+ case RTE_PROC_SECONDARY:
+ if (rte_eal_config_attach() < 0)
+ return -1;
+ eal_mcfg_wait_complete();
+ if (eal_mcfg_check_version() < 0) {
+ RTE_LOG(ERR, EAL, "Primary and secondary process DPDK version mismatch\n");
+ return -1;
+ }
+ if (rte_eal_config_reattach() < 0)
+ return -1;
+ eal_mcfg_update_internal();
+ break;
+ case RTE_PROC_AUTO:
+ case RTE_PROC_INVALID:
+ RTE_LOG(ERR, EAL, "Invalid process type %d\n",
+ rte_config.process_type);
+ return -1;
+ }
+
+ return 0;
+}
+
+/* Unlocks hugepage directories that were locked by eal_hugepage_info_init */
+static void
+eal_hugedirs_unlock(void)
+{
+ int i;
+
+ for (i = 0; i < MAX_HUGEPAGE_SIZES; i++) {
+ /* skip uninitialized */
+ if (internal_config.hugepage_info[i].lock_descriptor < 0)
+ continue;
+ /* unlock hugepage file */
+ flock(internal_config.hugepage_info[i].lock_descriptor, LOCK_UN);
+ close(internal_config.hugepage_info[i].lock_descriptor);
+ /* reset the field */
+ internal_config.hugepage_info[i].lock_descriptor = -1;
+ }
+}
+
+/* display usage */
+static void
+eal_usage(const char *prgname)
+{
+ printf("\nUsage: %s ", prgname);
+ eal_common_usage();
+ printf("EAL Linux options:\n"
+ " --"OPT_SOCKET_MEM" Memory to allocate on sockets (comma separated values)\n"
+ " --"OPT_SOCKET_LIMIT" Limit memory allocation on sockets (comma separated values)\n"
+ " --"OPT_HUGE_DIR" Directory where hugetlbfs is mounted\n"
+ " --"OPT_FILE_PREFIX" Prefix for hugepage filenames\n"
+ " --"OPT_CREATE_UIO_DEV" Create /dev/uioX (usually done by hotplug)\n"
+ " --"OPT_VFIO_INTR" Interrupt mode for VFIO (legacy|msi|msix)\n"
+ " --"OPT_LEGACY_MEM" Legacy memory mode (no dynamic allocation, contiguous segments)\n"
+ " --"OPT_SINGLE_FILE_SEGMENTS" Put all hugepage memory in single files\n"
+ " --"OPT_MATCH_ALLOCATIONS" Free hugepages exactly as allocated\n"
+ "\n");
+ /* Allow the application to print its usage message too if hook is set */
+ if (rte_application_usage_hook) {
+ printf("===== Application Usage =====\n\n");
+ rte_application_usage_hook(prgname);
+ }
+}
+
+/* Set a per-application usage message */
+rte_usage_hook_t
+rte_set_application_usage_hook(rte_usage_hook_t usage_func)
+{
+ rte_usage_hook_t old_func;
+
+ /* Will be NULL on the first call to denote the last usage routine. */
+ old_func = rte_application_usage_hook;
+ rte_application_usage_hook = usage_func;
+
+ return old_func;
+}
+
+static int
+eal_parse_socket_arg(char *strval, volatile uint64_t *socket_arg)
+{
+ char *arg[RTE_MAX_NUMA_NODES];
+ char *end;
+ int arg_num, i, len;
+ uint64_t total_mem = 0;
+
+ len = strnlen(strval, SOCKET_MEM_STRLEN);
+ if (len == SOCKET_MEM_STRLEN) {
+ RTE_LOG(ERR, EAL, "--socket-mem is too long\n");
+ return -1;
+ }
+
+ /* all other error cases will be caught later */
+ if (!isdigit(strval[len-1]))
+ return -1;
+
+ /* split the optarg into separate socket values */
+ arg_num = rte_strsplit(strval, len,
+ arg, RTE_MAX_NUMA_NODES, ',');
+
+ /* if split failed, or 0 arguments */
+ if (arg_num <= 0)
+ return -1;
+
+ /* parse each defined socket option */
+ errno = 0;
+ for (i = 0; i < arg_num; i++) {
+ uint64_t val;
+ end = NULL;
+ val = strtoull(arg[i], &end, 10);
+
+ /* check for invalid input */
+ if ((errno != 0) ||
+ (arg[i][0] == '\0') || (end == NULL) || (*end != '\0'))
+ return -1;
+ val <<= 20;
+ total_mem += val;
+ socket_arg[i] = val;
+ }
+
+ return 0;
+}
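+
+/*
+ * Example (illustrative): "--socket-mem=1024,0,1024" requests 1 GB on
+ * sockets 0 and 2 and nothing on socket 1; each value is in megabytes
+ * and is converted to bytes above (val <<= 20).
+ */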
+
+static int
+eal_parse_vfio_intr(const char *mode)
+{
+ unsigned i;
+ static struct {
+ const char *name;
+ enum rte_intr_mode value;
+ } map[] = {
+ { "legacy", RTE_INTR_MODE_LEGACY },
+ { "msi", RTE_INTR_MODE_MSI },
+ { "msix", RTE_INTR_MODE_MSIX },
+ };
+
+ for (i = 0; i < RTE_DIM(map); i++) {
+ if (!strcmp(mode, map[i].name)) {
+ internal_config.vfio_intr_mode = map[i].value;
+ return 0;
+ }
+ }
+ return -1;
+}
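+
+/* e.g. "--vfio-intr=msix" selects RTE_INTR_MODE_MSIX */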
+
+/* Parse the arguments for --log-level only */
+static void
+eal_log_level_parse(int argc, char **argv)
+{
+ int opt;
+ char **argvopt;
+ int option_index;
+ const int old_optind = optind;
+ const int old_optopt = optopt;
+ char * const old_optarg = optarg;
+
+ argvopt = argv;
+ optind = 1;
+
+ while ((opt = getopt_long(argc, argvopt, eal_short_options,
+ eal_long_options, &option_index)) != EOF) {
+
+ int ret;
+
+ /* getopt is not happy, stop right now */
+ if (opt == '?')
+ break;
+
+ ret = (opt == OPT_LOG_LEVEL_NUM) ?
+ eal_parse_common_option(opt, optarg, &internal_config) : 0;
+
+ /* common parser is not happy */
+ if (ret < 0)
+ break;
+ }
+
+ /* restore getopt lib */
+ optind = old_optind;
+ optopt = old_optopt;
+ optarg = old_optarg;
+}
+
+/* Parse the argument given in the command line of the application */
+static int
+eal_parse_args(int argc, char **argv)
+{
+ int opt, ret;
+ char **argvopt;
+ int option_index;
+ char *prgname = argv[0];
+ const int old_optind = optind;
+ const int old_optopt = optopt;
+ char * const old_optarg = optarg;
+
+ argvopt = argv;
+ optind = 1;
+ opterr = 0;
+
+ while ((opt = getopt_long(argc, argvopt, eal_short_options,
+ eal_long_options, &option_index)) != EOF) {
+
+ /*
+ * getopt didn't recognise the option, so let's parse the
+ * registered options to see if the flag is valid
+ */
+ if (opt == '?') {
+ ret = rte_option_parse(argv[optind-1]);
+ if (ret == 0)
+ continue;
+
+ eal_usage(prgname);
+ ret = -1;
+ goto out;
+ }
+
+ ret = eal_parse_common_option(opt, optarg, &internal_config);
+ /* common parser is not happy */
+ if (ret < 0) {
+ eal_usage(prgname);
+ ret = -1;
+ goto out;
+ }
+ /* common parser handled this option */
+ if (ret == 0)
+ continue;
+
+ switch (opt) {
+ case 'h':
+ eal_usage(prgname);
+ exit(EXIT_SUCCESS);
+
+ case OPT_HUGE_DIR_NUM:
+ {
+ char *hdir = strdup(optarg);
+ if (hdir == NULL)
+ RTE_LOG(ERR, EAL, "Could not store hugepage directory\n");
+ else {
+ /* free old hugepage dir */
+ if (internal_config.hugepage_dir != NULL)
+ free(internal_config.hugepage_dir);
+ internal_config.hugepage_dir = hdir;
+ }
+ break;
+ }
+ case OPT_FILE_PREFIX_NUM:
+ {
+ char *prefix = strdup(optarg);
+ if (prefix == NULL)
+ RTE_LOG(ERR, EAL, "Could not store file prefix\n");
+ else {
+ /* free old prefix */
+ if (internal_config.hugefile_prefix != NULL)
+ free(internal_config.hugefile_prefix);
+ internal_config.hugefile_prefix = prefix;
+ }
+ break;
+ }
+ case OPT_SOCKET_MEM_NUM:
+ if (eal_parse_socket_arg(optarg,
+ internal_config.socket_mem) < 0) {
+ RTE_LOG(ERR, EAL, "invalid parameters for --"
+ OPT_SOCKET_MEM "\n");
+ eal_usage(prgname);
+ ret = -1;
+ goto out;
+ }
+ internal_config.force_sockets = 1;
+ break;
+
+ case OPT_SOCKET_LIMIT_NUM:
+ if (eal_parse_socket_arg(optarg,
+ internal_config.socket_limit) < 0) {
+ RTE_LOG(ERR, EAL, "invalid parameters for --"
+ OPT_SOCKET_LIMIT "\n");
+ eal_usage(prgname);
+ ret = -1;
+ goto out;
+ }
+ internal_config.force_socket_limits = 1;
+ break;
+
+ case OPT_VFIO_INTR_NUM:
+ if (eal_parse_vfio_intr(optarg) < 0) {
+ RTE_LOG(ERR, EAL, "invalid parameters for --"
+ OPT_VFIO_INTR "\n");
+ eal_usage(prgname);
+ ret = -1;
+ goto out;
+ }
+ break;
+
+ case OPT_CREATE_UIO_DEV_NUM:
+ internal_config.create_uio_dev = 1;
+ break;
+
+ case OPT_MBUF_POOL_OPS_NAME_NUM:
+ {
+ char *ops_name = strdup(optarg);
+ if (ops_name == NULL)
+ RTE_LOG(ERR, EAL, "Could not store mbuf pool ops name\n");
+ else {
+ /* free old ops name */
+ if (internal_config.user_mbuf_pool_ops_name !=
+ NULL)
+ free(internal_config.user_mbuf_pool_ops_name);
+
+ internal_config.user_mbuf_pool_ops_name =
+ ops_name;
+ }
+ break;
+ }
+ case OPT_MATCH_ALLOCATIONS_NUM:
+ internal_config.match_allocations = 1;
+ break;
+
+ default:
+ if (opt < OPT_LONG_MIN_NUM && isprint(opt)) {
+ RTE_LOG(ERR, EAL, "Option %c is not supported "
+ "on Linux\n", opt);
+ } else if (opt >= OPT_LONG_MIN_NUM &&
+ opt < OPT_LONG_MAX_NUM) {
+ RTE_LOG(ERR, EAL, "Option %s is not supported "
+ "on Linux\n",
+ eal_long_options[option_index].name);
+ } else {
+ RTE_LOG(ERR, EAL, "Option %d is not supported "
+ "on Linux\n", opt);
+ }
+ eal_usage(prgname);
+ ret = -1;
+ goto out;
+ }
+ }
+
+ /* create runtime data directory */
+ if (internal_config.no_shconf == 0 &&
+ eal_create_runtime_dir() < 0) {
+ RTE_LOG(ERR, EAL, "Cannot create runtime directory\n");
+ ret = -1;
+ goto out;
+ }
+
+ if (eal_adjust_config(&internal_config) != 0) {
+ ret = -1;
+ goto out;
+ }
+
+ /* sanity checks */
+ if (eal_check_common_options(&internal_config) != 0) {
+ eal_usage(prgname);
+ ret = -1;
+ goto out;
+ }
+
+ if (optind >= 0)
+ argv[optind-1] = prgname;
+ ret = optind-1;
+
+out:
+ /* restore getopt lib */
+ optind = old_optind;
+ optopt = old_optopt;
+ optarg = old_optarg;
+
+ return ret;
+}
+
+static int
+check_socket(const struct rte_memseg_list *msl, void *arg)
+{
+ int *socket_id = arg;
+
+ if (msl->external)
+ return 0;
+
+ return *socket_id == msl->socket_id;
+}
+
+static void
+eal_check_mem_on_local_socket(void)
+{
+ int socket_id;
+
+ socket_id = rte_lcore_to_socket_id(rte_config.master_lcore);
+
+ if (rte_memseg_list_walk(check_socket, &socket_id) == 0)
+ RTE_LOG(WARNING, EAL, "WARNING: Master core has no memory on local socket!\n");
+}
+
+static int
+sync_func(__attribute__((unused)) void *arg)
+{
+ return 0;
+}
+
+/*
+ * Request iopl privilege for all RPL, returns 0 on success
+ * iopl() call is mostly for the i386 architecture. For other architectures,
+ * return -1 to indicate IO privilege can't be changed in this way.
+ */
+int
+rte_eal_iopl_init(void)
+{
+#if defined(RTE_ARCH_X86)
+ if (iopl(3) != 0)
+ return -1;
+#endif
+ return 0;
+}
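+
+/*
+ * Background note: on x86, iopl(3) raises the I/O privilege level of the
+ * process so it may use the in/out port instructions directly, which
+ * drivers that do port I/O (e.g. legacy virtio) rely on.
+ */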
+
+#ifdef VFIO_PRESENT
+static int rte_eal_vfio_setup(void)
+{
+ if (rte_vfio_enable("vfio"))
+ return -1;
+
+ return 0;
+}
+#endif
+
+static void rte_eal_init_alert(const char *msg)
+{
+ fprintf(stderr, "EAL: FATAL: %s\n", msg);
+ RTE_LOG(ERR, EAL, "%s\n", msg);
+}
+
+/*
+ * On Linux 3.6+, even if VFIO is not loaded, whenever IOMMU is enabled in the
+ * BIOS and in the kernel, /sys/kernel/iommu_groups path will contain kernel
+ * IOMMU groups. If IOMMU is not enabled, that path would be empty.
+ * Therefore, checking if the path is empty will tell us if IOMMU is enabled.
+ */
+static bool
+is_iommu_enabled(void)
+{
+ DIR *dir = opendir(KERNEL_IOMMU_GROUPS_PATH);
+ struct dirent *d;
+ int n = 0;
+
+ /* if directory doesn't exist, assume IOMMU is not enabled */
+ if (dir == NULL)
+ return false;
+
+ while ((d = readdir(dir)) != NULL) {
+ /* stop once we see more entries than just '.' and '..' */
+ if (++n > 2)
+ break;
+ }
+ closedir(dir);
+
+ return n > 2;
+}
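+
+/*
+ * Illustrative example: with an active IOMMU, the directory contains one
+ * subdirectory per IOMMU group, e.g.:
+ *
+ *   /sys/kernel/iommu_groups/0
+ *   /sys/kernel/iommu_groups/1
+ *   ...
+ */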
+
+/* Launch threads, called at application init(). */
+int
+rte_eal_init(int argc, char **argv)
+{
+ int i, fctret, ret;
+ pthread_t thread_id;
+ static rte_atomic32_t run_once = RTE_ATOMIC32_INIT(0);
+ const char *p;
+ static char logid[PATH_MAX];
+ char cpuset[RTE_CPU_AFFINITY_STR_LEN];
+ char thread_name[RTE_MAX_THREAD_NAME_LEN];
+ bool phys_addrs;
+
+ /* checks if the machine is adequate */
+ if (!rte_cpu_is_supported()) {
+ rte_eal_init_alert("unsupported cpu type.");
+ rte_errno = ENOTSUP;
+ return -1;
+ }
+
+ if (!rte_atomic32_test_and_set(&run_once)) {
+ rte_eal_init_alert("already called initialization.");
+ rte_errno = EALREADY;
+ return -1;
+ }
+
+ p = strrchr(argv[0], '/');
+ strlcpy(logid, p ? p + 1 : argv[0], sizeof(logid));
+ thread_id = pthread_self();
+
+ eal_reset_internal_config(&internal_config);
+
+ /* set log level as early as possible */
+ eal_log_level_parse(argc, argv);
+
+ if (rte_eal_cpu_init() < 0) {
+ rte_eal_init_alert("Cannot detect lcores.");
+ rte_errno = ENOTSUP;
+ return -1;
+ }
+
+ fctret = eal_parse_args(argc, argv);
+ if (fctret < 0) {
+ rte_eal_init_alert("Invalid 'command line' arguments.");
+ rte_errno = EINVAL;
+ rte_atomic32_clear(&run_once);
+ return -1;
+ }
+
+ if (eal_plugins_init() < 0) {
+ rte_eal_init_alert("Cannot init plugins");
+ rte_errno = EINVAL;
+ rte_atomic32_clear(&run_once);
+ return -1;
+ }
+
+ if (eal_option_device_parse()) {
+ rte_errno = ENODEV;
+ rte_atomic32_clear(&run_once);
+ return -1;
+ }
+
+ if (rte_config_init() < 0) {
+ rte_eal_init_alert("Cannot init config");
+ return -1;
+ }
+
+ if (rte_eal_intr_init() < 0) {
+ rte_eal_init_alert("Cannot init interrupt-handling thread");
+ return -1;
+ }
+
+ if (rte_eal_alarm_init() < 0) {
+ rte_eal_init_alert("Cannot init alarm");
+ /* rte_eal_alarm_init sets rte_errno on failure. */
+ return -1;
+ }
+
+ /* Put mp channel init before bus scan so that we can init the vdev
+ * bus through mp channel in the secondary process before the bus scan.
+ */
+ if (rte_mp_channel_init() < 0 && rte_errno != ENOTSUP) {
+ rte_eal_init_alert("failed to init mp channel");
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ rte_errno = EFAULT;
+ return -1;
+ }
+ }
+
+ /* register multi-process action callbacks for hotplug */
+ if (eal_mp_dev_hotplug_init() < 0) {
+ rte_eal_init_alert("failed to register mp callback for hotplug");
+ return -1;
+ }
+
+ if (rte_bus_scan()) {
+ rte_eal_init_alert("Cannot scan the buses for devices");
+ rte_errno = ENODEV;
+ rte_atomic32_clear(&run_once);
+ return -1;
+ }
+
+ phys_addrs = rte_eal_using_phys_addrs() != 0;
+
+ /* if no EAL option "--iova-mode=<pa|va>", use bus IOVA scheme */
+ if (internal_config.iova_mode == RTE_IOVA_DC) {
+ /* autodetect the IOVA mapping mode */
+ enum rte_iova_mode iova_mode = rte_bus_get_iommu_class();
+
+ if (iova_mode == RTE_IOVA_DC) {
+ RTE_LOG(DEBUG, EAL, "Buses did not request a specific IOVA mode.\n");
+
+ if (!phys_addrs) {
+ /* if we have no access to physical addresses,
+ * pick IOVA as VA mode.
+ */
+ iova_mode = RTE_IOVA_VA;
+ RTE_LOG(DEBUG, EAL, "Physical addresses are unavailable, selecting IOVA as VA mode.\n");
+#if defined(RTE_LIBRTE_KNI) && LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0)
+ } else if (rte_eal_check_module("rte_kni") == 1) {
+ iova_mode = RTE_IOVA_PA;
+ RTE_LOG(DEBUG, EAL, "KNI is loaded, selecting IOVA as PA mode for better KNI performance.\n");
+#endif
+ } else if (is_iommu_enabled()) {
+ /* we have an IOMMU, pick IOVA as VA mode */
+ iova_mode = RTE_IOVA_VA;
+ RTE_LOG(DEBUG, EAL, "IOMMU is available, selecting IOVA as VA mode.\n");
+ } else {
+ /* physical addresses available, and no IOMMU
+ * found, so pick IOVA as PA.
+ */
+ iova_mode = RTE_IOVA_PA;
+ RTE_LOG(DEBUG, EAL, "IOMMU is not available, selecting IOVA as PA mode.\n");
+ }
+ }
+#if defined(RTE_LIBRTE_KNI) && LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0)
+ /* Workaround for KNI which requires physical address to work
+ * in kernels < 4.10
+ */
+ if (iova_mode == RTE_IOVA_VA &&
+ rte_eal_check_module("rte_kni") == 1) {
+ if (phys_addrs) {
+ iova_mode = RTE_IOVA_PA;
+ RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because KNI module is loaded\n");
+ } else {
+ RTE_LOG(DEBUG, EAL, "KNI cannot work since physical addresses are unavailable\n");
+ }
+ }
+#endif
+ rte_eal_get_configuration()->iova_mode = iova_mode;
+ } else {
+ rte_eal_get_configuration()->iova_mode =
+ internal_config.iova_mode;
+ }
+
+ if (rte_eal_iova_mode() == RTE_IOVA_PA && !phys_addrs) {
+ rte_eal_init_alert("Cannot use IOVA as 'PA' since physical addresses are not available");
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ RTE_LOG(INFO, EAL, "Selected IOVA mode '%s'\n",
+ rte_eal_iova_mode() == RTE_IOVA_PA ? "PA" : "VA");
+
+ if (internal_config.no_hugetlbfs == 0) {
+ /* rte_config isn't initialized yet */
+ ret = internal_config.process_type == RTE_PROC_PRIMARY ?
+ eal_hugepage_info_init() :
+ eal_hugepage_info_read();
+ if (ret < 0) {
+ rte_eal_init_alert("Cannot get hugepage information.");
+ rte_errno = EACCES;
+ rte_atomic32_clear(&run_once);
+ return -1;
+ }
+ }
+
+ if (internal_config.memory == 0 && internal_config.force_sockets == 0) {
+ if (internal_config.no_hugetlbfs)
+ internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE;
+ }
+
+ if (internal_config.vmware_tsc_map == 1) {
+#ifdef RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT
+ rte_cycles_vmware_tsc_map = 1;
+ RTE_LOG(DEBUG, EAL, "Using VMWARE TSC MAP, "
+ "you must have monitor_control.pseudo_perfctr = TRUE\n");
+#else
+ RTE_LOG(WARNING, EAL, "Ignoring --vmware-tsc-map because "
+ "RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT is not set\n");
+#endif
+ }
+
+ if (rte_eal_log_init(logid, internal_config.syslog_facility) < 0) {
+ rte_eal_init_alert("Cannot init logging.");
+ rte_errno = ENOMEM;
+ rte_atomic32_clear(&run_once);
+ return -1;
+ }
+
+#ifdef VFIO_PRESENT
+ if (rte_eal_vfio_setup() < 0) {
+ rte_eal_init_alert("Cannot init VFIO");
+ rte_errno = EAGAIN;
+ rte_atomic32_clear(&run_once);
+ return -1;
+ }
+#endif
+ /* in secondary processes, memory init may allocate additional fbarrays
+ * not present in primary processes, so to avoid any potential issues,
+ * initialize memzones first.
+ */
+ if (rte_eal_memzone_init() < 0) {
+ rte_eal_init_alert("Cannot init memzone");
+ rte_errno = ENODEV;
+ return -1;
+ }
+
+ if (rte_eal_memory_init() < 0) {
+ rte_eal_init_alert("Cannot init memory");
+ rte_errno = ENOMEM;
+ return -1;
+ }
+
+ /* the directories are locked during eal_hugepage_info_init */
+ eal_hugedirs_unlock();
+
+ if (rte_eal_malloc_heap_init() < 0) {
+ rte_eal_init_alert("Cannot init malloc heap");
+ rte_errno = ENODEV;
+ return -1;
+ }
+
+ if (rte_eal_tailqs_init() < 0) {
+ rte_eal_init_alert("Cannot init tail queues for objects");
+ rte_errno = EFAULT;
+ return -1;
+ }
+
+ if (rte_eal_timer_init() < 0) {
+ rte_eal_init_alert("Cannot init HPET or TSC timers");
+ rte_errno = ENOTSUP;
+ return -1;
+ }
+
+ eal_check_mem_on_local_socket();
+
+ eal_thread_init_master(rte_config.master_lcore);
+
+ ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset));
+
+ RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%zx;cpuset=[%s%s])\n",
+ rte_config.master_lcore, (uintptr_t)thread_id, cpuset,
+ ret == 0 ? "" : "...");
+
+ RTE_LCORE_FOREACH_SLAVE(i) {
+
+ /*
+ * create communication pipes between master thread
+ * and children
+ */
+ if (pipe(lcore_config[i].pipe_master2slave) < 0)
+ rte_panic("Cannot create pipe\n");
+ if (pipe(lcore_config[i].pipe_slave2master) < 0)
+ rte_panic("Cannot create pipe\n");
+
+ lcore_config[i].state = WAIT;
+
+ /* create a thread for each lcore */
+ ret = pthread_create(&lcore_config[i].thread_id, NULL,
+ eal_thread_loop, NULL);
+ if (ret != 0)
+ rte_panic("Cannot create thread\n");
+
+ /* Set thread_name for aid in debugging. */
+ snprintf(thread_name, sizeof(thread_name),
+ "lcore-slave-%d", i);
+ ret = rte_thread_setname(lcore_config[i].thread_id,
+ thread_name);
+ if (ret != 0)
+ RTE_LOG(DEBUG, EAL,
+ "Cannot set name for lcore thread\n");
+ }
+
+ /*
+ * Launch a dummy function on all slave lcores, so that master lcore
+ * knows they are all ready when this function returns.
+ */
+ rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER);
+ rte_eal_mp_wait_lcore();
+
+ /* initialize services so vdevs register service during bus_probe. */
+ ret = rte_service_init();
+ if (ret) {
+ rte_eal_init_alert("rte_service_init() failed");
+ rte_errno = ENOEXEC;
+ return -1;
+ }
+
+ /* Probe all the buses and devices/drivers on them */
+ if (rte_bus_probe()) {
+ rte_eal_init_alert("Cannot probe devices");
+ rte_errno = ENOTSUP;
+ return -1;
+ }
+
+#ifdef VFIO_PRESENT
+ /* Register mp action after probe() so that we got enough info */
+ if (rte_vfio_is_enabled("vfio") && vfio_mp_sync_setup() < 0)
+ return -1;
+#endif
+
+ /* initialize default service/lcore mappings and start running. Ignore
+ * -ENOTSUP, as it indicates no service coremask passed to EAL.
+ */
+ ret = rte_service_start_with_defaults();
+ if (ret < 0 && ret != -ENOTSUP) {
+ rte_errno = ENOEXEC;
+ return -1;
+ }
+
+ /*
+ * Clean up unused files in runtime directory. We do this at the end of
+ * init and not at the beginning because we want to clean stuff up
+ * whether we are primary or secondary process, but we cannot remove
+ * primary process' files because secondary should be able to run even
+ * if primary process is dead.
+ *
+ * In no_shconf mode, no runtime directory is created in the first
+ * place, so no cleanup needed.
+ */
+ if (!internal_config.no_shconf && eal_clean_runtime_dir() < 0) {
+ rte_eal_init_alert("Cannot clear runtime directory");
+ return -1;
+ }
+
+ eal_mcfg_complete();
+
+ /* Call each registered callback, if enabled */
+ rte_option_init();
+
+ return fctret;
+}
+
+static int
+mark_freeable(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
+ void *arg __rte_unused)
+{
+ /* ms is const, so find this memseg */
+ struct rte_memseg *found;
+
+ if (msl->external)
+ return 0;
+
+ found = rte_mem_virt2memseg(ms->addr, msl);
+
+ found->flags &= ~RTE_MEMSEG_FLAG_DO_NOT_FREE;
+
+ return 0;
+}
+
+int
+rte_eal_cleanup(void)
+{
+ /* if we're in a primary process, we need to mark hugepages as freeable
+ * so that finalization can release them back to the system.
+ */
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ rte_memseg_walk(mark_freeable, NULL);
+ rte_service_finalize();
+ rte_mp_channel_cleanup();
+ eal_cleanup_config(&internal_config);
+ return 0;
+}
+
+enum rte_proc_type_t
+rte_eal_process_type(void)
+{
+ return rte_config.process_type;
+}
+
+int rte_eal_has_hugepages(void)
+{
+ return !internal_config.no_hugetlbfs;
+}
+
+int rte_eal_has_pci(void)
+{
+ return !internal_config.no_pci;
+}
+
+int rte_eal_create_uio_dev(void)
+{
+ return internal_config.create_uio_dev;
+}
+
+enum rte_intr_mode
+rte_eal_vfio_intr_mode(void)
+{
+ return internal_config.vfio_intr_mode;
+}
+
+int
+rte_eal_check_module(const char *module_name)
+{
+ char sysfs_mod_name[PATH_MAX];
+ struct stat st;
+ int n;
+
+ if (NULL == module_name)
+ return -1;
+
+ /* Check if there is sysfs mounted */
+ if (stat("/sys/module", &st) != 0) {
+ RTE_LOG(DEBUG, EAL, "sysfs is not mounted! error %i (%s)\n",
+ errno, strerror(errno));
+ return -1;
+ }
+
+ /* A module might be built-in, therefore try sysfs */
+ n = snprintf(sysfs_mod_name, PATH_MAX, "/sys/module/%s", module_name);
+ if (n < 0 || n >= PATH_MAX) {
+ RTE_LOG(DEBUG, EAL, "Could not format module path\n");
+ return -1;
+ }
+
+ if (stat(sysfs_mod_name, &st) != 0) {
+ RTE_LOG(DEBUG, EAL, "Module %s not found! error %i (%s)\n",
+ sysfs_mod_name, errno, strerror(errno));
+ return 0;
+ }
+
+ /* Module has been found */
+ return 1;
+}
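+
+/*
+ * Example (illustrative): rte_eal_check_module("rte_kni") returns 1 when
+ * /sys/module/rte_kni exists, 0 when it does not, and -1 if sysfs is not
+ * mounted; the IOVA mode selection in rte_eal_init() above uses exactly
+ * this check.
+ */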
+++ /dev/null
-# SPDX-License-Identifier: BSD-3-Clause
-# Copyright(c) 2010-2019 Intel Corporation
-
-include $(RTE_SDK)/mk/rte.vars.mk
-
-LIB = librte_eal.a
-
-ARCH_DIR ?= $(RTE_ARCH)
-
-EXPORT_MAP := ../../rte_eal_version.map
-VPATH += $(RTE_SDK)/lib/librte_eal/$(ARCH_DIR)
-
-VPATH += $(RTE_SDK)/lib/librte_eal/common
-
-CFLAGS += -DALLOW_EXPERIMENTAL_API
-CFLAGS += -I$(SRCDIR)/include
-CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common
-CFLAGS += -I$(RTE_SDK)/lib/librte_eal/include
-CFLAGS += $(WERROR_FLAGS) -O3
-
-LDLIBS += -ldl
-LDLIBS += -lpthread
-LDLIBS += -lgcc_s
-LDLIBS += -lrt
-LDLIBS += -lrte_kvargs
-ifeq ($(CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES),y)
-LDLIBS += -lnuma
-endif
-
-# specific to linux exec-env
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) := eal.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_cpuflags.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_hugepage_info.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_memory.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_thread.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_log.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_vfio.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_vfio_mp_sync.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_memalloc.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_debug.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_lcore.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_timer.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_interrupts.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_alarm.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_dev.c
-
-# from common dir
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_lcore.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_timer.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_memzone.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_log.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_launch.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_mcfg.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_memalloc.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_memory.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_tailqs.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_errno.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_cpuflags.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_hypervisor.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_string_fns.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_hexdump.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_devargs.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_class.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_bus.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_dev.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_options.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_thread.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_proc.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_fbarray.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += eal_common_uuid.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_malloc.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += hotplug_mp.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += malloc_elem.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += malloc_heap.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += malloc_mp.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_keepalive.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_option.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_service.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_random.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_reciprocal.c
-
-# from arch dir
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_cpuflags.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUX) += rte_hypervisor.c
-SRCS-$(CONFIG_RTE_ARCH_X86) += rte_spinlock.c
-SRCS-y += rte_cycles.c
-
-CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST)
-
-# workaround for a gcc bug with noreturn attribute
-# http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603
-ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y)
-CFLAGS_eal_thread.o += -Wno-return-type
-endif
-
-INC := rte_kni_common.h
-INC += rte_os.h
-
-SYMLINK-$(CONFIG_RTE_EXEC_ENV_LINUX)-include := $(addprefix include/,$(INC))
-
-include $(RTE_SDK)/mk/rte.lib.mk
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2018 Intel Corporation.
- * Copyright(c) 2012-2014 6WIND S.A.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <string.h>
-#include <stdarg.h>
-#include <unistd.h>
-#include <pthread.h>
-#include <syslog.h>
-#include <getopt.h>
-#include <sys/file.h>
-#include <dirent.h>
-#include <fcntl.h>
-#include <fnmatch.h>
-#include <stddef.h>
-#include <errno.h>
-#include <limits.h>
-#include <sys/mman.h>
-#include <sys/queue.h>
-#include <sys/stat.h>
-#if defined(RTE_ARCH_X86)
-#include <sys/io.h>
-#endif
-#include <linux/version.h>
-
-#include <rte_compat.h>
-#include <rte_common.h>
-#include <rte_debug.h>
-#include <rte_memory.h>
-#include <rte_launch.h>
-#include <rte_eal.h>
-#include <rte_errno.h>
-#include <rte_per_lcore.h>
-#include <rte_lcore.h>
-#include <rte_service_component.h>
-#include <rte_log.h>
-#include <rte_random.h>
-#include <rte_cycles.h>
-#include <rte_string_fns.h>
-#include <rte_cpuflags.h>
-#include <rte_interrupts.h>
-#include <rte_bus.h>
-#include <rte_dev.h>
-#include <rte_devargs.h>
-#include <rte_version.h>
-#include <rte_atomic.h>
-#include <malloc_heap.h>
-#include <rte_vfio.h>
-#include <rte_option.h>
-
-#include "eal_private.h"
-#include "eal_thread.h"
-#include "eal_internal_cfg.h"
-#include "eal_filesystem.h"
-#include "eal_hugepages.h"
-#include "eal_memcfg.h"
-#include "eal_options.h"
-#include "eal_vfio.h"
-#include "hotplug_mp.h"
-
-#define MEMSIZE_IF_NO_HUGE_PAGE (64ULL * 1024ULL * 1024ULL)
-
-#define SOCKET_MEM_STRLEN (RTE_MAX_NUMA_NODES * 10)
-
-#define KERNEL_IOMMU_GROUPS_PATH "/sys/kernel/iommu_groups"
-
-/* Allow the application to print its usage message too if set */
-static rte_usage_hook_t rte_application_usage_hook = NULL;
-
-/* early configuration structure, when memory config is not mmapped */
-static struct rte_mem_config early_mem_config;
-
-/* define fd variable here, because file needs to be kept open for the
- * duration of the program, as we hold a write lock on it in the primary proc */
-static int mem_cfg_fd = -1;
-
-static struct flock wr_lock = {
- .l_type = F_WRLCK,
- .l_whence = SEEK_SET,
- .l_start = offsetof(struct rte_mem_config, memsegs),
- .l_len = sizeof(early_mem_config.memsegs),
-};
-
-/* Address of global and public configuration */
-static struct rte_config rte_config = {
- .mem_config = &early_mem_config,
-};
-
-/* internal configuration (per-core) */
-struct lcore_config lcore_config[RTE_MAX_LCORE];
-
-/* internal configuration */
-struct internal_config internal_config;
-
-/* used by rte_rdtsc() */
-int rte_cycles_vmware_tsc_map;
-
-/* platform-specific runtime dir */
-static char runtime_dir[PATH_MAX];
-
-static const char *default_runtime_dir = "/var/run";
-
-int
-eal_create_runtime_dir(void)
-{
- const char *directory = default_runtime_dir;
- const char *xdg_runtime_dir = getenv("XDG_RUNTIME_DIR");
- const char *fallback = "/tmp";
- char tmp[PATH_MAX];
- int ret;
-
- if (getuid() != 0) {
- /* try XDG path first, fall back to /tmp */
- if (xdg_runtime_dir != NULL)
- directory = xdg_runtime_dir;
- else
- directory = fallback;
- }
- /* create DPDK subdirectory under runtime dir */
- ret = snprintf(tmp, sizeof(tmp), "%s/dpdk", directory);
- if (ret < 0 || ret == sizeof(tmp)) {
- RTE_LOG(ERR, EAL, "Error creating DPDK runtime path name\n");
- return -1;
- }
-
- /* create prefix-specific subdirectory under DPDK runtime dir */
- ret = snprintf(runtime_dir, sizeof(runtime_dir), "%s/%s",
- tmp, eal_get_hugefile_prefix());
- if (ret < 0 || ret == sizeof(runtime_dir)) {
- RTE_LOG(ERR, EAL, "Error creating prefix-specific runtime path name\n");
- return -1;
- }
-
- /* create the path if it doesn't exist. no "mkdir -p" here, so do it
- * step by step.
- */
- ret = mkdir(tmp, 0700);
- if (ret < 0 && errno != EEXIST) {
- RTE_LOG(ERR, EAL, "Error creating '%s': %s\n",
- tmp, strerror(errno));
- return -1;
- }
-
- ret = mkdir(runtime_dir, 0700);
- if (ret < 0 && errno != EEXIST) {
- RTE_LOG(ERR, EAL, "Error creating '%s': %s\n",
- runtime_dir, strerror(errno));
- return -1;
- }
-
- return 0;
-}
-
-int
-eal_clean_runtime_dir(void)
-{
- DIR *dir;
- struct dirent *dirent;
- int dir_fd, fd, lck_result;
- static const char * const filters[] = {
- "fbarray_*",
- "mp_socket_*"
- };
-
- /* open directory */
- dir = opendir(runtime_dir);
- if (!dir) {
- RTE_LOG(ERR, EAL, "Unable to open runtime directory %s\n",
- runtime_dir);
- goto error;
- }
- dir_fd = dirfd(dir);
-
- /* lock the directory before doing anything, to avoid races */
- if (flock(dir_fd, LOCK_EX) < 0) {
- RTE_LOG(ERR, EAL, "Unable to lock runtime directory %s\n",
- runtime_dir);
- goto error;
- }
-
- dirent = readdir(dir);
- if (!dirent) {
- RTE_LOG(ERR, EAL, "Unable to read runtime directory %s\n",
- runtime_dir);
- goto error;
- }
-
- while (dirent != NULL) {
- unsigned int f_idx;
- bool skip = true;
-
- /* skip files that don't match the patterns */
- for (f_idx = 0; f_idx < RTE_DIM(filters); f_idx++) {
- const char *filter = filters[f_idx];
-
- if (fnmatch(filter, dirent->d_name, 0) == 0) {
- skip = false;
- break;
- }
- }
- if (skip) {
- dirent = readdir(dir);
- continue;
- }
-
- /* try and lock the file */
- fd = openat(dir_fd, dirent->d_name, O_RDONLY);
-
- /* skip to next file */
- if (fd == -1) {
- dirent = readdir(dir);
- continue;
- }
-
- /* non-blocking lock */
- lck_result = flock(fd, LOCK_EX | LOCK_NB);
-
- /* if lock succeeds, remove the file */
- if (lck_result != -1)
- unlinkat(dir_fd, dirent->d_name, 0);
- close(fd);
- dirent = readdir(dir);
- }
-
- /* closedir closes dir_fd and drops the lock */
- closedir(dir);
- return 0;
-
-error:
- if (dir)
- closedir(dir);
-
- RTE_LOG(ERR, EAL, "Error while clearing runtime dir: %s\n",
- strerror(errno));
-
- return -1;
-}
-
-const char *
-rte_eal_get_runtime_dir(void)
-{
- return runtime_dir;
-}
-
-/* Return user provided mbuf pool ops name */
-const char *
-rte_eal_mbuf_user_pool_ops(void)
-{
- return internal_config.user_mbuf_pool_ops_name;
-}
-
-/* Return a pointer to the configuration structure */
-struct rte_config *
-rte_eal_get_configuration(void)
-{
- return &rte_config;
-}
-
-enum rte_iova_mode
-rte_eal_iova_mode(void)
-{
- return rte_eal_get_configuration()->iova_mode;
-}
-
-/* parse a sysfs (or other) file containing one integer value */
-int
-eal_parse_sysfs_value(const char *filename, unsigned long *val)
-{
- FILE *f;
- char buf[BUFSIZ];
- char *end = NULL;
-
- if ((f = fopen(filename, "r")) == NULL) {
- RTE_LOG(ERR, EAL, "%s(): cannot open sysfs value %s\n",
- __func__, filename);
- return -1;
- }
-
- if (fgets(buf, sizeof(buf), f) == NULL) {
- RTE_LOG(ERR, EAL, "%s(): cannot read sysfs value %s\n",
- __func__, filename);
- fclose(f);
- return -1;
- }
- *val = strtoul(buf, &end, 0);
- if ((buf[0] == '\0') || (end == NULL) || (*end != '\n')) {
- RTE_LOG(ERR, EAL, "%s(): cannot parse sysfs value %s\n",
- __func__, filename);
- fclose(f);
- return -1;
- }
- fclose(f);
- return 0;
-}
-
-
-/* create memory configuration in shared/mmap memory. Take out
- * a write lock on the memsegs, so we can auto-detect primary/secondary.
- * This means we never close the file while running (auto-close on exit).
- * We also don't lock the whole file, so that in future we can use read-locks
- * on other parts, e.g. memzones, to detect if there are running secondary
- * processes. */
-static int
-rte_eal_config_create(void)
-{
- size_t page_sz = sysconf(_SC_PAGE_SIZE);
- size_t cfg_len = sizeof(*rte_config.mem_config);
- size_t cfg_len_aligned = RTE_ALIGN(cfg_len, page_sz);
- void *rte_mem_cfg_addr, *mapped_mem_cfg_addr;
- int retval;
-
- const char *pathname = eal_runtime_config_path();
-
- if (internal_config.no_shconf)
- return 0;
-
- /* map the config before hugepage address so that we don't waste a page */
- if (internal_config.base_virtaddr != 0)
- rte_mem_cfg_addr = (void *)
- RTE_ALIGN_FLOOR(internal_config.base_virtaddr -
- sizeof(struct rte_mem_config), page_sz);
- else
- rte_mem_cfg_addr = NULL;
-
- if (mem_cfg_fd < 0){
- mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0600);
- if (mem_cfg_fd < 0) {
- RTE_LOG(ERR, EAL, "Cannot open '%s' for rte_mem_config\n",
- pathname);
- return -1;
- }
- }
-
- retval = ftruncate(mem_cfg_fd, cfg_len);
- if (retval < 0){
- close(mem_cfg_fd);
- mem_cfg_fd = -1;
- RTE_LOG(ERR, EAL, "Cannot resize '%s' for rte_mem_config\n",
- pathname);
- return -1;
- }
-
- retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock);
- if (retval < 0){
- close(mem_cfg_fd);
- mem_cfg_fd = -1;
- RTE_LOG(ERR, EAL, "Cannot create lock on '%s'. Is another primary "
- "process running?\n", pathname);
- return -1;
- }
-
- /* reserve space for config */
- rte_mem_cfg_addr = eal_get_virtual_area(rte_mem_cfg_addr,
- &cfg_len_aligned, page_sz, 0, 0);
- if (rte_mem_cfg_addr == NULL) {
- RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config\n");
- close(mem_cfg_fd);
- mem_cfg_fd = -1;
- return -1;
- }
-
- /* remap the actual file into the space we've just reserved */
- mapped_mem_cfg_addr = mmap(rte_mem_cfg_addr,
- cfg_len_aligned, PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_FIXED, mem_cfg_fd, 0);
- if (mapped_mem_cfg_addr == MAP_FAILED) {
- munmap(rte_mem_cfg_addr, cfg_len);
- close(mem_cfg_fd);
- mem_cfg_fd = -1;
- RTE_LOG(ERR, EAL, "Cannot remap memory for rte_config\n");
- return -1;
- }
-
- memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config));
- rte_config.mem_config = rte_mem_cfg_addr;
-
- /* store address of the config in the config itself so that secondary
- * processes could later map the config into this exact location */
- rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr;
-
- rte_config.mem_config->dma_maskbits = 0;
-
- return 0;
-}
-
-/* attach to an existing shared memory config */
-static int
-rte_eal_config_attach(void)
-{
- struct rte_mem_config *mem_config;
-
- const char *pathname = eal_runtime_config_path();
-
- if (internal_config.no_shconf)
- return 0;
-
- if (mem_cfg_fd < 0){
- mem_cfg_fd = open(pathname, O_RDWR);
- if (mem_cfg_fd < 0) {
- RTE_LOG(ERR, EAL, "Cannot open '%s' for rte_mem_config\n",
- pathname);
- return -1;
- }
- }
-
- /* map it as read-only first */
- mem_config = (struct rte_mem_config *) mmap(NULL, sizeof(*mem_config),
- PROT_READ, MAP_SHARED, mem_cfg_fd, 0);
- if (mem_config == MAP_FAILED) {
- close(mem_cfg_fd);
- mem_cfg_fd = -1;
- RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config! error %i (%s)\n",
- errno, strerror(errno));
- return -1;
- }
-
- rte_config.mem_config = mem_config;
-
- return 0;
-}
-
-/* reattach the shared config at exact memory location primary process has it */
-static int
-rte_eal_config_reattach(void)
-{
- struct rte_mem_config *mem_config;
- void *rte_mem_cfg_addr;
-
- if (internal_config.no_shconf)
- return 0;
-
- /* save the address primary process has mapped shared config to */
- rte_mem_cfg_addr = (void *) (uintptr_t) rte_config.mem_config->mem_cfg_addr;
-
- /* unmap original config */
- munmap(rte_config.mem_config, sizeof(struct rte_mem_config));
-
- /* remap the config at proper address */
- mem_config = (struct rte_mem_config *) mmap(rte_mem_cfg_addr,
- sizeof(*mem_config), PROT_READ | PROT_WRITE, MAP_SHARED,
- mem_cfg_fd, 0);
-
- close(mem_cfg_fd);
- mem_cfg_fd = -1;
-
- if (mem_config == MAP_FAILED || mem_config != rte_mem_cfg_addr) {
- if (mem_config != MAP_FAILED) {
- /* errno is stale, don't use */
- RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config at [%p], got [%p]"
- " - please use '--" OPT_BASE_VIRTADDR
- "' option\n", rte_mem_cfg_addr, mem_config);
- munmap(mem_config, sizeof(struct rte_mem_config));
- return -1;
- }
- RTE_LOG(ERR, EAL, "Cannot mmap memory for rte_config! error %i (%s)\n",
- errno, strerror(errno));
- return -1;
- }
-
- rte_config.mem_config = mem_config;
-
- return 0;
-}
-
-/* Detect if we are a primary or a secondary process */
-enum rte_proc_type_t
-eal_proc_type_detect(void)
-{
- enum rte_proc_type_t ptype = RTE_PROC_PRIMARY;
- const char *pathname = eal_runtime_config_path();
-
- /* if there no shared config, there can be no secondary processes */
- if (!internal_config.no_shconf) {
- /* if we can open the file but not get a write-lock we are a
- * secondary process. NOTE: if we get a file handle back, we
- * keep that open and don't close it to prevent a race condition
- * between multiple opens.
- */
- if (((mem_cfg_fd = open(pathname, O_RDWR)) >= 0) &&
- (fcntl(mem_cfg_fd, F_SETLK, &wr_lock) < 0))
- ptype = RTE_PROC_SECONDARY;
- }
-
- RTE_LOG(INFO, EAL, "Auto-detected process type: %s\n",
- ptype == RTE_PROC_PRIMARY ? "PRIMARY" : "SECONDARY");
-
- return ptype;
-}
-
-/* Sets up rte_config structure with the pointer to shared memory config.*/
-static int
-rte_config_init(void)
-{
- rte_config.process_type = internal_config.process_type;
-
- switch (rte_config.process_type){
- case RTE_PROC_PRIMARY:
- if (rte_eal_config_create() < 0)
- return -1;
- eal_mcfg_update_from_internal();
- break;
- case RTE_PROC_SECONDARY:
- if (rte_eal_config_attach() < 0)
- return -1;
- eal_mcfg_wait_complete();
- if (eal_mcfg_check_version() < 0) {
- RTE_LOG(ERR, EAL, "Primary and secondary process DPDK version mismatch\n");
- return -1;
- }
- if (rte_eal_config_reattach() < 0)
- return -1;
- eal_mcfg_update_internal();
- break;
- case RTE_PROC_AUTO:
- case RTE_PROC_INVALID:
- RTE_LOG(ERR, EAL, "Invalid process type %d\n",
- rte_config.process_type);
- return -1;
- }
-
- return 0;
-}
-
-/* Unlocks hugepage directories that were locked by eal_hugepage_info_init */
-static void
-eal_hugedirs_unlock(void)
-{
- int i;
-
- for (i = 0; i < MAX_HUGEPAGE_SIZES; i++)
- {
- /* skip uninitialized */
- if (internal_config.hugepage_info[i].lock_descriptor < 0)
- continue;
- /* unlock hugepage file */
- flock(internal_config.hugepage_info[i].lock_descriptor, LOCK_UN);
- close(internal_config.hugepage_info[i].lock_descriptor);
- /* reset the field */
- internal_config.hugepage_info[i].lock_descriptor = -1;
- }
-}
-
-/* display usage */
-static void
-eal_usage(const char *prgname)
-{
- printf("\nUsage: %s ", prgname);
- eal_common_usage();
- printf("EAL Linux options:\n"
- " --"OPT_SOCKET_MEM" Memory to allocate on sockets (comma separated values)\n"
- " --"OPT_SOCKET_LIMIT" Limit memory allocation on sockets (comma separated values)\n"
- " --"OPT_HUGE_DIR" Directory where hugetlbfs is mounted\n"
- " --"OPT_FILE_PREFIX" Prefix for hugepage filenames\n"
- " --"OPT_CREATE_UIO_DEV" Create /dev/uioX (usually done by hotplug)\n"
- " --"OPT_VFIO_INTR" Interrupt mode for VFIO (legacy|msi|msix)\n"
- " --"OPT_LEGACY_MEM" Legacy memory mode (no dynamic allocation, contiguous segments)\n"
- " --"OPT_SINGLE_FILE_SEGMENTS" Put all hugepage memory in single files\n"
- " --"OPT_MATCH_ALLOCATIONS" Free hugepages exactly as allocated\n"
- "\n");
- /* Allow the application to print its usage message too if hook is set */
- if ( rte_application_usage_hook ) {
- printf("===== Application Usage =====\n\n");
- rte_application_usage_hook(prgname);
- }
-}
-
-/* Set a per-application usage message */
-rte_usage_hook_t
-rte_set_application_usage_hook(rte_usage_hook_t usage_func)
-{
- rte_usage_hook_t old_func;
-
- /* Will be NULL on the first call to denote the last usage routine. */
- old_func = rte_application_usage_hook;
- rte_application_usage_hook = usage_func;
-
- return old_func;
-}
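/*
 * Usage sketch (editorial, assumed application code - "app_usage" and
 * "app_install_usage_hook" are hypothetical names): installing a hook
 * makes eal_usage() above append application-specific help after the
 * EAL options.
 */
static void
app_usage(const char *prgname)
{
	printf("%s [EAL options] -- [application options]\n", prgname);
}

static void
app_install_usage_hook(void)
{
	/* returns the previously installed hook, NULL on first call */
	rte_set_application_usage_hook(app_usage);
}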
-
-static int
-eal_parse_socket_arg(char *strval, volatile uint64_t *socket_arg)
-{
- char * arg[RTE_MAX_NUMA_NODES];
- char *end;
- int arg_num, i, len;
- uint64_t total_mem = 0;
-
- len = strnlen(strval, SOCKET_MEM_STRLEN);
- if (len == SOCKET_MEM_STRLEN) {
- RTE_LOG(ERR, EAL, "--socket-mem is too long\n");
- return -1;
- }
-
- /* all other error cases will be caught later */
- if (!isdigit(strval[len-1]))
- return -1;
-
- /* split the optarg into separate socket values */
- arg_num = rte_strsplit(strval, len,
- arg, RTE_MAX_NUMA_NODES, ',');
-
- /* if split failed, or 0 arguments */
- if (arg_num <= 0)
- return -1;
-
- /* parse each defined socket option */
- errno = 0;
- for (i = 0; i < arg_num; i++) {
- uint64_t val;
- end = NULL;
- val = strtoull(arg[i], &end, 10);
-
- /* check for invalid input */
- if ((errno != 0) ||
- (arg[i][0] == '\0') || (end == NULL) || (*end != '\0'))
- return -1;
- val <<= 20;
- total_mem += val;
- socket_arg[i] = val;
- }
-
- return 0;
-}
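/*
 * Worked example (editorial): with --socket-mem=1024,2048 this function
 * receives "1024,2048", splits it on ',' and stores the per-socket sizes
 * shifted from megabytes into bytes:
 *
 *	socket_arg[0] = 1024ULL << 20;	(1 GiB for socket 0)
 *	socket_arg[1] = 2048ULL << 20;	(2 GiB for socket 1)
 */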
-
-static int
-eal_parse_vfio_intr(const char *mode)
-{
- unsigned i;
- static struct {
- const char *name;
- enum rte_intr_mode value;
- } map[] = {
- { "legacy", RTE_INTR_MODE_LEGACY },
- { "msi", RTE_INTR_MODE_MSI },
- { "msix", RTE_INTR_MODE_MSIX },
- };
-
- for (i = 0; i < RTE_DIM(map); i++) {
- if (!strcmp(mode, map[i].name)) {
- internal_config.vfio_intr_mode = map[i].value;
- return 0;
- }
- }
- return -1;
-}
-
-/* Parse the arguments for --log-level only */
-static void
-eal_log_level_parse(int argc, char **argv)
-{
- int opt;
- char **argvopt;
- int option_index;
- const int old_optind = optind;
- const int old_optopt = optopt;
- char * const old_optarg = optarg;
-
- argvopt = argv;
- optind = 1;
-
- while ((opt = getopt_long(argc, argvopt, eal_short_options,
- eal_long_options, &option_index)) != EOF) {
-
- int ret;
-
- /* getopt is not happy, stop right now */
- if (opt == '?')
- break;
-
- ret = (opt == OPT_LOG_LEVEL_NUM) ?
- eal_parse_common_option(opt, optarg, &internal_config) : 0;
-
- /* common parser is not happy */
- if (ret < 0)
- break;
- }
-
- /* restore getopt lib */
- optind = old_optind;
- optopt = old_optopt;
- optarg = old_optarg;
-}
-
-/* Parse the argument given in the command line of the application */
-static int
-eal_parse_args(int argc, char **argv)
-{
- int opt, ret;
- char **argvopt;
- int option_index;
- char *prgname = argv[0];
- const int old_optind = optind;
- const int old_optopt = optopt;
- char * const old_optarg = optarg;
-
- argvopt = argv;
- optind = 1;
- opterr = 0;
-
- while ((opt = getopt_long(argc, argvopt, eal_short_options,
- eal_long_options, &option_index)) != EOF) {
-
- /*
- * getopt didn't recognise the option, let's parse the
- * registered options to see if the flag is valid
- */
- if (opt == '?') {
- ret = rte_option_parse(argv[optind-1]);
- if (ret == 0)
- continue;
-
- eal_usage(prgname);
- ret = -1;
- goto out;
- }
-
- ret = eal_parse_common_option(opt, optarg, &internal_config);
- /* common parser is not happy */
- if (ret < 0) {
- eal_usage(prgname);
- ret = -1;
- goto out;
- }
- /* common parser handled this option */
- if (ret == 0)
- continue;
-
- switch (opt) {
- case 'h':
- eal_usage(prgname);
- exit(EXIT_SUCCESS);
-
- case OPT_HUGE_DIR_NUM:
- {
- char *hdir = strdup(optarg);
- if (hdir == NULL)
- RTE_LOG(ERR, EAL, "Could not store hugepage directory\n");
- else {
- /* free old hugepage dir */
- if (internal_config.hugepage_dir != NULL)
- free(internal_config.hugepage_dir);
- internal_config.hugepage_dir = hdir;
- }
- break;
- }
- case OPT_FILE_PREFIX_NUM:
- {
- char *prefix = strdup(optarg);
- if (prefix == NULL)
- RTE_LOG(ERR, EAL, "Could not store file prefix\n");
- else {
- /* free old prefix */
- if (internal_config.hugefile_prefix != NULL)
- free(internal_config.hugefile_prefix);
- internal_config.hugefile_prefix = prefix;
- }
- break;
- }
- case OPT_SOCKET_MEM_NUM:
- if (eal_parse_socket_arg(optarg,
- internal_config.socket_mem) < 0) {
- RTE_LOG(ERR, EAL, "invalid parameters for --"
- OPT_SOCKET_MEM "\n");
- eal_usage(prgname);
- ret = -1;
- goto out;
- }
- internal_config.force_sockets = 1;
- break;
-
- case OPT_SOCKET_LIMIT_NUM:
- if (eal_parse_socket_arg(optarg,
- internal_config.socket_limit) < 0) {
- RTE_LOG(ERR, EAL, "invalid parameters for --"
- OPT_SOCKET_LIMIT "\n");
- eal_usage(prgname);
- ret = -1;
- goto out;
- }
- internal_config.force_socket_limits = 1;
- break;
-
- case OPT_VFIO_INTR_NUM:
- if (eal_parse_vfio_intr(optarg) < 0) {
- RTE_LOG(ERR, EAL, "invalid parameters for --"
- OPT_VFIO_INTR "\n");
- eal_usage(prgname);
- ret = -1;
- goto out;
- }
- break;
-
- case OPT_CREATE_UIO_DEV_NUM:
- internal_config.create_uio_dev = 1;
- break;
-
- case OPT_MBUF_POOL_OPS_NAME_NUM:
- {
- char *ops_name = strdup(optarg);
- if (ops_name == NULL)
- RTE_LOG(ERR, EAL, "Could not store mbuf pool ops name\n");
- else {
- /* free old ops name */
- if (internal_config.user_mbuf_pool_ops_name !=
- NULL)
- free(internal_config.user_mbuf_pool_ops_name);
-
- internal_config.user_mbuf_pool_ops_name =
- ops_name;
- }
- break;
- }
- case OPT_MATCH_ALLOCATIONS_NUM:
- internal_config.match_allocations = 1;
- break;
-
- default:
- if (opt < OPT_LONG_MIN_NUM && isprint(opt)) {
- RTE_LOG(ERR, EAL, "Option %c is not supported "
- "on Linux\n", opt);
- } else if (opt >= OPT_LONG_MIN_NUM &&
- opt < OPT_LONG_MAX_NUM) {
- RTE_LOG(ERR, EAL, "Option %s is not supported "
- "on Linux\n",
- eal_long_options[option_index].name);
- } else {
- RTE_LOG(ERR, EAL, "Option %d is not supported "
- "on Linux\n", opt);
- }
- eal_usage(prgname);
- ret = -1;
- goto out;
- }
- }
-
- /* create runtime data directory */
- if (internal_config.no_shconf == 0 &&
- eal_create_runtime_dir() < 0) {
- RTE_LOG(ERR, EAL, "Cannot create runtime directory\n");
- ret = -1;
- goto out;
- }
-
- if (eal_adjust_config(&internal_config) != 0) {
- ret = -1;
- goto out;
- }
-
- /* sanity checks */
- if (eal_check_common_options(&internal_config) != 0) {
- eal_usage(prgname);
- ret = -1;
- goto out;
- }
-
- if (optind >= 0)
- argv[optind-1] = prgname;
- ret = optind-1;
-
-out:
- /* restore getopt lib */
- optind = old_optind;
- optopt = old_optopt;
- optarg = old_optarg;
-
- return ret;
-}
-
-static int
-check_socket(const struct rte_memseg_list *msl, void *arg)
-{
- int *socket_id = arg;
-
- if (msl->external)
- return 0;
-
- return *socket_id == msl->socket_id;
-}
-
-static void
-eal_check_mem_on_local_socket(void)
-{
- int socket_id;
-
- socket_id = rte_lcore_to_socket_id(rte_config.master_lcore);
-
- if (rte_memseg_list_walk(check_socket, &socket_id) == 0)
- RTE_LOG(WARNING, EAL, "WARNING: Master core has no memory on local socket!\n");
-}
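/*
 * Minimal sketch (editorial) of the walk-callback convention used above:
 * rte_memseg_list_walk() invokes the callback for every memseg list and
 * stops as soon as the callback returns non-zero, which then becomes the
 * walk's own return value - hence check_socket() returning 1 on a match.
 */
static int
count_lists(const struct rte_memseg_list *msl __rte_unused, void *arg)
{
	int *count = arg;	/* hypothetical user-supplied accumulator */

	(*count)++;
	return 0;	/* 0 means keep walking */
}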
-
-static int
-sync_func(__attribute__((unused)) void *arg)
-{
- return 0;
-}
-
-/*
- * Request iopl privilege for all RPLs; returns 0 on success.
- * The iopl() call is mostly for the i386 architecture. For other
- * architectures, return -1 to indicate that IO privilege can't be
- * changed this way.
- */
-int
-rte_eal_iopl_init(void)
-{
-#if defined(RTE_ARCH_X86)
- if (iopl(3) != 0)
- return -1;
-#endif
- return 0;
-}
-
-#ifdef VFIO_PRESENT
-static int rte_eal_vfio_setup(void)
-{
- if (rte_vfio_enable("vfio"))
- return -1;
-
- return 0;
-}
-#endif
-
-static void rte_eal_init_alert(const char *msg)
-{
- fprintf(stderr, "EAL: FATAL: %s\n", msg);
- RTE_LOG(ERR, EAL, "%s\n", msg);
-}
-
-/*
- * On Linux 3.6+, even if VFIO is not loaded, whenever IOMMU is enabled in the
- * BIOS and in the kernel, /sys/kernel/iommu_groups path will contain kernel
- * IOMMU groups. If IOMMU is not enabled, that path would be empty.
- * Therefore, checking if the path is empty will tell us if IOMMU is enabled.
- */
-static bool
-is_iommu_enabled(void)
-{
- DIR *dir = opendir(KERNEL_IOMMU_GROUPS_PATH);
- struct dirent *d;
- int n = 0;
-
- /* if directory doesn't exist, assume IOMMU is not enabled */
- if (dir == NULL)
- return false;
-
- while ((d = readdir(dir)) != NULL) {
- /* skip dot and dot-dot */
- if (++n > 2)
- break;
- }
- closedir(dir);
-
- return n > 2;
-}
-
-/* Launch threads, called at application init(). */
-int
-rte_eal_init(int argc, char **argv)
-{
- int i, fctret, ret;
- pthread_t thread_id;
- static rte_atomic32_t run_once = RTE_ATOMIC32_INIT(0);
- const char *p;
- static char logid[PATH_MAX];
- char cpuset[RTE_CPU_AFFINITY_STR_LEN];
- char thread_name[RTE_MAX_THREAD_NAME_LEN];
- bool phys_addrs;
-
- /* checks if the machine is adequate */
- if (!rte_cpu_is_supported()) {
- rte_eal_init_alert("unsupported cpu type.");
- rte_errno = ENOTSUP;
- return -1;
- }
-
- if (!rte_atomic32_test_and_set(&run_once)) {
- rte_eal_init_alert("already called initialization.");
- rte_errno = EALREADY;
- return -1;
- }
-
- p = strrchr(argv[0], '/');
- strlcpy(logid, p ? p + 1 : argv[0], sizeof(logid));
- thread_id = pthread_self();
-
- eal_reset_internal_config(&internal_config);
-
- /* set log level as early as possible */
- eal_log_level_parse(argc, argv);
-
- if (rte_eal_cpu_init() < 0) {
- rte_eal_init_alert("Cannot detect lcores.");
- rte_errno = ENOTSUP;
- return -1;
- }
-
- fctret = eal_parse_args(argc, argv);
- if (fctret < 0) {
- rte_eal_init_alert("Invalid 'command line' arguments.");
- rte_errno = EINVAL;
- rte_atomic32_clear(&run_once);
- return -1;
- }
-
- if (eal_plugins_init() < 0) {
- rte_eal_init_alert("Cannot init plugins");
- rte_errno = EINVAL;
- rte_atomic32_clear(&run_once);
- return -1;
- }
-
- if (eal_option_device_parse()) {
- rte_errno = ENODEV;
- rte_atomic32_clear(&run_once);
- return -1;
- }
-
- if (rte_config_init() < 0) {
- rte_eal_init_alert("Cannot init config");
- return -1;
- }
-
- if (rte_eal_intr_init() < 0) {
- rte_eal_init_alert("Cannot init interrupt-handling thread");
- return -1;
- }
-
- if (rte_eal_alarm_init() < 0) {
- rte_eal_init_alert("Cannot init alarm");
- /* rte_eal_alarm_init sets rte_errno on failure. */
- return -1;
- }
-
- /* Put mp channel init before bus scan so that we can init the vdev
- * bus through mp channel in the secondary process before the bus scan.
- */
- if (rte_mp_channel_init() < 0 && rte_errno != ENOTSUP) {
- rte_eal_init_alert("failed to init mp channel");
- if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
- rte_errno = EFAULT;
- return -1;
- }
- }
-
- /* register multi-process action callbacks for hotplug */
- if (eal_mp_dev_hotplug_init() < 0) {
- rte_eal_init_alert("failed to register mp callback for hotplug");
- return -1;
- }
-
- if (rte_bus_scan()) {
- rte_eal_init_alert("Cannot scan the buses for devices");
- rte_errno = ENODEV;
- rte_atomic32_clear(&run_once);
- return -1;
- }
-
- phys_addrs = rte_eal_using_phys_addrs() != 0;
-
- /* if no EAL option "--iova-mode=<pa|va>", use bus IOVA scheme */
- if (internal_config.iova_mode == RTE_IOVA_DC) {
- /* autodetect the IOVA mapping mode */
- enum rte_iova_mode iova_mode = rte_bus_get_iommu_class();
-
- if (iova_mode == RTE_IOVA_DC) {
- RTE_LOG(DEBUG, EAL, "Buses did not request a specific IOVA mode.\n");
-
- if (!phys_addrs) {
- /* if we have no access to physical addresses,
- * pick IOVA as VA mode.
- */
- iova_mode = RTE_IOVA_VA;
- RTE_LOG(DEBUG, EAL, "Physical addresses are unavailable, selecting IOVA as VA mode.\n");
-#if defined(RTE_LIBRTE_KNI) && LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0)
- } else if (rte_eal_check_module("rte_kni") == 1) {
- iova_mode = RTE_IOVA_PA;
- RTE_LOG(DEBUG, EAL, "KNI is loaded, selecting IOVA as PA mode for better KNI perfomance.\n");
-#endif
- } else if (is_iommu_enabled()) {
- /* we have an IOMMU, pick IOVA as VA mode */
- iova_mode = RTE_IOVA_VA;
- RTE_LOG(DEBUG, EAL, "IOMMU is available, selecting IOVA as VA mode.\n");
- } else {
- /* physical addresses available, and no IOMMU
- * found, so pick IOVA as PA.
- */
- iova_mode = RTE_IOVA_PA;
- RTE_LOG(DEBUG, EAL, "IOMMU is not available, selecting IOVA as PA mode.\n");
- }
- }
-#if defined(RTE_LIBRTE_KNI) && LINUX_VERSION_CODE < KERNEL_VERSION(4, 10, 0)
- /* Workaround for KNI which requires physical address to work
- * in kernels < 4.10
- */
- if (iova_mode == RTE_IOVA_VA &&
- rte_eal_check_module("rte_kni") == 1) {
- if (phys_addrs) {
- iova_mode = RTE_IOVA_PA;
- RTE_LOG(WARNING, EAL, "Forcing IOVA as 'PA' because KNI module is loaded\n");
- } else {
- RTE_LOG(DEBUG, EAL, "KNI can not work since physical addresses are unavailable\n");
- }
- }
-#endif
- rte_eal_get_configuration()->iova_mode = iova_mode;
- } else {
- rte_eal_get_configuration()->iova_mode =
- internal_config.iova_mode;
- }
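/*
 * Editorial summary of the autodetection above (KNI special cases, which
 * are guarded by the #ifdef blocks, aside):
 *
 *	bus request	phys addrs	IOMMU	resulting IOVA mode
 *	-----------	----------	-----	-------------------
 *	PA or VA	-		-	as requested by buses
 *	don't care	no		-	VA
 *	don't care	yes		yes	VA
 *	don't care	yes		no	PA
 */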
-
- if (rte_eal_iova_mode() == RTE_IOVA_PA && !phys_addrs) {
- rte_eal_init_alert("Cannot use IOVA as 'PA' since physical addresses are not available");
- rte_errno = EINVAL;
- return -1;
- }
-
- RTE_LOG(INFO, EAL, "Selected IOVA mode '%s'\n",
- rte_eal_iova_mode() == RTE_IOVA_PA ? "PA" : "VA");
-
- if (internal_config.no_hugetlbfs == 0) {
- /* rte_config isn't initialized yet */
- ret = internal_config.process_type == RTE_PROC_PRIMARY ?
- eal_hugepage_info_init() :
- eal_hugepage_info_read();
- if (ret < 0) {
- rte_eal_init_alert("Cannot get hugepage information.");
- rte_errno = EACCES;
- rte_atomic32_clear(&run_once);
- return -1;
- }
- }
-
- if (internal_config.memory == 0 && internal_config.force_sockets == 0) {
- if (internal_config.no_hugetlbfs)
- internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE;
- }
-
- if (internal_config.vmware_tsc_map == 1) {
-#ifdef RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT
- rte_cycles_vmware_tsc_map = 1;
- RTE_LOG (DEBUG, EAL, "Using VMWARE TSC MAP, "
- "you must have monitor_control.pseudo_perfctr = TRUE\n");
-#else
- RTE_LOG (WARNING, EAL, "Ignoring --vmware-tsc-map because "
- "RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT is not set\n");
-#endif
- }
-
- if (rte_eal_log_init(logid, internal_config.syslog_facility) < 0) {
- rte_eal_init_alert("Cannot init logging.");
- rte_errno = ENOMEM;
- rte_atomic32_clear(&run_once);
- return -1;
- }
-
-#ifdef VFIO_PRESENT
- if (rte_eal_vfio_setup() < 0) {
- rte_eal_init_alert("Cannot init VFIO");
- rte_errno = EAGAIN;
- rte_atomic32_clear(&run_once);
- return -1;
- }
-#endif
- /* in secondary processes, memory init may allocate additional fbarrays
- * not present in primary processes, so to avoid any potential issues,
- * initialize memzones first.
- */
- if (rte_eal_memzone_init() < 0) {
- rte_eal_init_alert("Cannot init memzone");
- rte_errno = ENODEV;
- return -1;
- }
-
- if (rte_eal_memory_init() < 0) {
- rte_eal_init_alert("Cannot init memory");
- rte_errno = ENOMEM;
- return -1;
- }
-
- /* the directories are locked during eal_hugepage_info_init */
- eal_hugedirs_unlock();
-
- if (rte_eal_malloc_heap_init() < 0) {
- rte_eal_init_alert("Cannot init malloc heap");
- rte_errno = ENODEV;
- return -1;
- }
-
- if (rte_eal_tailqs_init() < 0) {
- rte_eal_init_alert("Cannot init tail queues for objects");
- rte_errno = EFAULT;
- return -1;
- }
-
- if (rte_eal_timer_init() < 0) {
- rte_eal_init_alert("Cannot init HPET or TSC timers");
- rte_errno = ENOTSUP;
- return -1;
- }
-
- eal_check_mem_on_local_socket();
-
- eal_thread_init_master(rte_config.master_lcore);
-
- ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset));
-
- RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%zx;cpuset=[%s%s])\n",
- rte_config.master_lcore, (uintptr_t)thread_id, cpuset,
- ret == 0 ? "" : "...");
-
- RTE_LCORE_FOREACH_SLAVE(i) {
-
- /*
- * create communication pipes between master thread
- * and children
- */
- if (pipe(lcore_config[i].pipe_master2slave) < 0)
- rte_panic("Cannot create pipe\n");
- if (pipe(lcore_config[i].pipe_slave2master) < 0)
- rte_panic("Cannot create pipe\n");
-
- lcore_config[i].state = WAIT;
-
- /* create a thread for each lcore */
- ret = pthread_create(&lcore_config[i].thread_id, NULL,
- eal_thread_loop, NULL);
- if (ret != 0)
- rte_panic("Cannot create thread\n");
-
- /* Set thread_name for aid in debugging. */
- snprintf(thread_name, sizeof(thread_name),
- "lcore-slave-%d", i);
- ret = rte_thread_setname(lcore_config[i].thread_id,
- thread_name);
- if (ret != 0)
- RTE_LOG(DEBUG, EAL,
- "Cannot set name for lcore thread\n");
- }
-
- /*
- * Launch a dummy function on all slave lcores, so that master lcore
- * knows they are all ready when this function returns.
- */
- rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER);
- rte_eal_mp_wait_lcore();
-
- /* initialize services so vdevs register service during bus_probe. */
- ret = rte_service_init();
- if (ret) {
- rte_eal_init_alert("rte_service_init() failed");
- rte_errno = ENOEXEC;
- return -1;
- }
-
- /* Probe all the buses and devices/drivers on them */
- if (rte_bus_probe()) {
- rte_eal_init_alert("Cannot probe devices");
- rte_errno = ENOTSUP;
- return -1;
- }
-
-#ifdef VFIO_PRESENT
- /* Register mp action after probe() so that we have enough info */
- if (rte_vfio_is_enabled("vfio") && vfio_mp_sync_setup() < 0)
- return -1;
-#endif
-
- /* initialize default service/lcore mappings and start running. Ignore
- * -ENOTSUP, as it indicates no service coremask passed to EAL.
- */
- ret = rte_service_start_with_defaults();
- if (ret < 0 && ret != -ENOTSUP) {
- rte_errno = ENOEXEC;
- return -1;
- }
-
- /*
- * Clean up unused files in runtime directory. We do this at the end of
- * init and not at the beginning because we want to clean stuff up
- * whether we are primary or secondary process, but we cannot remove
- * primary process' files because secondary should be able to run even
- * if primary process is dead.
- *
- * In no_shconf mode, no runtime directory is created in the first
- * place, so no cleanup needed.
- */
- if (!internal_config.no_shconf && eal_clean_runtime_dir() < 0) {
- rte_eal_init_alert("Cannot clear runtime directory\n");
- return -1;
- }
-
- eal_mcfg_complete();
-
- /* Call each registered callback, if enabled */
- rte_option_init();
-
- return fctret;
-}
-
-static int
-mark_freeable(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
- void *arg __rte_unused)
-{
- /* ms is const, so find this memseg */
- struct rte_memseg *found;
-
- if (msl->external)
- return 0;
-
- found = rte_mem_virt2memseg(ms->addr, msl);
-
- found->flags &= ~RTE_MEMSEG_FLAG_DO_NOT_FREE;
-
- return 0;
-}
-
-int
-rte_eal_cleanup(void)
-{
- /* if we're in a primary process, we need to mark hugepages as freeable
- * so that finalization can release them back to the system.
- */
- if (rte_eal_process_type() == RTE_PROC_PRIMARY)
- rte_memseg_walk(mark_freeable, NULL);
- rte_service_finalize();
- rte_mp_channel_cleanup();
- eal_cleanup_config(&internal_config);
- return 0;
-}
-
-enum rte_proc_type_t
-rte_eal_process_type(void)
-{
- return rte_config.process_type;
-}
-
-int rte_eal_has_hugepages(void)
-{
- return ! internal_config.no_hugetlbfs;
-}
-
-int rte_eal_has_pci(void)
-{
- return !internal_config.no_pci;
-}
-
-int rte_eal_create_uio_dev(void)
-{
- return internal_config.create_uio_dev;
-}
-
-enum rte_intr_mode
-rte_eal_vfio_intr_mode(void)
-{
- return internal_config.vfio_intr_mode;
-}
-
-int
-rte_eal_check_module(const char *module_name)
-{
- char sysfs_mod_name[PATH_MAX];
- struct stat st;
- int n;
-
- if (NULL == module_name)
- return -1;
-
- /* Check if there is sysfs mounted */
- if (stat("/sys/module", &st) != 0) {
- RTE_LOG(DEBUG, EAL, "sysfs is not mounted! error %i (%s)\n",
- errno, strerror(errno));
- return -1;
- }
-
- /* A module might be built-in, therefore try sysfs */
- n = snprintf(sysfs_mod_name, PATH_MAX, "/sys/module/%s", module_name);
- /* snprintf reports truncation by returning a length >= PATH_MAX */
- if (n < 0 || n >= PATH_MAX) {
- RTE_LOG(DEBUG, EAL, "Could not format module path\n");
- return -1;
- }
-
- if (stat(sysfs_mod_name, &st) != 0) {
- RTE_LOG(DEBUG, EAL, "Module %s not found! error %i (%s)\n",
- sysfs_mod_name, errno, strerror(errno));
- return 0;
- }
-
- /* Module has been found */
- return 1;
-}
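/*
 * Usage note (editorial): rte_eal_check_module("rte_kni") returns 1 when
 * /sys/module/rte_kni exists, 0 when it does not, and -1 on error - see
 * the IOVA autodetection in rte_eal_init() above for a real call site.
 */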
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
- */
-#include <stdio.h>
-#include <stdint.h>
-#include <signal.h>
-#include <errno.h>
-#include <string.h>
-#include <sys/queue.h>
-#include <sys/time.h>
-#include <sys/timerfd.h>
-
-#include <rte_memory.h>
-#include <rte_interrupts.h>
-#include <rte_alarm.h>
-#include <rte_common.h>
-#include <rte_per_lcore.h>
-#include <rte_eal.h>
-#include <rte_launch.h>
-#include <rte_lcore.h>
-#include <rte_errno.h>
-#include <rte_spinlock.h>
-#include <eal_private.h>
-
-#ifndef TFD_NONBLOCK
-#include <fcntl.h>
-#define TFD_NONBLOCK O_NONBLOCK
-#endif
-
-#define NS_PER_US 1000
-#define US_PER_MS 1000
-#define MS_PER_S 1000
-#ifndef US_PER_S
-#define US_PER_S (US_PER_MS * MS_PER_S)
-#endif
-
-#ifdef CLOCK_MONOTONIC_RAW /* Defined in glibc bits/time.h */
-#define CLOCK_TYPE_ID CLOCK_MONOTONIC_RAW
-#else
-#define CLOCK_TYPE_ID CLOCK_MONOTONIC
-#endif
-
-struct alarm_entry {
- LIST_ENTRY(alarm_entry) next;
- struct timeval time;
- rte_eal_alarm_callback cb_fn;
- void *cb_arg;
- volatile uint8_t executing;
- volatile pthread_t executing_id;
-};
-
-static LIST_HEAD(alarm_list, alarm_entry) alarm_list = LIST_HEAD_INITIALIZER();
-static rte_spinlock_t alarm_list_lk = RTE_SPINLOCK_INITIALIZER;
-
-static struct rte_intr_handle intr_handle = {.fd = -1 };
-static int handler_registered = 0;
-static void eal_alarm_callback(void *arg);
-
-int
-rte_eal_alarm_init(void)
-{
- intr_handle.type = RTE_INTR_HANDLE_ALARM;
- /* create a timerfd file descriptor */
- intr_handle.fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);
- if (intr_handle.fd == -1)
- goto error;
-
- return 0;
-
-error:
- rte_errno = errno;
- return -1;
-}
-
-static void
-eal_alarm_callback(void *arg __rte_unused)
-{
- struct timespec now;
- struct alarm_entry *ap;
-
- rte_spinlock_lock(&alarm_list_lk);
- while ((ap = LIST_FIRST(&alarm_list)) != NULL &&
- clock_gettime(CLOCK_TYPE_ID, &now) == 0 &&
- (ap->time.tv_sec < now.tv_sec || (ap->time.tv_sec == now.tv_sec &&
- (ap->time.tv_usec * NS_PER_US) <= now.tv_nsec))) {
- ap->executing = 1;
- ap->executing_id = pthread_self();
- rte_spinlock_unlock(&alarm_list_lk);
-
- ap->cb_fn(ap->cb_arg);
-
- rte_spinlock_lock(&alarm_list_lk);
-
- LIST_REMOVE(ap, next);
- free(ap);
- }
-
- if (!LIST_EMPTY(&alarm_list)) {
- struct itimerspec atime = { .it_interval = { 0, 0 } };
-
- ap = LIST_FIRST(&alarm_list);
- atime.it_value.tv_sec = ap->time.tv_sec;
- atime.it_value.tv_nsec = ap->time.tv_usec * NS_PER_US;
- /* perform borrow for subtraction if necessary */
- if (now.tv_nsec > (ap->time.tv_usec * NS_PER_US))
- atime.it_value.tv_sec--, atime.it_value.tv_nsec += US_PER_S * NS_PER_US;
-
- atime.it_value.tv_sec -= now.tv_sec;
- atime.it_value.tv_nsec -= now.tv_nsec;
- timerfd_settime(intr_handle.fd, 0, &atime, NULL);
- }
- rte_spinlock_unlock(&alarm_list_lk);
-}
-
-int
-rte_eal_alarm_set(uint64_t us, rte_eal_alarm_callback cb_fn, void *cb_arg)
-{
- struct timespec now;
- int ret = 0;
- struct alarm_entry *ap, *new_alarm;
-
- /* Check parameters, including that us won't cause a uint64_t overflow */
- if (us < 1 || us > (UINT64_MAX - US_PER_S) || cb_fn == NULL)
- return -EINVAL;
-
- new_alarm = calloc(1, sizeof(*new_alarm));
- if (new_alarm == NULL)
- return -ENOMEM;
-
- /* use current time to calculate absolute time of alarm */
- clock_gettime(CLOCK_TYPE_ID, &now);
-
- new_alarm->cb_fn = cb_fn;
- new_alarm->cb_arg = cb_arg;
- new_alarm->time.tv_usec = ((now.tv_nsec / NS_PER_US) + us) % US_PER_S;
- new_alarm->time.tv_sec = now.tv_sec + (((now.tv_nsec / NS_PER_US) + us) / US_PER_S);
-
- rte_spinlock_lock(&alarm_list_lk);
- if (!handler_registered) {
- /* registration can fail, callback can be registered later */
- if (rte_intr_callback_register(&intr_handle,
- eal_alarm_callback, NULL) == 0)
- handler_registered = 1;
- }
-
- if (LIST_EMPTY(&alarm_list))
- LIST_INSERT_HEAD(&alarm_list, new_alarm, next);
- else {
- LIST_FOREACH(ap, &alarm_list, next) {
- if (ap->time.tv_sec > new_alarm->time.tv_sec ||
- (ap->time.tv_sec == new_alarm->time.tv_sec &&
- ap->time.tv_usec > new_alarm->time.tv_usec)){
- LIST_INSERT_BEFORE(ap, new_alarm, next);
- break;
- }
- if (LIST_NEXT(ap, next) == NULL) {
- LIST_INSERT_AFTER(ap, new_alarm, next);
- break;
- }
- }
- }
-
- if (LIST_FIRST(&alarm_list) == new_alarm) {
- struct itimerspec alarm_time = {
- .it_interval = {0, 0},
- .it_value = {
- .tv_sec = us / US_PER_S,
- .tv_nsec = (us % US_PER_S) * NS_PER_US,
- },
- };
- ret |= timerfd_settime(intr_handle.fd, 0, &alarm_time, NULL);
- }
- rte_spinlock_unlock(&alarm_list_lk);
-
- return ret;
-}
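/*
 * Usage sketch (editorial): alarms are one-shot, so a periodic callback
 * must re-arm itself; "periodic_cb" is a hypothetical name. The callback
 * runs in the interrupt thread with the alarm list lock released, so
 * calling rte_eal_alarm_set() from inside it is safe.
 */
static void
periodic_cb(void *arg)
{
	/* ... do the periodic work, then re-arm 100 ms from now ... */
	rte_eal_alarm_set(100 * US_PER_MS, periodic_cb, arg);
}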
-
-int
-rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn, void *cb_arg)
-{
- struct alarm_entry *ap, *ap_prev;
- int count = 0;
- int err = 0;
- int executing;
-
- if (!cb_fn) {
- rte_errno = EINVAL;
- return -1;
- }
-
- do {
- executing = 0;
- rte_spinlock_lock(&alarm_list_lk);
- /* remove any matches at the start of the list */
- while ((ap = LIST_FIRST(&alarm_list)) != NULL &&
- cb_fn == ap->cb_fn &&
- (cb_arg == (void *)-1 || cb_arg == ap->cb_arg)) {
-
- if (ap->executing == 0) {
- LIST_REMOVE(ap, next);
- free(ap);
- count++;
- } else {
- /* If calling from another context, mark that the alarm is
- * executing so the loop can spin until it finishes. Otherwise
- * we are trying to cancel ourselves - mark it with EINPROGRESS. */
- if (pthread_equal(ap->executing_id, pthread_self()) == 0)
- executing++;
- else
- err = EINPROGRESS;
-
- break;
- }
- }
- ap_prev = ap;
-
- /* now go through list, removing entries not at start */
- LIST_FOREACH(ap, &alarm_list, next) {
- /* this won't be true first time through */
- if (cb_fn == ap->cb_fn &&
- (cb_arg == (void *)-1 || cb_arg == ap->cb_arg)) {
-
- if (ap->executing == 0) {
- LIST_REMOVE(ap, next);
- free(ap);
- count++;
- ap = ap_prev;
- } else if (pthread_equal(ap->executing_id, pthread_self()) == 0)
- executing++;
- else
- err = EINPROGRESS;
- }
- ap_prev = ap;
- }
- rte_spinlock_unlock(&alarm_list_lk);
- } while (executing != 0);
-
- if (count == 0 && err == 0)
- rte_errno = ENOENT;
- else if (err)
- rte_errno = err;
-
- return count;
-}
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright 2018 Red Hat, Inc.
- */
-
-#include <elf.h>
-#include <fcntl.h>
-#include <string.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-#if defined(__GLIBC__) && defined(__GLIBC_PREREQ)
-#if __GLIBC_PREREQ(2, 16)
-#include <sys/auxv.h>
-#define HAS_AUXV 1
-#endif
-#endif
-
-#include <rte_cpuflags.h>
-
-#ifndef HAS_AUXV
-static unsigned long
-getauxval(unsigned long type __rte_unused)
-{
- errno = ENOTSUP;
- return 0;
-}
-#endif
-
-#ifdef RTE_ARCH_64
-typedef Elf64_auxv_t Internal_Elfx_auxv_t;
-#else
-typedef Elf32_auxv_t Internal_Elfx_auxv_t;
-#endif
-
-/**
- * Provides a method for retrieving values from the auxiliary vector and
- * possibly running a string comparison.
- *
- * @return Always returns a result. When the result is 0, check errno
- * to see if an error occurred during processing.
- */
-static unsigned long
-_rte_cpu_getauxval(unsigned long type, const char *str)
-{
- unsigned long val;
-
- errno = 0;
- val = getauxval(type);
-
- if (!val && (errno == ENOTSUP || errno == ENOENT)) {
- int auxv_fd = open("/proc/self/auxv", O_RDONLY);
- Internal_Elfx_auxv_t auxv;
-
- if (auxv_fd == -1)
- return 0;
-
- errno = ENOENT;
- while (read(auxv_fd, &auxv, sizeof(auxv)) == sizeof(auxv)) {
- if (auxv.a_type == type) {
- errno = 0;
- val = auxv.a_un.a_val;
- if (str)
- val = strcmp((const char *)val, str);
- break;
- }
- }
- close(auxv_fd);
- }
-
- return val;
-}
-
-unsigned long
-rte_cpu_getauxval(unsigned long type)
-{
- return _rte_cpu_getauxval(type, NULL);
-}
-
-int
-rte_cpu_strcmp_auxval(unsigned long type, const char *str)
-{
- return _rte_cpu_getauxval(type, str);
-}
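/*
 * Usage sketch (editorial): AT_HWCAP is a standard auxiliary vector key
 * from <elf.h>; arch-specific cpuflags code uses it to fetch the
 * kernel-reported CPU capability bits, e.g.:
 *
 *	unsigned long hwcap = rte_cpu_getauxval(AT_HWCAP);
 */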
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
- */
-
-#ifdef RTE_BACKTRACE
-#include <execinfo.h>
-#endif
-#include <stdarg.h>
-#include <signal.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdint.h>
-
-#include <rte_log.h>
-#include <rte_debug.h>
-#include <rte_common.h>
-#include <rte_eal.h>
-
-#define BACKTRACE_SIZE 256
-
-/* dump the stack of the calling core */
-void rte_dump_stack(void)
-{
-#ifdef RTE_BACKTRACE
- void *func[BACKTRACE_SIZE];
- char **symb = NULL;
- int size;
-
- size = backtrace(func, BACKTRACE_SIZE);
- symb = backtrace_symbols(func, size);
-
- if (symb == NULL)
- return;
-
- while (size > 0) {
- rte_log(RTE_LOG_ERR, RTE_LOGTYPE_EAL,
- "%d: [%s]\n", size, symb[size - 1]);
- size--;
- }
-
- free(symb);
-#endif /* RTE_BACKTRACE */
-}
-
-/* not implemented in this environment */
-void rte_dump_registers(void)
-{
- return;
-}
-
-/* call abort(), it will generate a coredump if enabled */
-void __rte_panic(const char *funcname, const char *format, ...)
-{
- va_list ap;
-
- rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, "PANIC in %s():\n", funcname);
- va_start(ap, format);
- rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap);
- va_end(ap);
- rte_dump_stack();
- rte_dump_registers();
- abort();
-}
-
-/*
- * Like rte_panic this terminates the application. However, no traceback is
- * provided and no core-dump is generated.
- */
-void
-rte_exit(int exit_code, const char *format, ...)
-{
- va_list ap;
-
- if (exit_code != 0)
- RTE_LOG(CRIT, EAL, "Error - exiting with code: %d\n"
- " Cause: ", exit_code);
-
- va_start(ap, format);
- rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap);
- va_end(ap);
-
-#ifndef RTE_EAL_ALWAYS_PANIC_ON_ERROR
- if (rte_eal_cleanup() != 0)
- RTE_LOG(CRIT, EAL,
- "EAL could not release all resources\n");
- exit(exit_code);
-#else
- rte_dump_stack();
- rte_dump_registers();
- abort();
-#endif
-}
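/*
 * Usage sketch (editorial): the canonical application error path -
 *
 *	if (rte_eal_init(argc, argv) < 0)
 *		rte_exit(EXIT_FAILURE, "Cannot init EAL: %s\n",
 *			 rte_strerror(rte_errno));
 */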
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2018 Intel Corporation
- */
-
-#include <string.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <signal.h>
-#include <sys/socket.h>
-#include <linux/netlink.h>
-
-#include <rte_string_fns.h>
-#include <rte_log.h>
-#include <rte_compat.h>
-#include <rte_dev.h>
-#include <rte_malloc.h>
-#include <rte_interrupts.h>
-#include <rte_alarm.h>
-#include <rte_bus.h>
-#include <rte_eal.h>
-#include <rte_spinlock.h>
-#include <rte_errno.h>
-
-#include "eal_private.h"
-
-static struct rte_intr_handle intr_handle = {.fd = -1 };
-static bool monitor_started;
-static bool hotplug_handle;
-
-#define EAL_UEV_MSG_LEN 4096
-#define EAL_UEV_MSG_ELEM_LEN 128
-
-/*
- * Spinlock for device hot-unplug failure handling. Any code that accesses
- * the bus or a device - such as handling SIGBUS on a bus, or handling a
- * memory failure for a device - needs to take this lock. It protects the
- * bus and the device from race conditions.
- */
-static rte_spinlock_t failure_handle_lock = RTE_SPINLOCK_INITIALIZER;
-
-static struct sigaction sigbus_action_old;
-
-static int sigbus_need_recover;
-
-static void dev_uev_handler(__rte_unused void *param);
-
-/* identify the system layer which reports this event. */
-enum eal_dev_event_subsystem {
- EAL_DEV_EVENT_SUBSYSTEM_PCI, /* PCI bus device event */
- EAL_DEV_EVENT_SUBSYSTEM_UIO, /* UIO driver device event */
- EAL_DEV_EVENT_SUBSYSTEM_VFIO, /* VFIO driver device event */
- EAL_DEV_EVENT_SUBSYSTEM_MAX
-};
-
-static void
-sigbus_action_recover(void)
-{
- if (sigbus_need_recover) {
- sigaction(SIGBUS, &sigbus_action_old, NULL);
- sigbus_need_recover = 0;
- }
-}
-
-static void sigbus_handler(int signum, siginfo_t *info,
- void *ctx __rte_unused)
-{
- int ret;
-
- RTE_LOG(DEBUG, EAL, "Thread catch SIGBUS, fault address:%p\n",
- info->si_addr);
-
- rte_spinlock_lock(&failure_handle_lock);
- ret = rte_bus_sigbus_handler(info->si_addr);
- rte_spinlock_unlock(&failure_handle_lock);
- if (ret == -1) {
- rte_exit(EXIT_FAILURE,
- "Failed to handle SIGBUS for hot-unplug, "
- "(rte_errno: %s)!", strerror(rte_errno));
- } else if (ret == 1) {
- if (sigbus_action_old.sa_flags == SA_SIGINFO
- && sigbus_action_old.sa_sigaction) {
- (*(sigbus_action_old.sa_sigaction))(signum,
- info, ctx);
- } else if (sigbus_action_old.sa_flags != SA_SIGINFO
- && sigbus_action_old.sa_handler) {
- (*(sigbus_action_old.sa_handler))(signum);
- } else {
- rte_exit(EXIT_FAILURE,
- "Failed to handle generic SIGBUS!");
- }
- }
-
- RTE_LOG(DEBUG, EAL, "Success to handle SIGBUS for hot-unplug!\n");
-}
-
-static int cmp_dev_name(const struct rte_device *dev,
- const void *_name)
-{
- const char *name = _name;
-
- return strcmp(dev->name, name);
-}
-
-static int
-dev_uev_socket_fd_create(void)
-{
- struct sockaddr_nl addr;
- int ret;
-
- intr_handle.fd = socket(PF_NETLINK, SOCK_RAW | SOCK_CLOEXEC |
- SOCK_NONBLOCK,
- NETLINK_KOBJECT_UEVENT);
- if (intr_handle.fd < 0) {
- RTE_LOG(ERR, EAL, "create uevent fd failed.\n");
- return -1;
- }
-
- memset(&addr, 0, sizeof(addr));
- addr.nl_family = AF_NETLINK;
- addr.nl_pid = 0;
- addr.nl_groups = 0xffffffff;
-
- ret = bind(intr_handle.fd, (struct sockaddr *) &addr, sizeof(addr));
- if (ret < 0) {
- RTE_LOG(ERR, EAL, "Failed to bind uevent socket.\n");
- goto err;
- }
-
- return 0;
-err:
- close(intr_handle.fd);
- intr_handle.fd = -1;
- return ret;
-}
-
-static int
-dev_uev_parse(const char *buf, struct rte_dev_event *event, int length)
-{
- char action[EAL_UEV_MSG_ELEM_LEN];
- char subsystem[EAL_UEV_MSG_ELEM_LEN];
- char pci_slot_name[EAL_UEV_MSG_ELEM_LEN];
- int i = 0;
-
- memset(action, 0, EAL_UEV_MSG_ELEM_LEN);
- memset(subsystem, 0, EAL_UEV_MSG_ELEM_LEN);
- memset(pci_slot_name, 0, EAL_UEV_MSG_ELEM_LEN);
-
- while (i < length) {
- for (; i < length; i++) {
- if (*buf)
- break;
- buf++;
- }
- /*
- * only handle device uevents coming from the kernel side;
- * uevents generated by udev (libudev) are skipped.
- */
- if (!strncmp(buf, "libudev", 7)) {
- buf += 7;
- i += 7;
- return -1;
- }
- if (!strncmp(buf, "ACTION=", 7)) {
- buf += 7;
- i += 7;
- strlcpy(action, buf, sizeof(action));
- } else if (!strncmp(buf, "SUBSYSTEM=", 10)) {
- buf += 10;
- i += 10;
- strlcpy(subsystem, buf, sizeof(subsystem));
- } else if (!strncmp(buf, "PCI_SLOT_NAME=", 14)) {
- buf += 14;
- i += 14;
- strlcpy(pci_slot_name, buf, sizeof(pci_slot_name));
- event->devname = strdup(pci_slot_name);
- }
- for (; i < length; i++) {
- if (*buf == '\0')
- break;
- buf++;
- }
- }
-
- /* parse the subsystem layer */
- if (!strncmp(subsystem, "uio", 3))
- event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_UIO;
- else if (!strncmp(subsystem, "pci", 3))
- event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_PCI;
- else if (!strncmp(subsystem, "vfio", 4))
- event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_VFIO;
- else
- return -1;
-
- /* parse the action type */
- if (!strncmp(action, "add", 3))
- event->type = RTE_DEV_EVENT_ADD;
- else if (!strncmp(action, "remove", 6))
- event->type = RTE_DEV_EVENT_REMOVE;
- else
- return -1;
- return 0;
-}
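/*
 * Illustration (editorial): a kernel uevent arrives over the netlink
 * socket as NUL-separated tokens, e.g. for a PCI hot-unplug roughly
 *
 *	"remove@/devices/pci0000:00/0000:00:01.0" NUL
 *	"ACTION=remove" NUL "SUBSYSTEM=pci" NUL
 *	"PCI_SLOT_NAME=0000:00:01.0" NUL ...
 *
 * dev_uev_parse() above skips the NULs between tokens and keys off the
 * ACTION=, SUBSYSTEM= and PCI_SLOT_NAME= prefixes (the device path and
 * slot name here are hypothetical).
 */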
-
-static void
-dev_delayed_unregister(void *param)
-{
- rte_intr_callback_unregister(&intr_handle, dev_uev_handler, param);
- close(intr_handle.fd);
- intr_handle.fd = -1;
-}
-
-static void
-dev_uev_handler(__rte_unused void *param)
-{
- struct rte_dev_event uevent;
- int ret;
- char buf[EAL_UEV_MSG_LEN];
- struct rte_bus *bus;
- struct rte_device *dev;
- const char *busname = "";
-
- memset(&uevent, 0, sizeof(struct rte_dev_event));
- memset(buf, 0, EAL_UEV_MSG_LEN);
-
- ret = recv(intr_handle.fd, buf, EAL_UEV_MSG_LEN, MSG_DONTWAIT);
- if (ret < 0 && errno == EAGAIN)
- return;
- else if (ret <= 0) {
- /* connection is closed or broken, cannot be brought up again */
- RTE_LOG(ERR, EAL, "uevent socket connection is broken.\n");
- rte_eal_alarm_set(1, dev_delayed_unregister, NULL);
- return;
- }
-
- ret = dev_uev_parse(buf, &uevent, EAL_UEV_MSG_LEN);
- if (ret < 0) {
- RTE_LOG(DEBUG, EAL, "It is not an valid event "
- "that need to be handle.\n");
- return;
- }
-
- RTE_LOG(DEBUG, EAL, "receive uevent(name:%s, type:%d, subsystem:%d)\n",
- uevent.devname, uevent.type, uevent.subsystem);
-
- switch (uevent.subsystem) {
- case EAL_DEV_EVENT_SUBSYSTEM_PCI:
- case EAL_DEV_EVENT_SUBSYSTEM_UIO:
- busname = "pci";
- break;
- default:
- break;
- }
-
- if (uevent.devname) {
- if (uevent.type == RTE_DEV_EVENT_REMOVE && hotplug_handle) {
- rte_spinlock_lock(&failure_handle_lock);
- bus = rte_bus_find_by_name(busname);
- if (bus == NULL) {
- RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n",
- busname);
- goto failure_handle_err;
- }
-
- dev = bus->find_device(NULL, cmp_dev_name,
- uevent.devname);
- if (dev == NULL) {
- RTE_LOG(ERR, EAL, "Cannot find device (%s) on "
- "bus (%s)\n", uevent.devname, busname);
- goto failure_handle_err;
- }
-
- ret = bus->hot_unplug_handler(dev);
- if (ret) {
- RTE_LOG(ERR, EAL, "Can not handle hot-unplug "
- "for device (%s)\n", dev->name);
- }
- rte_spinlock_unlock(&failure_handle_lock);
- }
- rte_dev_event_callback_process(uevent.devname, uevent.type);
- }
-
- return;
-
-failure_handle_err:
- rte_spinlock_unlock(&failure_handle_lock);
-}
-
-int
-rte_dev_event_monitor_start(void)
-{
- int ret;
-
- if (monitor_started)
- return 0;
-
- ret = dev_uev_socket_fd_create();
- if (ret) {
- RTE_LOG(ERR, EAL, "error create device event fd.\n");
- return -1;
- }
-
- intr_handle.type = RTE_INTR_HANDLE_DEV_EVENT;
- ret = rte_intr_callback_register(&intr_handle, dev_uev_handler, NULL);
-
- if (ret) {
- RTE_LOG(ERR, EAL, "fail to register uevent callback.\n");
- return -1;
- }
-
- monitor_started = true;
-
- return 0;
-}
-
-int
-rte_dev_event_monitor_stop(void)
-{
- int ret;
-
- if (!monitor_started)
- return 0;
-
- ret = rte_intr_callback_unregister(&intr_handle, dev_uev_handler,
- (void *)-1);
- if (ret < 0) {
- RTE_LOG(ERR, EAL, "fail to unregister uevent callback.\n");
- return ret;
- }
-
- close(intr_handle.fd);
- intr_handle.fd = -1;
- monitor_started = false;
-
- return 0;
-}
-
-int
-dev_sigbus_handler_register(void)
-{
- sigset_t mask;
- struct sigaction action;
-
- rte_errno = 0;
-
- if (sigbus_need_recover)
- return 0;
-
- sigemptyset(&mask);
- sigaddset(&mask, SIGBUS);
- action.sa_flags = SA_SIGINFO;
- action.sa_mask = mask;
- action.sa_sigaction = sigbus_handler;
- sigbus_need_recover = !sigaction(SIGBUS, &action, &sigbus_action_old);
-
- return rte_errno;
-}
-
-int
-dev_sigbus_handler_unregister(void)
-{
- rte_errno = 0;
-
- sigbus_action_recover();
-
- return rte_errno;
-}
-
-int
-rte_dev_hotplug_handle_enable(void)
-{
- int ret = 0;
-
- ret = dev_sigbus_handler_register();
- if (ret < 0)
- RTE_LOG(ERR, EAL,
- "fail to register sigbus handler for devices.\n");
-
- hotplug_handle = true;
-
- return ret;
-}
-
-int
-rte_dev_hotplug_handle_disable(void)
-{
- int ret = 0;
-
- ret = dev_sigbus_handler_unregister();
- if (ret < 0)
- RTE_LOG(ERR, EAL,
- "fail to unregister sigbus handler for devices.\n");
-
- hotplug_handle = false;
-
- return ret;
-}
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
- */
-
-#include <string.h>
-#include <sys/types.h>
-#include <sys/file.h>
-#include <dirent.h>
-#include <fcntl.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <fnmatch.h>
-#include <inttypes.h>
-#include <stdarg.h>
-#include <unistd.h>
-#include <errno.h>
-#include <sys/mman.h>
-#include <sys/queue.h>
-#include <sys/stat.h>
-
-#include <linux/mman.h> /* for hugetlb-related flags */
-
-#include <rte_memory.h>
-#include <rte_eal.h>
-#include <rte_launch.h>
-#include <rte_per_lcore.h>
-#include <rte_lcore.h>
-#include <rte_debug.h>
-#include <rte_log.h>
-#include <rte_common.h>
-#include "rte_string_fns.h"
-#include "eal_internal_cfg.h"
-#include "eal_hugepages.h"
-#include "eal_filesystem.h"
-
-static const char sys_dir_path[] = "/sys/kernel/mm/hugepages";
-static const char sys_pages_numa_dir_path[] = "/sys/devices/system/node";
-
-/*
- * Uses mmap to create a shared memory area for storage of data.
- * Used in this file to store the hugepage file map on disk.
- */
-static void *
-map_shared_memory(const char *filename, const size_t mem_size, int flags)
-{
- void *retval;
- int fd = open(filename, flags, 0600);
- if (fd < 0)
- return NULL;
- if (ftruncate(fd, mem_size) < 0) {
- close(fd);
- return NULL;
- }
- retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE,
- MAP_SHARED, fd, 0);
- close(fd);
- return retval;
-}
-
-static void *
-open_shared_memory(const char *filename, const size_t mem_size)
-{
- return map_shared_memory(filename, mem_size, O_RDWR);
-}
-
-static void *
-create_shared_memory(const char *filename, const size_t mem_size)
-{
- return map_shared_memory(filename, mem_size, O_RDWR | O_CREAT);
-}
-
-static int get_hp_sysfs_value(const char *subdir, const char *file, unsigned long *val)
-{
- char path[PATH_MAX];
-
- snprintf(path, sizeof(path), "%s/%s/%s",
- sys_dir_path, subdir, file);
- return eal_parse_sysfs_value(path, val);
-}
-
-/* this function is only called from eal_hugepage_info_init which itself
- * is only called from a primary process */
-static uint32_t
-get_num_hugepages(const char *subdir)
-{
- unsigned long resv_pages, num_pages, over_pages, surplus_pages;
- const char *nr_hp_file = "free_hugepages";
- const char *nr_rsvd_file = "resv_hugepages";
- const char *nr_over_file = "nr_overcommit_hugepages";
- const char *nr_splus_file = "surplus_hugepages";
-
- /* first, check how many reserved pages kernel reports */
- if (get_hp_sysfs_value(subdir, nr_rsvd_file, &resv_pages) < 0)
- return 0;
-
- if (get_hp_sysfs_value(subdir, nr_hp_file, &num_pages) < 0)
- return 0;
-
- if (get_hp_sysfs_value(subdir, nr_over_file, &over_pages) < 0)
- over_pages = 0;
-
- if (get_hp_sysfs_value(subdir, nr_splus_file, &surplus_pages) < 0)
- surplus_pages = 0;
-
- /* adjust num_pages */
- if (num_pages >= resv_pages)
- num_pages -= resv_pages;
- else if (resv_pages)
- num_pages = 0;
-
- if (over_pages >= surplus_pages)
- over_pages -= surplus_pages;
- else
- over_pages = 0;
-
- if (num_pages == 0 && over_pages == 0)
- RTE_LOG(WARNING, EAL, "No available hugepages reported in %s\n",
- subdir);
-
- num_pages += over_pages;
- if (num_pages < over_pages) /* overflow */
- num_pages = UINT32_MAX;
-
- /* we want to return a uint32_t and more than this looks suspicious
- * anyway ... */
- if (num_pages > UINT32_MAX)
- num_pages = UINT32_MAX;
-
- return num_pages;
-}
-
-static uint32_t
-get_num_hugepages_on_node(const char *subdir, unsigned int socket)
-{
- char path[PATH_MAX], socketpath[PATH_MAX];
- DIR *socketdir;
- unsigned long num_pages = 0;
- const char *nr_hp_file = "free_hugepages";
-
- snprintf(socketpath, sizeof(socketpath), "%s/node%u/hugepages",
- sys_pages_numa_dir_path, socket);
-
- socketdir = opendir(socketpath);
- if (socketdir) {
- /* Keep calm and carry on */
- closedir(socketdir);
- } else {
- /* Can't find socket dir, so ignore it */
- return 0;
- }
-
- snprintf(path, sizeof(path), "%s/%s/%s",
- socketpath, subdir, nr_hp_file);
- if (eal_parse_sysfs_value(path, &num_pages) < 0)
- return 0;
-
- if (num_pages == 0)
- RTE_LOG(WARNING, EAL, "No free hugepages reported in %s\n",
- subdir);
-
- /*
- * we want to return a uint32_t and more than this looks suspicious
- * anyway ...
- */
- if (num_pages > UINT32_MAX)
- num_pages = UINT32_MAX;
-
- return num_pages;
-}
-
-static uint64_t
-get_default_hp_size(void)
-{
- const char proc_meminfo[] = "/proc/meminfo";
- const char str_hugepagesz[] = "Hugepagesize:";
- unsigned hugepagesz_len = sizeof(str_hugepagesz) - 1;
- char buffer[256];
- unsigned long long size = 0;
-
- FILE *fd = fopen(proc_meminfo, "r");
- if (fd == NULL)
- rte_panic("Cannot open %s\n", proc_meminfo);
- while(fgets(buffer, sizeof(buffer), fd)){
- if (strncmp(buffer, str_hugepagesz, hugepagesz_len) == 0){
- size = rte_str_to_size(&buffer[hugepagesz_len]);
- break;
- }
- }
- fclose(fd);
- if (size == 0)
- rte_panic("Cannot get default hugepage size from %s\n", proc_meminfo);
- return size;
-}
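/*
 * Example (editorial): the /proc/meminfo line being matched typically
 * looks like
 *
 *	Hugepagesize:       2048 kB
 *
 * which rte_str_to_size() converts to 2097152 bytes.
 */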
-
-static int
-get_hugepage_dir(uint64_t hugepage_sz, char *hugedir, int len)
-{
- enum proc_mount_fieldnames {
- DEVICE = 0,
- MOUNTPT,
- FSTYPE,
- OPTIONS,
- _FIELDNAME_MAX
- };
- static uint64_t default_size = 0;
- const char proc_mounts[] = "/proc/mounts";
- const char hugetlbfs_str[] = "hugetlbfs";
- const size_t htlbfs_str_len = sizeof(hugetlbfs_str) - 1;
- const char pagesize_opt[] = "pagesize=";
- const size_t pagesize_opt_len = sizeof(pagesize_opt) - 1;
- const char split_tok = ' ';
- char *splitstr[_FIELDNAME_MAX];
- char buf[BUFSIZ];
- int retval = -1;
-
- FILE *fd = fopen(proc_mounts, "r");
- if (fd == NULL)
- rte_panic("Cannot open %s\n", proc_mounts);
-
- if (default_size == 0)
- default_size = get_default_hp_size();
-
- while (fgets(buf, sizeof(buf), fd)){
- if (rte_strsplit(buf, sizeof(buf), splitstr, _FIELDNAME_MAX,
- split_tok) != _FIELDNAME_MAX) {
- RTE_LOG(ERR, EAL, "Error parsing %s\n", proc_mounts);
- break; /* return NULL */
- }
-
- /* we have a specified --huge-dir option, only examine that dir */
- if (internal_config.hugepage_dir != NULL &&
- strcmp(splitstr[MOUNTPT], internal_config.hugepage_dir) != 0)
- continue;
-
- if (strncmp(splitstr[FSTYPE], hugetlbfs_str, htlbfs_str_len) == 0){
- const char *pagesz_str = strstr(splitstr[OPTIONS], pagesize_opt);
-
- /* if no explicit page size, the default page size is compared */
- if (pagesz_str == NULL){
- if (hugepage_sz == default_size){
- strlcpy(hugedir, splitstr[MOUNTPT], len);
- retval = 0;
- break;
- }
- }
- /* there is an explicit page size, so check it */
- else {
- uint64_t pagesz = rte_str_to_size(&pagesz_str[pagesize_opt_len]);
- if (pagesz == hugepage_sz) {
- strlcpy(hugedir, splitstr[MOUNTPT], len);
- retval = 0;
- break;
- }
- }
- } /* end if strncmp hugetlbfs */
- } /* end while fgets */
-
- fclose(fd);
- return retval;
-}
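/*
 * Example (editorial): a matching /proc/mounts entry looks like
 *
 *	nodev /mnt/huge hugetlbfs rw,relatime,pagesize=2M 0 0
 *
 * where DEVICE, MOUNTPT, FSTYPE and OPTIONS map to the first four
 * space-separated fields; the optional pagesize= option selects this
 * mount for a specific hugepage size (the mount point is hypothetical).
 */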
-
-/*
- * Clear the hugepage directory of whatever hugepage files
- * there are. Checks if the file is locked (i.e.
- * if it's in use by another DPDK process).
- */
-static int
-clear_hugedir(const char * hugedir)
-{
- DIR *dir;
- struct dirent *dirent;
- int dir_fd, fd, lck_result;
- const char filter[] = "*map_*"; /* matches hugepage files */
-
- /* open directory */
- dir = opendir(hugedir);
- if (!dir) {
- RTE_LOG(ERR, EAL, "Unable to open hugepage directory %s\n",
- hugedir);
- goto error;
- }
- dir_fd = dirfd(dir);
-
- dirent = readdir(dir);
- if (!dirent) {
- RTE_LOG(ERR, EAL, "Unable to read hugepage directory %s\n",
- hugedir);
- goto error;
- }
-
- while(dirent != NULL){
- /* skip files that don't match the hugepage pattern */
- if (fnmatch(filter, dirent->d_name, 0) > 0) {
- dirent = readdir(dir);
- continue;
- }
-
- /* try and lock the file */
- fd = openat(dir_fd, dirent->d_name, O_RDONLY);
-
- /* skip to next file */
- if (fd == -1) {
- dirent = readdir(dir);
- continue;
- }
-
- /* non-blocking lock */
- lck_result = flock(fd, LOCK_EX | LOCK_NB);
-
- /* if lock succeeds, remove the file */
- if (lck_result != -1)
- unlinkat(dir_fd, dirent->d_name, 0);
- close (fd);
- dirent = readdir(dir);
- }
-
- closedir(dir);
- return 0;
-
-error:
- if (dir)
- closedir(dir);
-
- RTE_LOG(ERR, EAL, "Error while clearing hugepage dir: %s\n",
- strerror(errno));
-
- return -1;
-}
-
-static int
-compare_hpi(const void *a, const void *b)
-{
- const struct hugepage_info *hpi_a = a;
- const struct hugepage_info *hpi_b = b;
-
- /* sort largest first; avoid truncating the uint64_t difference to int */
- if (hpi_b->hugepage_sz > hpi_a->hugepage_sz)
- return 1;
- if (hpi_b->hugepage_sz < hpi_a->hugepage_sz)
- return -1;
- return 0;
-}
-
-static void
-calc_num_pages(struct hugepage_info *hpi, struct dirent *dirent)
-{
- uint64_t total_pages = 0;
- unsigned int i;
-
- /*
- * first, try to put all hugepages into relevant sockets, but
- * if the first attempt fails, fall back to collecting all pages
- * in one socket and sorting them later
- */
- total_pages = 0;
- /* we also don't want to do this for legacy init */
- if (!internal_config.legacy_mem)
- for (i = 0; i < rte_socket_count(); i++) {
- int socket = rte_socket_id_by_idx(i);
- unsigned int num_pages =
- get_num_hugepages_on_node(
- dirent->d_name, socket);
- hpi->num_pages[socket] = num_pages;
- total_pages += num_pages;
- }
- /*
- * we failed to sort memory from the get go, so fall
- * back to old way
- */
- if (total_pages == 0) {
- hpi->num_pages[0] = get_num_hugepages(dirent->d_name);
-
-#ifndef RTE_ARCH_64
- /* for 32-bit systems, limit number of hugepages to
- * 1GB per page size */
- hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0],
- RTE_PGSIZE_1G / hpi->hugepage_sz);
-#endif
- }
-}
-
-static int
-hugepage_info_init(void)
-{
- const char dirent_start_text[] = "hugepages-";
- const size_t dirent_start_len = sizeof(dirent_start_text) - 1;
- unsigned int i, num_sizes = 0;
- DIR *dir;
- struct dirent *dirent;
-
- dir = opendir(sys_dir_path);
- if (dir == NULL) {
- RTE_LOG(ERR, EAL,
- "Cannot open directory %s to read system hugepage info\n",
- sys_dir_path);
- return -1;
- }
-
- for (dirent = readdir(dir); dirent != NULL; dirent = readdir(dir)) {
- struct hugepage_info *hpi;
-
- if (strncmp(dirent->d_name, dirent_start_text,
- dirent_start_len) != 0)
- continue;
-
- if (num_sizes >= MAX_HUGEPAGE_SIZES)
- break;
-
- hpi = &internal_config.hugepage_info[num_sizes];
- hpi->hugepage_sz =
- rte_str_to_size(&dirent->d_name[dirent_start_len]);
-
- /* first, check if we have a mountpoint */
- if (get_hugepage_dir(hpi->hugepage_sz,
- hpi->hugedir, sizeof(hpi->hugedir)) < 0) {
- uint32_t num_pages;
-
- num_pages = get_num_hugepages(dirent->d_name);
- if (num_pages > 0)
- RTE_LOG(NOTICE, EAL,
- "%" PRIu32 " hugepages of size "
- "%" PRIu64 " reserved, but no mounted "
- "hugetlbfs found for that size\n",
- num_pages, hpi->hugepage_sz);
- /* if we have kernel support for reserving hugepages
- * through mmap, and we're in in-memory mode, treat this
- * page size as valid. we cannot be in legacy mode at
- * this point because we've checked this earlier in the
- * init process.
- */
-#ifdef MAP_HUGE_SHIFT
- if (internal_config.in_memory) {
- RTE_LOG(DEBUG, EAL, "In-memory mode enabled, "
- "hugepages of size %" PRIu64 " bytes "
- "will be allocated anonymously\n",
- hpi->hugepage_sz);
- calc_num_pages(hpi, dirent);
- num_sizes++;
- }
-#endif
- continue;
- }
-
- /* try to obtain an exclusive lock on the hugepage directory */
- hpi->lock_descriptor = open(hpi->hugedir, O_RDONLY);
-
- /* if blocking lock failed */
- if (flock(hpi->lock_descriptor, LOCK_EX) == -1) {
- RTE_LOG(CRIT, EAL,
- "Failed to lock hugepage directory!\n");
- break;
- }
- /* clear out the hugepages dir from unused pages */
- if (clear_hugedir(hpi->hugedir) == -1)
- break;
-
- calc_num_pages(hpi, dirent);
-
- num_sizes++;
- }
- closedir(dir);
-
- /* something went wrong, and we broke from the for loop above */
- if (dirent != NULL)
- return -1;
-
- internal_config.num_hugepage_sizes = num_sizes;
-
- /* sort the page directory entries by size, largest to smallest */
- qsort(&internal_config.hugepage_info[0], num_sizes,
- sizeof(internal_config.hugepage_info[0]), compare_hpi);
-
- /* now we have all info, check we have at least one valid size */
- for (i = 0; i < num_sizes; i++) {
- /* pages may no longer all be on socket 0, so check all */
- unsigned int j, num_pages = 0;
- struct hugepage_info *hpi = &internal_config.hugepage_info[i];
-
- for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
- num_pages += hpi->num_pages[j];
- if (num_pages > 0)
- return 0;
- }
-
- /* no valid hugepage mounts available, return error */
- return -1;
-}
-
-/*
- * when we initialize the hugepage info, everything goes
- * to socket 0 by default. it will later get sorted by memory
- * initialization procedure.
- */
-int
-eal_hugepage_info_init(void)
-{
- struct hugepage_info *hpi, *tmp_hpi;
- unsigned int i;
-
- if (hugepage_info_init() < 0)
- return -1;
-
- /* for no shared files mode, we're done */
- if (internal_config.no_shconf)
- return 0;
-
- hpi = &internal_config.hugepage_info[0];
-
- tmp_hpi = create_shared_memory(eal_hugepage_info_path(),
- sizeof(internal_config.hugepage_info));
- if (tmp_hpi == NULL) {
- RTE_LOG(ERR, EAL, "Failed to create shared memory!\n");
- return -1;
- }
-
- memcpy(tmp_hpi, hpi, sizeof(internal_config.hugepage_info));
-
- /* we've copied file descriptors along with everything else, but they
- * will be invalid in secondary process, so overwrite them
- */
- for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) {
- struct hugepage_info *tmp = &tmp_hpi[i];
- tmp->lock_descriptor = -1;
- }
-
- if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) {
- RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n");
- return -1;
- }
- return 0;
-}
-
-int eal_hugepage_info_read(void)
-{
- struct hugepage_info *hpi = &internal_config.hugepage_info[0];
- struct hugepage_info *tmp_hpi;
-
- tmp_hpi = open_shared_memory(eal_hugepage_info_path(),
- sizeof(internal_config.hugepage_info));
- if (tmp_hpi == NULL) {
- RTE_LOG(ERR, EAL, "Failed to open shared memory!\n");
- return -1;
- }
-
- memcpy(hpi, tmp_hpi, sizeof(internal_config.hugepage_info));
-
- if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) {
- RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n");
- return -1;
- }
- return 0;
-}
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
- */
-
-#include <stdio.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <pthread.h>
-#include <sys/queue.h>
-#include <stdarg.h>
-#include <unistd.h>
-#include <string.h>
-#include <errno.h>
-#include <inttypes.h>
-#include <sys/epoll.h>
-#include <sys/signalfd.h>
-#include <sys/ioctl.h>
-#include <sys/eventfd.h>
-#include <assert.h>
-#include <stdbool.h>
-
-#include <rte_common.h>
-#include <rte_interrupts.h>
-#include <rte_memory.h>
-#include <rte_launch.h>
-#include <rte_eal.h>
-#include <rte_per_lcore.h>
-#include <rte_lcore.h>
-#include <rte_atomic.h>
-#include <rte_branch_prediction.h>
-#include <rte_debug.h>
-#include <rte_log.h>
-#include <rte_errno.h>
-#include <rte_spinlock.h>
-#include <rte_pause.h>
-#include <rte_vfio.h>
-
-#include "eal_private.h"
-#include "eal_vfio.h"
-#include "eal_thread.h"
-
-#define EAL_INTR_EPOLL_WAIT_FOREVER (-1)
-#define NB_OTHER_INTR 1
-
-static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */
-
-/**
- * union for pipe fds.
- */
-union intr_pipefds{
- struct {
- int pipefd[2];
- };
- struct {
- int readfd;
- int writefd;
- };
-};
-
-/**
- * union buffer for reading on different devices
- */
-union rte_intr_read_buffer {
- int uio_intr_count; /* for uio device */
-#ifdef VFIO_PRESENT
- uint64_t vfio_intr_count; /* for vfio device */
-#endif
- uint64_t timerfd_num; /* for timerfd */
- char charbuf[16]; /* for others */
-};
-
-TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback);
-TAILQ_HEAD(rte_intr_source_list, rte_intr_source);
-
-struct rte_intr_callback {
- TAILQ_ENTRY(rte_intr_callback) next;
- rte_intr_callback_fn cb_fn; /**< callback address */
- void *cb_arg; /**< parameter for callback */
- uint8_t pending_delete; /**< delete after callback is called */
- rte_intr_unregister_callback_fn ucb_fn; /**< fn to call before cb is deleted */
-};
-
-struct rte_intr_source {
- TAILQ_ENTRY(rte_intr_source) next;
- struct rte_intr_handle intr_handle; /**< interrupt handle */
- struct rte_intr_cb_list callbacks; /**< user callbacks */
- uint32_t active;
-};
-
-/* global spinlock for interrupt data operation */
-static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER;
-
-/* union buffer for pipe read/write */
-static union intr_pipefds intr_pipe;
-
-/* interrupt sources list */
-static struct rte_intr_source_list intr_sources;
-
-/* interrupt handling thread */
-static pthread_t intr_thread;
-
-/* VFIO interrupts */
-#ifdef VFIO_PRESENT
-
-#define IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + sizeof(int))
-/* irq set buffer length for queue interrupts and LSC interrupt */
-#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
- sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1))
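/*
 * Editorial note: struct vfio_irq_set ends in a flexible "data[]" member,
 * so the buffers above reserve extra room for the eventfd(s) that follow
 * the fixed header; argsz tells the kernel how much of the buffer is
 * actually used.
 */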
-
-/* enable legacy (INTx) interrupts */
-static int
-vfio_enable_intx(const struct rte_intr_handle *intr_handle) {
- struct vfio_irq_set *irq_set;
- char irq_set_buf[IRQ_SET_BUF_LEN];
- int len, ret;
- int *fd_ptr;
-
- len = sizeof(irq_set_buf);
-
- /* enable INTx */
- irq_set = (struct vfio_irq_set *) irq_set_buf;
- irq_set->argsz = len;
- irq_set->count = 1;
- irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
- irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
- irq_set->start = 0;
- fd_ptr = (int *) &irq_set->data;
- *fd_ptr = intr_handle->fd;
-
- ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
- if (ret) {
- RTE_LOG(ERR, EAL, "Error enabling INTx interrupts for fd %d\n",
- intr_handle->fd);
- return -1;
- }
-
- /* unmask INTx after enabling */
- memset(irq_set, 0, len);
- len = sizeof(struct vfio_irq_set);
- irq_set->argsz = len;
- irq_set->count = 1;
- irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
- irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
- irq_set->start = 0;
-
- ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
- if (ret) {
- RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
- intr_handle->fd);
- return -1;
- }
- return 0;
-}
-
-/* disable legacy (INTx) interrupts */
-static int
-vfio_disable_intx(const struct rte_intr_handle *intr_handle) {
- struct vfio_irq_set *irq_set;
- char irq_set_buf[IRQ_SET_BUF_LEN];
- int len, ret;
-
- len = sizeof(struct vfio_irq_set);
-
- /* mask interrupts before disabling */
- irq_set = (struct vfio_irq_set *) irq_set_buf;
- irq_set->argsz = len;
- irq_set->count = 1;
- irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
- irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
- irq_set->start = 0;
-
- ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
- if (ret) {
- RTE_LOG(ERR, EAL, "Error masking INTx interrupts for fd %d\n",
- intr_handle->fd);
- return -1;
- }
-
- /* disable INTx*/
- memset(irq_set, 0, len);
- irq_set->argsz = len;
- irq_set->count = 0;
- irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
- irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
- irq_set->start = 0;
-
- ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
- if (ret) {
- RTE_LOG(ERR, EAL,
- "Error disabling INTx interrupts for fd %d\n", intr_handle->fd);
- return -1;
- }
- return 0;
-}
-
-/* unmask/ack legacy (INTx) interrupts */
-static int
-vfio_ack_intx(const struct rte_intr_handle *intr_handle)
-{
- struct vfio_irq_set irq_set;
-
- /* unmask INTx */
- memset(&irq_set, 0, sizeof(irq_set));
- irq_set.argsz = sizeof(irq_set);
- irq_set.count = 1;
- irq_set.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
- irq_set.index = VFIO_PCI_INTX_IRQ_INDEX;
- irq_set.start = 0;
-
- if (ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, &irq_set)) {
- RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
- intr_handle->fd);
- return -1;
- }
- return 0;
-}
-
-/* enable MSI interrupts */
-static int
-vfio_enable_msi(const struct rte_intr_handle *intr_handle)
-{
- int len, ret;
- char irq_set_buf[IRQ_SET_BUF_LEN];
- struct vfio_irq_set *irq_set;
- int *fd_ptr;
-
- len = sizeof(irq_set_buf);
-
- irq_set = (struct vfio_irq_set *) irq_set_buf;
- irq_set->argsz = len;
- irq_set->count = 1;
- irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
- irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
- irq_set->start = 0;
- fd_ptr = (int *) &irq_set->data;
- *fd_ptr = intr_handle->fd;
-
- ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
- if (ret) {
- RTE_LOG(ERR, EAL, "Error enabling MSI interrupts for fd %d\n",
- intr_handle->fd);
- return -1;
- }
- return 0;
-}
-
-/* disable MSI interrupts */
-static int
-vfio_disable_msi(const struct rte_intr_handle *intr_handle)
-{
- struct vfio_irq_set *irq_set;
- char irq_set_buf[IRQ_SET_BUF_LEN];
- int len, ret;
-
- len = sizeof(struct vfio_irq_set);
-
- irq_set = (struct vfio_irq_set *) irq_set_buf;
- irq_set->argsz = len;
- irq_set->count = 0;
- irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
- irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
- irq_set->start = 0;
-
- ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
- if (ret)
- RTE_LOG(ERR, EAL,
- "Error disabling MSI interrupts for fd %d\n", intr_handle->fd);
-
- return ret;
-}
-
-/* enable MSI-X interrupts */
-static int
-vfio_enable_msix(const struct rte_intr_handle *intr_handle)
-{
- int len, ret;
- char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
- struct vfio_irq_set *irq_set;
- int *fd_ptr;
-
- len = sizeof(irq_set_buf);
-
- irq_set = (struct vfio_irq_set *) irq_set_buf;
- irq_set->argsz = len;
- /* 0 < irq_set->count < RTE_MAX_RXTX_INTR_VEC_ID + 1 */
- irq_set->count = intr_handle->max_intr ?
- (intr_handle->max_intr > RTE_MAX_RXTX_INTR_VEC_ID + 1 ?
- RTE_MAX_RXTX_INTR_VEC_ID + 1 : intr_handle->max_intr) : 1;
- irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
- irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
- irq_set->start = 0;
- fd_ptr = (int *) &irq_set->data;
-	/* interrupt vector offset 0 is reserved for the non-efd mapping */
- fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = intr_handle->fd;
- memcpy(&fd_ptr[RTE_INTR_VEC_RXTX_OFFSET], intr_handle->efds,
- sizeof(*intr_handle->efds) * intr_handle->nb_efd);
-
- ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
- if (ret) {
- RTE_LOG(ERR, EAL, "Error enabling MSI-X interrupts for fd %d\n",
- intr_handle->fd);
- return -1;
- }
-
- return 0;
-}
-
-/* disable MSI-X interrupts */
-static int
-vfio_disable_msix(const struct rte_intr_handle *intr_handle)
-{
- struct vfio_irq_set *irq_set;
- char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
- int len, ret;
-
- len = sizeof(struct vfio_irq_set);
-
- irq_set = (struct vfio_irq_set *) irq_set_buf;
- irq_set->argsz = len;
- irq_set->count = 0;
- irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
- irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
- irq_set->start = 0;
-
- ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
- if (ret)
- RTE_LOG(ERR, EAL,
- "Error disabling MSI-X interrupts for fd %d\n", intr_handle->fd);
-
- return ret;
-}
-
-#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
-/* enable req notifier */
-static int
-vfio_enable_req(const struct rte_intr_handle *intr_handle)
-{
- int len, ret;
- char irq_set_buf[IRQ_SET_BUF_LEN];
- struct vfio_irq_set *irq_set;
- int *fd_ptr;
-
- len = sizeof(irq_set_buf);
-
- irq_set = (struct vfio_irq_set *) irq_set_buf;
- irq_set->argsz = len;
- irq_set->count = 1;
- irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
- VFIO_IRQ_SET_ACTION_TRIGGER;
- irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
- irq_set->start = 0;
- fd_ptr = (int *) &irq_set->data;
- *fd_ptr = intr_handle->fd;
-
- ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
- if (ret) {
- RTE_LOG(ERR, EAL, "Error enabling req interrupts for fd %d\n",
- intr_handle->fd);
- return -1;
- }
-
- return 0;
-}
-
-/* disable req notifier */
-static int
-vfio_disable_req(const struct rte_intr_handle *intr_handle)
-{
- struct vfio_irq_set *irq_set;
- char irq_set_buf[IRQ_SET_BUF_LEN];
- int len, ret;
-
- len = sizeof(struct vfio_irq_set);
-
- irq_set = (struct vfio_irq_set *) irq_set_buf;
- irq_set->argsz = len;
- irq_set->count = 0;
- irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
- irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
- irq_set->start = 0;
-
- ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
- if (ret)
- RTE_LOG(ERR, EAL, "Error disabling req interrupts for fd %d\n",
- intr_handle->fd);
-
- return ret;
-}
-#endif
-#endif
-
-static int
-uio_intx_intr_disable(const struct rte_intr_handle *intr_handle)
-{
- unsigned char command_high;
-
- /* use UIO config file descriptor for uio_pci_generic */
- if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
- RTE_LOG(ERR, EAL,
- "Error reading interrupts status for fd %d\n",
- intr_handle->uio_cfg_fd);
- return -1;
- }
- /* disable interrupts */
- command_high |= 0x4;
- if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
- RTE_LOG(ERR, EAL,
- "Error disabling interrupts for fd %d\n",
- intr_handle->uio_cfg_fd);
- return -1;
- }
-
- return 0;
-}
-
-static int
-uio_intx_intr_enable(const struct rte_intr_handle *intr_handle)
-{
- unsigned char command_high;
-
- /* use UIO config file descriptor for uio_pci_generic */
- if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
- RTE_LOG(ERR, EAL,
- "Error reading interrupts status for fd %d\n",
- intr_handle->uio_cfg_fd);
- return -1;
- }
- /* enable interrupts */
- command_high &= ~0x4;
- if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
- RTE_LOG(ERR, EAL,
- "Error enabling interrupts for fd %d\n",
- intr_handle->uio_cfg_fd);
- return -1;
- }
-
- return 0;
-}
-
-static int
-uio_intr_disable(const struct rte_intr_handle *intr_handle)
-{
- const int value = 0;
-
- if (write(intr_handle->fd, &value, sizeof(value)) < 0) {
- RTE_LOG(ERR, EAL,
- "Error disabling interrupts for fd %d (%s)\n",
- intr_handle->fd, strerror(errno));
- return -1;
- }
- return 0;
-}
-
-static int
-uio_intr_enable(const struct rte_intr_handle *intr_handle)
-{
- const int value = 1;
-
- if (write(intr_handle->fd, &value, sizeof(value)) < 0) {
- RTE_LOG(ERR, EAL,
- "Error enabling interrupts for fd %d (%s)\n",
- intr_handle->fd, strerror(errno));
- return -1;
- }
- return 0;
-}
-
-int
-rte_intr_callback_register(const struct rte_intr_handle *intr_handle,
- rte_intr_callback_fn cb, void *cb_arg)
-{
- int ret, wake_thread;
- struct rte_intr_source *src;
- struct rte_intr_callback *callback;
-
- wake_thread = 0;
-
- /* first do parameter checking */
- if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) {
- RTE_LOG(ERR, EAL,
- "Registering with invalid input parameter\n");
- return -EINVAL;
- }
-
- /* allocate a new interrupt callback entity */
- callback = calloc(1, sizeof(*callback));
- if (callback == NULL) {
-		RTE_LOG(ERR, EAL, "Cannot allocate memory\n");
- return -ENOMEM;
- }
- callback->cb_fn = cb;
- callback->cb_arg = cb_arg;
- callback->pending_delete = 0;
- callback->ucb_fn = NULL;
-
- rte_spinlock_lock(&intr_lock);
-
- /* check if there is at least one callback registered for the fd */
- TAILQ_FOREACH(src, &intr_sources, next) {
- if (src->intr_handle.fd == intr_handle->fd) {
-			/* this source had no callbacks registered before */
- if (TAILQ_EMPTY(&src->callbacks))
- wake_thread = 1;
-
- TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
- ret = 0;
- break;
- }
- }
-
- /* no existing callbacks for this - add new source */
- if (src == NULL) {
- src = calloc(1, sizeof(*src));
- if (src == NULL) {
-			RTE_LOG(ERR, EAL, "Cannot allocate memory\n");
- free(callback);
- ret = -ENOMEM;
- } else {
- src->intr_handle = *intr_handle;
- TAILQ_INIT(&src->callbacks);
- TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
- TAILQ_INSERT_TAIL(&intr_sources, src, next);
- wake_thread = 1;
- ret = 0;
- }
- }
-
- rte_spinlock_unlock(&intr_lock);
-
- /**
-	 * check whether we need to notify the pipe fd that epoll_wait
-	 * waits on, so that the wait list is rebuilt.
- */
- if (wake_thread)
- if (write(intr_pipe.writefd, "1", 1) < 0)
- return -EPIPE;
-
- return ret;
-}
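-
-/*
- * Illustrative sketch, not part of the original file: typical driver usage of
- * the registration API above and the matching unregister call below. The
- * device struct and helper names are hypothetical; error handling is
- * abbreviated.
- */
-#if 0 /* illustrative only */
-static void
-example_intr_cb(void *arg)
-{
-	struct example_dev *dev = arg;	/* hypothetical device struct */
-
-	example_service_device(dev);	/* hypothetical helper */
-}
-
-static int
-example_setup(struct example_dev *dev)
-{
-	/* cb will run in the eal-intr-thread whenever the fd fires */
-	return rte_intr_callback_register(&dev->intr_handle,
-			example_intr_cb, dev);
-}
-
-static int
-example_teardown(struct example_dev *dev)
-{
-	/* -EAGAIN means the callback is executing right now; retry,
-	 * or use rte_intr_callback_unregister_pending() instead
-	 */
-	return rte_intr_callback_unregister(&dev->intr_handle,
-			example_intr_cb, dev);
-}
-#endif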
-
-int
-rte_intr_callback_unregister_pending(const struct rte_intr_handle *intr_handle,
- rte_intr_callback_fn cb_fn, void *cb_arg,
- rte_intr_unregister_callback_fn ucb_fn)
-{
- int ret;
- struct rte_intr_source *src;
- struct rte_intr_callback *cb, *next;
-
- /* do parameter checking first */
- if (intr_handle == NULL || intr_handle->fd < 0) {
- RTE_LOG(ERR, EAL,
- "Unregistering with invalid input parameter\n");
- return -EINVAL;
- }
-
- rte_spinlock_lock(&intr_lock);
-
-	/* check whether an interrupt source exists for this fd */
- TAILQ_FOREACH(src, &intr_sources, next)
- if (src->intr_handle.fd == intr_handle->fd)
- break;
-
- /* No interrupt source registered for the fd */
- if (src == NULL) {
- ret = -ENOENT;
-
- /* only usable if the source is active */
- } else if (src->active == 0) {
- ret = -EAGAIN;
-
- } else {
- ret = 0;
-
- /* walk through the callbacks and mark all that match. */
- for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
- next = TAILQ_NEXT(cb, next);
- if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
- cb->cb_arg == cb_arg)) {
- cb->pending_delete = 1;
- cb->ucb_fn = ucb_fn;
- ret++;
- }
- }
- }
-
- rte_spinlock_unlock(&intr_lock);
-
- return ret;
-}
-
-int
-rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle,
- rte_intr_callback_fn cb_fn, void *cb_arg)
-{
- int ret;
- struct rte_intr_source *src;
- struct rte_intr_callback *cb, *next;
-
- /* do parameter checking first */
- if (intr_handle == NULL || intr_handle->fd < 0) {
- RTE_LOG(ERR, EAL,
- "Unregistering with invalid input parameter\n");
- return -EINVAL;
- }
-
- rte_spinlock_lock(&intr_lock);
-
-	/* check whether an interrupt source exists for this fd */
- TAILQ_FOREACH(src, &intr_sources, next)
- if (src->intr_handle.fd == intr_handle->fd)
- break;
-
- /* No interrupt source registered for the fd */
- if (src == NULL) {
- ret = -ENOENT;
-
- /* interrupt source has some active callbacks right now. */
- } else if (src->active != 0) {
- ret = -EAGAIN;
-
- /* ok to remove. */
- } else {
- ret = 0;
-
-		/* walk through the callbacks and remove all that match. */
- for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
-
- next = TAILQ_NEXT(cb, next);
-
- if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
- cb->cb_arg == cb_arg)) {
- TAILQ_REMOVE(&src->callbacks, cb, next);
- free(cb);
- ret++;
- }
- }
-
- /* all callbacks for that source are removed. */
- if (TAILQ_EMPTY(&src->callbacks)) {
- TAILQ_REMOVE(&intr_sources, src, next);
- free(src);
- }
- }
-
- rte_spinlock_unlock(&intr_lock);
-
-	/* notify the pipe fd that epoll_wait waits on, to rebuild the wait list */
- if (ret >= 0 && write(intr_pipe.writefd, "1", 1) < 0) {
- ret = -EPIPE;
- }
-
- return ret;
-}
-
-int
-rte_intr_enable(const struct rte_intr_handle *intr_handle)
-{
- if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV)
- return 0;
-
- if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
- return -1;
-
-	switch (intr_handle->type) {
- /* write to the uio fd to enable the interrupt */
- case RTE_INTR_HANDLE_UIO:
- if (uio_intr_enable(intr_handle))
- return -1;
- break;
- case RTE_INTR_HANDLE_UIO_INTX:
- if (uio_intx_intr_enable(intr_handle))
- return -1;
- break;
- /* not used at this moment */
- case RTE_INTR_HANDLE_ALARM:
- return -1;
-#ifdef VFIO_PRESENT
- case RTE_INTR_HANDLE_VFIO_MSIX:
- if (vfio_enable_msix(intr_handle))
- return -1;
- break;
- case RTE_INTR_HANDLE_VFIO_MSI:
- if (vfio_enable_msi(intr_handle))
- return -1;
- break;
- case RTE_INTR_HANDLE_VFIO_LEGACY:
- if (vfio_enable_intx(intr_handle))
- return -1;
- break;
-#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
- case RTE_INTR_HANDLE_VFIO_REQ:
- if (vfio_enable_req(intr_handle))
- return -1;
- break;
-#endif
-#endif
- /* not used at this moment */
- case RTE_INTR_HANDLE_DEV_EVENT:
- return -1;
- /* unknown handle type */
- default:
- RTE_LOG(ERR, EAL,
- "Unknown handle type of fd %d\n",
- intr_handle->fd);
- return -1;
- }
-
- return 0;
-}
-
-/**
- * PMD generally calls this function at the end of its IRQ callback.
- * Internally, it unmasks the interrupt if possible.
- *
- * For INTx, unmasking is required as the interrupt is auto-masked prior to
- * invoking callback.
- *
- * For MSI/MSI-X, unmasking is typically not needed as the interrupt is not
- * auto-masked. In fact, for the VFIO_MSIX and VFIO_MSI interrupt handle
- * types, this function is a no-op.
- */
-int
-rte_intr_ack(const struct rte_intr_handle *intr_handle)
-{
- if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV)
- return 0;
-
- if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
- return -1;
-
- switch (intr_handle->type) {
- /* Both acking and enabling are same for UIO */
- case RTE_INTR_HANDLE_UIO:
- if (uio_intr_enable(intr_handle))
- return -1;
- break;
- case RTE_INTR_HANDLE_UIO_INTX:
- if (uio_intx_intr_enable(intr_handle))
- return -1;
- break;
- /* not used at this moment */
- case RTE_INTR_HANDLE_ALARM:
- return -1;
-#ifdef VFIO_PRESENT
- /* VFIO MSI* is implicitly acked unlike INTx, nothing to do */
- case RTE_INTR_HANDLE_VFIO_MSIX:
- case RTE_INTR_HANDLE_VFIO_MSI:
- return 0;
- case RTE_INTR_HANDLE_VFIO_LEGACY:
- if (vfio_ack_intx(intr_handle))
- return -1;
- break;
-#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
- case RTE_INTR_HANDLE_VFIO_REQ:
- return -1;
-#endif
-#endif
- /* not used at this moment */
- case RTE_INTR_HANDLE_DEV_EVENT:
- return -1;
- /* unknown handle type */
- default:
- RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
- intr_handle->fd);
- return -1;
- }
-
- return 0;
-}
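-
-/*
- * Illustrative sketch, not part of the original file: the pattern described
- * in the comment above, as seen from a hypothetical PMD interrupt callback.
- * Device servicing is elided.
- */
-#if 0 /* illustrative only */
-static void
-example_lsc_handler(void *arg)
-{
-	const struct rte_intr_handle *ih = arg;
-
-	/* device-specific work: read/clear cause register, update link */
-
-	/* required for INTx, which is auto-masked before the callback
-	 * runs; a no-op for VFIO MSI/MSI-X
-	 */
-	rte_intr_ack(ih);
-}
-#endif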
-
-int
-rte_intr_disable(const struct rte_intr_handle *intr_handle)
-{
- if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV)
- return 0;
-
- if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
- return -1;
-
-	switch (intr_handle->type) {
- /* write to the uio fd to disable the interrupt */
- case RTE_INTR_HANDLE_UIO:
- if (uio_intr_disable(intr_handle))
- return -1;
- break;
- case RTE_INTR_HANDLE_UIO_INTX:
- if (uio_intx_intr_disable(intr_handle))
- return -1;
- break;
- /* not used at this moment */
- case RTE_INTR_HANDLE_ALARM:
- return -1;
-#ifdef VFIO_PRESENT
- case RTE_INTR_HANDLE_VFIO_MSIX:
- if (vfio_disable_msix(intr_handle))
- return -1;
- break;
- case RTE_INTR_HANDLE_VFIO_MSI:
- if (vfio_disable_msi(intr_handle))
- return -1;
- break;
- case RTE_INTR_HANDLE_VFIO_LEGACY:
- if (vfio_disable_intx(intr_handle))
- return -1;
- break;
-#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
- case RTE_INTR_HANDLE_VFIO_REQ:
- if (vfio_disable_req(intr_handle))
- return -1;
- break;
-#endif
-#endif
- /* not used at this moment */
- case RTE_INTR_HANDLE_DEV_EVENT:
- return -1;
- /* unknown handle type */
- default:
- RTE_LOG(ERR, EAL,
- "Unknown handle type of fd %d\n",
- intr_handle->fd);
- return -1;
- }
-
- return 0;
-}
-
-static int
-eal_intr_process_interrupts(struct epoll_event *events, int nfds)
-{
- bool call = false;
- int n, bytes_read, rv;
- struct rte_intr_source *src;
- struct rte_intr_callback *cb, *next;
- union rte_intr_read_buffer buf;
- struct rte_intr_callback active_cb;
-
- for (n = 0; n < nfds; n++) {
-
- /**
-		 * if the pipe fd is ready to read, return so that the
-		 * wait list is rebuilt.
- */
-		if (events[n].data.fd == intr_pipe.readfd) {
- int r = read(intr_pipe.readfd, buf.charbuf,
- sizeof(buf.charbuf));
- RTE_SET_USED(r);
- return -1;
- }
- rte_spinlock_lock(&intr_lock);
- TAILQ_FOREACH(src, &intr_sources, next)
- if (src->intr_handle.fd ==
- events[n].data.fd)
- break;
-		if (src == NULL) {
- rte_spinlock_unlock(&intr_lock);
- continue;
- }
-
- /* mark this interrupt source as active and release the lock. */
- src->active = 1;
- rte_spinlock_unlock(&intr_lock);
-
-		/* set the length to be read for each handle type */
- switch (src->intr_handle.type) {
- case RTE_INTR_HANDLE_UIO:
- case RTE_INTR_HANDLE_UIO_INTX:
- bytes_read = sizeof(buf.uio_intr_count);
- break;
- case RTE_INTR_HANDLE_ALARM:
- bytes_read = sizeof(buf.timerfd_num);
- break;
-#ifdef VFIO_PRESENT
- case RTE_INTR_HANDLE_VFIO_MSIX:
- case RTE_INTR_HANDLE_VFIO_MSI:
- case RTE_INTR_HANDLE_VFIO_LEGACY:
- bytes_read = sizeof(buf.vfio_intr_count);
- break;
-#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
- case RTE_INTR_HANDLE_VFIO_REQ:
- bytes_read = 0;
- call = true;
- break;
-#endif
-#endif
- case RTE_INTR_HANDLE_VDEV:
- case RTE_INTR_HANDLE_EXT:
- bytes_read = 0;
- call = true;
- break;
- case RTE_INTR_HANDLE_DEV_EVENT:
- bytes_read = 0;
- call = true;
- break;
- default:
- bytes_read = 1;
- break;
- }
-
- if (bytes_read > 0) {
- /**
- * read out to clear the ready-to-be-read flag
- * for epoll_wait.
- */
- bytes_read = read(events[n].data.fd, &buf, bytes_read);
- if (bytes_read < 0) {
- if (errno == EINTR || errno == EWOULDBLOCK)
- continue;
-
- RTE_LOG(ERR, EAL, "Error reading from file "
- "descriptor %d: %s\n",
- events[n].data.fd,
- strerror(errno));
- /*
- * The device is unplugged or buggy, remove
- * it as an interrupt source and return to
- * force the wait list to be rebuilt.
- */
- rte_spinlock_lock(&intr_lock);
- TAILQ_REMOVE(&intr_sources, src, next);
- rte_spinlock_unlock(&intr_lock);
-
- for (cb = TAILQ_FIRST(&src->callbacks); cb;
- cb = next) {
- next = TAILQ_NEXT(cb, next);
- TAILQ_REMOVE(&src->callbacks, cb, next);
- free(cb);
- }
- free(src);
- return -1;
- } else if (bytes_read == 0)
- RTE_LOG(ERR, EAL, "Read nothing from file "
- "descriptor %d\n", events[n].data.fd);
- else
- call = true;
- }
-
- /* grab a lock, again to call callbacks and update status. */
- rte_spinlock_lock(&intr_lock);
-
- if (call) {
-
- /* Finally, call all callbacks. */
- TAILQ_FOREACH(cb, &src->callbacks, next) {
-
- /* make a copy and unlock. */
- active_cb = *cb;
- rte_spinlock_unlock(&intr_lock);
-
- /* call the actual callback */
- active_cb.cb_fn(active_cb.cb_arg);
-
-			/* get the lock back. */
- rte_spinlock_lock(&intr_lock);
- }
- }
-		/* we are done with this interrupt source, release it. */
- src->active = 0;
-
- rv = 0;
-
-		/* check if any callbacks are supposed to be removed */
- for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
- next = TAILQ_NEXT(cb, next);
- if (cb->pending_delete) {
- TAILQ_REMOVE(&src->callbacks, cb, next);
- if (cb->ucb_fn)
- cb->ucb_fn(&src->intr_handle, cb->cb_arg);
- free(cb);
- rv++;
- }
- }
-
- /* all callbacks for that source are removed. */
- if (TAILQ_EMPTY(&src->callbacks)) {
- TAILQ_REMOVE(&intr_sources, src, next);
- free(src);
- }
-
-	/* notify the pipe fd that epoll_wait waits on, to rebuild the wait list */
- if (rv >= 0 && write(intr_pipe.writefd, "1", 1) < 0) {
- rte_spinlock_unlock(&intr_lock);
- return -EPIPE;
- }
-
- rte_spinlock_unlock(&intr_lock);
- }
-
- return 0;
-}
-
-/**
- * It handles all the interrupts.
- *
- * @param pfd
- * epoll file descriptor.
- * @param totalfds
- * The number of file descriptors added in epoll.
- *
- * @return
- * void
- */
-static void
-eal_intr_handle_interrupts(int pfd, unsigned totalfds)
-{
- struct epoll_event events[totalfds];
- int nfds = 0;
-
-	for (;;) {
- nfds = epoll_wait(pfd, events, totalfds,
- EAL_INTR_EPOLL_WAIT_FOREVER);
- /* epoll_wait fail */
- if (nfds < 0) {
- if (errno == EINTR)
- continue;
-			RTE_LOG(ERR, EAL,
-				"epoll_wait returned with error\n");
- return;
- }
-		/* epoll_wait timed out; this can never happen here */
- else if (nfds == 0)
- continue;
- /* epoll_wait has at least one fd ready to read */
- if (eal_intr_process_interrupts(events, nfds) < 0)
- return;
- }
-}
-
-/**
- * It builds/rebuilds up the epoll file descriptor with all the
- * file descriptors being waited on. Then handles the interrupts.
- *
- * @param arg
- * pointer. (unused)
- *
- * @return
- *  never returns
- */
-static __attribute__((noreturn)) void *
-eal_intr_thread_main(__rte_unused void *arg)
-{
- /* host thread, never break out */
- for (;;) {
- /* build up the epoll fd with all descriptors we are to
- * wait on then pass it to the handle_interrupts function
- */
- static struct epoll_event pipe_event = {
- .events = EPOLLIN | EPOLLPRI,
- };
- struct rte_intr_source *src;
- unsigned numfds = 0;
-
- /* create epoll fd */
- int pfd = epoll_create(1);
- if (pfd < 0)
- rte_panic("Cannot create epoll instance\n");
-
- pipe_event.data.fd = intr_pipe.readfd;
- /**
-		 * add the pipe fd to the wait list; this pipe is used
-		 * to signal that the wait list must be rebuilt.
- */
- if (epoll_ctl(pfd, EPOLL_CTL_ADD, intr_pipe.readfd,
- &pipe_event) < 0) {
-			rte_panic("Error adding fd %d to epoll, %s\n",
- intr_pipe.readfd, strerror(errno));
- }
- numfds++;
-
- rte_spinlock_lock(&intr_lock);
-
- TAILQ_FOREACH(src, &intr_sources, next) {
- struct epoll_event ev;
-
- if (src->callbacks.tqh_first == NULL)
- continue; /* skip those with no callbacks */
- memset(&ev, 0, sizeof(ev));
- ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
- ev.data.fd = src->intr_handle.fd;
-
- /**
-			 * add each interrupt source's file descriptor
-			 * to the wait list.
- */
- if (epoll_ctl(pfd, EPOLL_CTL_ADD,
-					src->intr_handle.fd, &ev) < 0) {
-				rte_panic("Error adding fd %d to epoll, %s\n",
- src->intr_handle.fd, strerror(errno));
-			} else
-				numfds++;
- }
- rte_spinlock_unlock(&intr_lock);
- /* serve the interrupt */
- eal_intr_handle_interrupts(pfd, numfds);
-
- /**
- * when we return, we need to rebuild the
- * list of fds to monitor.
- */
- close(pfd);
- }
-}
-
-int
-rte_eal_intr_init(void)
-{
- int ret = 0;
-
- /* init the global interrupt source head */
- TAILQ_INIT(&intr_sources);
-
- /**
-	 * create a pipe that epoll waits on; writing to it signals
-	 * that the epoll wait list must be rebuilt.
- */
- if (pipe(intr_pipe.pipefd) < 0) {
- rte_errno = errno;
- return -1;
- }
-
- /* create the host thread to wait/handle the interrupt */
- ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL,
- eal_intr_thread_main, NULL);
- if (ret != 0) {
- rte_errno = -ret;
- RTE_LOG(ERR, EAL,
- "Failed to create thread for interrupt handling\n");
- }
-
- return ret;
-}
-
-static void
-eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle)
-{
- union rte_intr_read_buffer buf;
- int bytes_read = 0;
- int nbytes;
-
- switch (intr_handle->type) {
- case RTE_INTR_HANDLE_UIO:
- case RTE_INTR_HANDLE_UIO_INTX:
- bytes_read = sizeof(buf.uio_intr_count);
- break;
-#ifdef VFIO_PRESENT
- case RTE_INTR_HANDLE_VFIO_MSIX:
- case RTE_INTR_HANDLE_VFIO_MSI:
- case RTE_INTR_HANDLE_VFIO_LEGACY:
- bytes_read = sizeof(buf.vfio_intr_count);
- break;
-#endif
- case RTE_INTR_HANDLE_VDEV:
- bytes_read = intr_handle->efd_counter_size;
- /* For vdev, number of bytes to read is set by driver */
- break;
- case RTE_INTR_HANDLE_EXT:
- return;
- default:
- bytes_read = 1;
- RTE_LOG(INFO, EAL, "unexpected intr type\n");
- break;
- }
-
- /**
- * read out to clear the ready-to-be-read flag
- * for epoll_wait.
- */
- if (bytes_read == 0)
- return;
- do {
- nbytes = read(fd, &buf, bytes_read);
- if (nbytes < 0) {
- if (errno == EINTR || errno == EWOULDBLOCK ||
- errno == EAGAIN)
- continue;
- RTE_LOG(ERR, EAL,
- "Error reading from fd %d: %s\n",
- fd, strerror(errno));
- } else if (nbytes == 0)
- RTE_LOG(ERR, EAL, "Read nothing from fd %d\n", fd);
- return;
- } while (1);
-}
-
-static int
-eal_epoll_process_event(struct epoll_event *evs, unsigned int n,
- struct rte_epoll_event *events)
-{
- unsigned int i, count = 0;
- struct rte_epoll_event *rev;
-
- for (i = 0; i < n; i++) {
- rev = evs[i].data.ptr;
- if (!rev || !rte_atomic32_cmpset(&rev->status, RTE_EPOLL_VALID,
- RTE_EPOLL_EXEC))
- continue;
-
- events[count].status = RTE_EPOLL_VALID;
- events[count].fd = rev->fd;
- events[count].epfd = rev->epfd;
- events[count].epdata.event = rev->epdata.event;
- events[count].epdata.data = rev->epdata.data;
- if (rev->epdata.cb_fun)
- rev->epdata.cb_fun(rev->fd,
- rev->epdata.cb_arg);
-
- rte_compiler_barrier();
- rev->status = RTE_EPOLL_VALID;
- count++;
- }
- return count;
-}
-
-static inline int
-eal_init_tls_epfd(void)
-{
- int pfd = epoll_create(255);
-
- if (pfd < 0) {
- RTE_LOG(ERR, EAL,
- "Cannot create epoll instance\n");
- return -1;
- }
- return pfd;
-}
-
-int
-rte_intr_tls_epfd(void)
-{
- if (RTE_PER_LCORE(_epfd) == -1)
- RTE_PER_LCORE(_epfd) = eal_init_tls_epfd();
-
- return RTE_PER_LCORE(_epfd);
-}
-
-int
-rte_epoll_wait(int epfd, struct rte_epoll_event *events,
- int maxevents, int timeout)
-{
- struct epoll_event evs[maxevents];
- int rc;
-
- if (!events) {
- RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
- return -1;
- }
-
- /* using per thread epoll fd */
- if (epfd == RTE_EPOLL_PER_THREAD)
- epfd = rte_intr_tls_epfd();
-
- while (1) {
- rc = epoll_wait(epfd, evs, maxevents, timeout);
- if (likely(rc > 0)) {
- /* epoll_wait has at least one fd ready to read */
- rc = eal_epoll_process_event(evs, rc, events);
- break;
- } else if (rc < 0) {
- if (errno == EINTR)
- continue;
- /* epoll_wait fail */
-			RTE_LOG(ERR, EAL, "epoll_wait returned with error: %s\n",
- strerror(errno));
- rc = -1;
- break;
- } else {
- /* rc == 0, epoll_wait timed out */
- break;
- }
- }
-
- return rc;
-}
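-
-/*
- * Illustrative sketch, not part of the original file: waiting on the calling
- * thread's implicit epoll instance. RTE_EPOLL_PER_THREAD resolves to the
- * lazily created per-lcore fd returned by rte_intr_tls_epfd() above;
- * handle_event() is a hypothetical consumer.
- */
-#if 0 /* illustrative only */
-	struct rte_epoll_event ev[8];
-	int i, n;
-
-	n = rte_epoll_wait(RTE_EPOLL_PER_THREAD, ev, 8, 100 /* ms */);
-	for (i = 0; i < n; i++) {
-		/* epdata.data is whatever was stored via rte_epoll_ctl()
-		 * or rte_intr_rx_ctl()
-		 */
-		handle_event(ev[i].epdata.data);
-	}
-#endif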
-
-static inline void
-eal_epoll_data_safe_free(struct rte_epoll_event *ev)
-{
- while (!rte_atomic32_cmpset(&ev->status, RTE_EPOLL_VALID,
- RTE_EPOLL_INVALID))
- while (ev->status != RTE_EPOLL_VALID)
- rte_pause();
- memset(&ev->epdata, 0, sizeof(ev->epdata));
- ev->fd = -1;
- ev->epfd = -1;
-}
-
-int
-rte_epoll_ctl(int epfd, int op, int fd,
- struct rte_epoll_event *event)
-{
- struct epoll_event ev;
-
- if (!event) {
- RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
- return -1;
- }
-
- /* using per thread epoll fd */
- if (epfd == RTE_EPOLL_PER_THREAD)
- epfd = rte_intr_tls_epfd();
-
- if (op == EPOLL_CTL_ADD) {
- event->status = RTE_EPOLL_VALID;
- event->fd = fd; /* ignore fd in event */
- event->epfd = epfd;
- ev.data.ptr = (void *)event;
- }
-
- ev.events = event->epdata.event;
- if (epoll_ctl(epfd, op, fd, &ev) < 0) {
-		RTE_LOG(ERR, EAL, "Error in epoll_ctl for op %d, fd %d: %s\n",
- op, fd, strerror(errno));
- if (op == EPOLL_CTL_ADD)
-			/* roll back the status when CTL_ADD fails */
- event->status = RTE_EPOLL_INVALID;
- return -1;
- }
-
- if (op == EPOLL_CTL_DEL && event->status != RTE_EPOLL_INVALID)
- eal_epoll_data_safe_free(event);
-
- return 0;
-}
-
-int
-rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
- int op, unsigned int vec, void *data)
-{
- struct rte_epoll_event *rev;
- struct rte_epoll_data *epdata;
- int epfd_op;
- unsigned int efd_idx;
- int rc = 0;
-
- efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ?
- (vec - RTE_INTR_VEC_RXTX_OFFSET) : vec;
-
- if (!intr_handle || intr_handle->nb_efd == 0 ||
- efd_idx >= intr_handle->nb_efd) {
- RTE_LOG(ERR, EAL, "Wrong intr vector number.\n");
- return -EPERM;
- }
-
- switch (op) {
- case RTE_INTR_EVENT_ADD:
- epfd_op = EPOLL_CTL_ADD;
- rev = &intr_handle->elist[efd_idx];
- if (rev->status != RTE_EPOLL_INVALID) {
-			RTE_LOG(INFO, EAL, "Event has already been added.\n");
- return -EEXIST;
- }
-
- /* attach to intr vector fd */
- epdata = &rev->epdata;
- epdata->event = EPOLLIN | EPOLLPRI | EPOLLET;
- epdata->data = data;
- epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr;
- epdata->cb_arg = (void *)intr_handle;
- rc = rte_epoll_ctl(epfd, epfd_op,
- intr_handle->efds[efd_idx], rev);
- if (!rc)
- RTE_LOG(DEBUG, EAL,
- "efd %d associated with vec %d added on epfd %d"
- "\n", rev->fd, vec, epfd);
- else
- rc = -EPERM;
- break;
- case RTE_INTR_EVENT_DEL:
- epfd_op = EPOLL_CTL_DEL;
- rev = &intr_handle->elist[efd_idx];
- if (rev->status == RTE_EPOLL_INVALID) {
- RTE_LOG(INFO, EAL, "Event does not exist.\n");
- return -EPERM;
- }
-
- rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev);
- if (rc)
- rc = -EPERM;
- break;
- default:
- RTE_LOG(ERR, EAL, "event op type mismatch\n");
- rc = -EPERM;
- }
-
- return rc;
-}
-
-void
-rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle)
-{
- uint32_t i;
- struct rte_epoll_event *rev;
-
- for (i = 0; i < intr_handle->nb_efd; i++) {
- rev = &intr_handle->elist[i];
- if (rev->status == RTE_EPOLL_INVALID)
- continue;
- if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) {
-			/* if EPOLL_CTL_DEL failed, force-free the entry */
- eal_epoll_data_safe_free(rev);
- rev->status = RTE_EPOLL_INVALID;
- }
- }
-}
-
-int
-rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd)
-{
- uint32_t i;
- int fd;
- uint32_t n = RTE_MIN(nb_efd, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
-
- assert(nb_efd != 0);
-
- if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX) {
- for (i = 0; i < n; i++) {
- fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
- if (fd < 0) {
- RTE_LOG(ERR, EAL,
-					"can't set up eventfd, error %i (%s)\n",
- errno, strerror(errno));
- return -errno;
- }
- intr_handle->efds[i] = fd;
- }
- intr_handle->nb_efd = n;
- intr_handle->max_intr = NB_OTHER_INTR + n;
- } else if (intr_handle->type == RTE_INTR_HANDLE_VDEV) {
-		/* only check; initialization is done in the vdev driver. */
- if (intr_handle->efd_counter_size >
- sizeof(union rte_intr_read_buffer)) {
-			RTE_LOG(ERR, EAL, "the efd_counter_size is oversized\n");
- return -EINVAL;
- }
- } else {
- intr_handle->efds[0] = intr_handle->fd;
- intr_handle->nb_efd = RTE_MIN(nb_efd, 1U);
- intr_handle->max_intr = NB_OTHER_INTR;
- }
-
- return 0;
-}
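-
-/*
- * Illustrative sketch, not part of the original file: typical Rx-interrupt
- * setup, combining rte_intr_efd_enable() above with rte_intr_rx_ctl()
- * (defined earlier in this file). The function name is hypothetical and
- * cleanup on failure is elided.
- */
-#if 0 /* illustrative only */
-static int
-example_rxq_intr_setup(struct rte_intr_handle *ih, uint16_t nb_rx_queues)
-{
-	uint16_t q;
-
-	/* one eventfd per Rx queue (VFIO MSI-X case) */
-	if (rte_intr_efd_enable(ih, nb_rx_queues) != 0)
-		return -1;
-
-	/* map each queue's efd into this thread's epoll instance; the
-	 * vector number skips slot 0, reserved for non-queue interrupts
-	 */
-	for (q = 0; q < nb_rx_queues; q++)
-		if (rte_intr_rx_ctl(ih, RTE_EPOLL_PER_THREAD,
-				RTE_INTR_EVENT_ADD,
-				q + RTE_INTR_VEC_RXTX_OFFSET, NULL) != 0)
-			return -1;
-	return 0;
-}
-#endif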
-
-void
-rte_intr_efd_disable(struct rte_intr_handle *intr_handle)
-{
- uint32_t i;
-
- rte_intr_free_epoll_fd(intr_handle);
- if (intr_handle->max_intr > intr_handle->nb_efd) {
- for (i = 0; i < intr_handle->nb_efd; i++)
- close(intr_handle->efds[i]);
- }
- intr_handle->nb_efd = 0;
- intr_handle->max_intr = 0;
-}
-
-int
-rte_intr_dp_is_en(struct rte_intr_handle *intr_handle)
-{
- return !(!intr_handle->nb_efd);
-}
-
-int
-rte_intr_allow_others(struct rte_intr_handle *intr_handle)
-{
- if (!rte_intr_dp_is_en(intr_handle))
- return 1;
- else
- return !!(intr_handle->max_intr - intr_handle->nb_efd);
-}
-
-int
-rte_intr_cap_multiple(struct rte_intr_handle *intr_handle)
-{
- if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX)
- return 1;
-
- if (intr_handle->type == RTE_INTR_HANDLE_VDEV)
- return 1;
-
- return 0;
-}
-
-int
-rte_thread_is_intr(void)
-{
- return pthread_equal(intr_thread, pthread_self());
-}
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
- */
-
-#include <unistd.h>
-#include <limits.h>
-#include <string.h>
-#include <dirent.h>
-
-#include <rte_log.h>
-#include <rte_eal.h>
-#include <rte_lcore.h>
-#include <rte_common.h>
-#include <rte_string_fns.h>
-#include <rte_debug.h>
-
-#include "eal_private.h"
-#include "eal_filesystem.h"
-#include "eal_thread.h"
-
-#define SYS_CPU_DIR "/sys/devices/system/cpu/cpu%u"
-#define CORE_ID_FILE "topology/core_id"
-#define NUMA_NODE_PATH "/sys/devices/system/node"
-
-/* Check whether a CPU is present by checking that its sysfs information exists */
-int
-eal_cpu_detected(unsigned lcore_id)
-{
- char path[PATH_MAX];
- int len = snprintf(path, sizeof(path), SYS_CPU_DIR
- "/"CORE_ID_FILE, lcore_id);
- if (len <= 0 || (unsigned)len >= sizeof(path))
- return 0;
- if (access(path, F_OK) != 0)
- return 0;
-
- return 1;
-}
-
-/*
- * Get CPU socket id (NUMA node) for a logical core.
- *
- * This searches each nodeX directory in /sys for a symlink for the given
- * lcore_id and returns the NUMA node where the lcore is found. If the lcore
- * is not found on any NUMA node, zero is returned.
- */
-unsigned
-eal_cpu_socket_id(unsigned lcore_id)
-{
- unsigned socket;
-
- for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) {
- char path[PATH_MAX];
-
- snprintf(path, sizeof(path), "%s/node%u/cpu%u", NUMA_NODE_PATH,
- socket, lcore_id);
- if (access(path, F_OK) == 0)
- return socket;
- }
- return 0;
-}
-
-/* Get the cpu core id value from the /sys/.../cpuX core_id value */
-unsigned
-eal_cpu_core_id(unsigned lcore_id)
-{
- char path[PATH_MAX];
- unsigned long id;
-
- int len = snprintf(path, sizeof(path), SYS_CPU_DIR "/%s", lcore_id, CORE_ID_FILE);
- if (len <= 0 || (unsigned)len >= sizeof(path))
- goto err;
- if (eal_parse_sysfs_value(path, &id) != 0)
- goto err;
- return (unsigned)id;
-
-err:
- RTE_LOG(ERR, EAL, "Error reading core id value from %s "
- "for lcore %u - assuming core 0\n", SYS_CPU_DIR, lcore_id);
- return 0;
-}
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
- */
-
-#include <string.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <sys/types.h>
-#include <syslog.h>
-#include <sys/queue.h>
-
-#include <rte_memory.h>
-#include <rte_eal.h>
-#include <rte_launch.h>
-#include <rte_per_lcore.h>
-#include <rte_lcore.h>
-#include <rte_spinlock.h>
-#include <rte_log.h>
-
-#include "eal_private.h"
-
-/*
- * default log function
- */
-static ssize_t
-console_log_write(__attribute__((unused)) void *c, const char *buf, size_t size)
-{
- ssize_t ret;
-
- /* write on stdout */
- ret = fwrite(buf, 1, size, stdout);
- fflush(stdout);
-
- /* Syslog error levels are from 0 to 7, so subtract 1 to convert */
- syslog(rte_log_cur_msg_loglevel() - 1, "%.*s", (int)size, buf);
-
- return ret;
-}
-
-static cookie_io_functions_t console_log_func = {
- .write = console_log_write,
-};
-
-/*
- * set the log to default function, called during eal init process,
- * once memzones are available.
- */
-int
-rte_eal_log_init(const char *id, int facility)
-{
- FILE *log_stream;
-
- log_stream = fopencookie(NULL, "w+", console_log_func);
- if (log_stream == NULL)
- return -1;
-
- openlog(id, LOG_NDELAY | LOG_PID, facility);
-
- eal_log_set_default(log_stream);
-
- return 0;
-}
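-
-/*
- * Illustrative sketch, not part of the original file: fopencookie() is the
- * glue here -- it creates a stdio stream whose writes are redirected to the
- * hook above. A minimal standalone equivalent (fopencookie is a glibc
- * extension):
- */
-#if 0 /* illustrative only */
-	cookie_io_functions_t funcs = { .write = console_log_write };
-	FILE *f = fopencookie(NULL, "w+", funcs);
-
-	fprintf(f, "hello\n");	/* lands on stdout and in syslog */
-	fflush(f);
-#endif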
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2017-2018 Intel Corporation
- */
-
-#include <errno.h>
-#include <stdarg.h>
-#include <stdbool.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <inttypes.h>
-#include <string.h>
-#include <sys/mman.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/queue.h>
-#include <sys/file.h>
-#include <unistd.h>
-#include <limits.h>
-#include <fcntl.h>
-#include <sys/ioctl.h>
-#include <sys/time.h>
-#include <signal.h>
-#include <setjmp.h>
-#ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */
-#include <linux/memfd.h>
-#define MEMFD_SUPPORTED
-#endif
-#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
-#include <numa.h>
-#include <numaif.h>
-#endif
-#include <linux/falloc.h>
-#include <linux/mman.h> /* for hugetlb-related mmap flags */
-
-#include <rte_common.h>
-#include <rte_log.h>
-#include <rte_eal.h>
-#include <rte_errno.h>
-#include <rte_memory.h>
-#include <rte_spinlock.h>
-
-#include "eal_filesystem.h"
-#include "eal_internal_cfg.h"
-#include "eal_memalloc.h"
-#include "eal_memcfg.h"
-#include "eal_private.h"
-
-const int anonymous_hugepages_supported =
-#ifdef MAP_HUGE_SHIFT
- 1;
-#define RTE_MAP_HUGE_SHIFT MAP_HUGE_SHIFT
-#else
- 0;
-#define RTE_MAP_HUGE_SHIFT 26
-#endif
-
-/*
- * we've already checked memfd support at compile-time, but we also need to
- * check if we can create hugepage files with memfd.
- *
- * also, this is not a constant, because while we may be *compiled* with memfd
- * hugetlbfs support, we might not be *running* on a system that supports memfd
- * and/or memfd with hugetlbfs, so we need to be able to adjust this flag at
- * runtime, and fall back to anonymous memory.
- */
-static int memfd_create_supported =
-#ifdef MFD_HUGETLB
- 1;
-#define RTE_MFD_HUGETLB MFD_HUGETLB
-#else
- 0;
-#define RTE_MFD_HUGETLB 4U
-#endif
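-
-/*
- * Illustrative sketch, not part of the original file: one way such a runtime
- * probe can look. On kernels without hugetlbfs-backed memfd, passing
- * MFD_HUGETLB to memfd_create() fails with EINVAL, so the flag can be tested
- * once and the result cached. pagesz_flags() is defined further down in this
- * file; the function name here is hypothetical.
- */
-#if 0 /* illustrative only */
-static int
-probe_memfd_hugetlb(uint64_t page_sz)
-{
-	int fd = memfd_create("probe", RTE_MFD_HUGETLB |
-			pagesz_flags(page_sz));
-	if (fd < 0)
-		return errno == EINVAL ? 0 : -1; /* 0 == flag unsupported */
-	close(fd);
-	return 1;
-}
-#endif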
-
-/*
- * not all kernel versions support fallocate() on hugetlbfs, so fall back to
- * ftruncate and disallow deallocation if fallocate is not supported.
- */
-static int fallocate_supported = -1; /* unknown */
-
-/*
- * we have two modes - single file segments, and file-per-page mode.
- *
- * for single-file segments, we use memseg_list_fd to store the segment fd,
- * while the fds[] will not be allocated, and len will be set to 0.
- *
- * for file-per-page mode, each page will have its own fd, so 'memseg_list_fd'
- * will be invalid (set to -1), and we'll use 'fds' to keep track of page fd's.
- *
- * we cannot know how many pages a system will have in advance, but we do know
- * that they come in lists, and we know lengths of these lists. so, simply store
- * a malloc'd array of fd's indexed by list and segment index.
- *
- * they will be initialized at startup, and filled as we allocate/deallocate
- * segments.
- */
-static struct {
- int *fds; /**< dynamically allocated array of segment lock fd's */
- int memseg_list_fd; /**< memseg list fd */
- int len; /**< total length of the array */
- int count; /**< entries used in an array */
-} fd_list[RTE_MAX_MEMSEG_LISTS];
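-
-/*
- * Illustrative sketch, not part of the original file: how the layout above is
- * typically populated for file-per-page mode -- one slot per potential segment
- * in the list, each initialized to "no fd yet" (-1). The function name is
- * hypothetical.
- */
-#if 0 /* illustrative only */
-static int
-example_fd_list_init(unsigned int list_idx, unsigned int nb_segs)
-{
-	unsigned int i;
-	int *data = malloc(sizeof(int) * nb_segs);
-
-	if (data == NULL)
-		return -1;
-	for (i = 0; i < nb_segs; i++)
-		data[i] = -1;
-	fd_list[list_idx].fds = data;
-	fd_list[list_idx].len = nb_segs;
-	fd_list[list_idx].count = 0;
-	fd_list[list_idx].memseg_list_fd = -1;
-	return 0;
-}
-#endif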
-
-/** local copy of a memory map, used to synchronize memory hotplug in MP */
-static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS];
-
-static sigjmp_buf huge_jmpenv;
-
-static void __rte_unused huge_sigbus_handler(int signo __rte_unused)
-{
- siglongjmp(huge_jmpenv, 1);
-}
-
-/* Wrap sigsetjmp in a helper function to avoid a compile error: any
- * non-volatile, non-static local variable in the stack frame calling
- * sigsetjmp might be clobbered by a call to longjmp.
- */
-static int __rte_unused huge_wrap_sigsetjmp(void)
-{
- return sigsetjmp(huge_jmpenv, 1);
-}
-
-static struct sigaction huge_action_old;
-static int huge_need_recover;
-
-static void __rte_unused
-huge_register_sigbus(void)
-{
- sigset_t mask;
- struct sigaction action;
-
- sigemptyset(&mask);
- sigaddset(&mask, SIGBUS);
- action.sa_flags = 0;
- action.sa_mask = mask;
- action.sa_handler = huge_sigbus_handler;
-
- huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
-}
-
-static void __rte_unused
-huge_recover_sigbus(void)
-{
- if (huge_need_recover) {
- sigaction(SIGBUS, &huge_action_old, NULL);
- huge_need_recover = 0;
- }
-}
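-
-/*
- * Illustrative note, not part of the original file: the helpers above are
- * used together as a guard around the first write to a freshly mapped
- * hugepage, roughly as follows (see alloc_seg() below for the real use):
- */
-#if 0 /* illustrative only */
-	huge_register_sigbus();
-	if (huge_wrap_sigsetjmp()) {
-		/* we got here via siglongjmp(): the touch below faulted
-		 * (e.g. a hugetlb cgroup limit), so roll the mapping back
-		 */
-	} else {
-		/* touch the page to force the fault now */
-		*(volatile int *)addr = *(volatile int *)addr;
-	}
-	huge_recover_sigbus();
-#endif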
-
-#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
-static bool
-check_numa(void)
-{
- bool ret = true;
- /* Check if kernel supports NUMA. */
- if (numa_available() != 0) {
- RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n");
- ret = false;
- }
- return ret;
-}
-
-static void
-prepare_numa(int *oldpolicy, struct bitmask *oldmask, int socket_id)
-{
- RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
- if (get_mempolicy(oldpolicy, oldmask->maskp,
- oldmask->size + 1, 0, 0) < 0) {
- RTE_LOG(ERR, EAL,
- "Failed to get current mempolicy: %s. "
- "Assuming MPOL_DEFAULT.\n", strerror(errno));
- *oldpolicy = MPOL_DEFAULT;
- }
- RTE_LOG(DEBUG, EAL,
- "Setting policy MPOL_PREFERRED for socket %d\n",
- socket_id);
- numa_set_preferred(socket_id);
-}
-
-static void
-restore_numa(int *oldpolicy, struct bitmask *oldmask)
-{
- RTE_LOG(DEBUG, EAL,
- "Restoring previous memory policy: %d\n", *oldpolicy);
- if (*oldpolicy == MPOL_DEFAULT) {
- numa_set_localalloc();
- } else if (set_mempolicy(*oldpolicy, oldmask->maskp,
- oldmask->size + 1) < 0) {
- RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n",
- strerror(errno));
- numa_set_localalloc();
- }
- numa_free_cpumask(oldmask);
-}
-#endif
-
-/*
- * uses fstat to report the size of a file on disk
- */
-static off_t
-get_file_size(int fd)
-{
- struct stat st;
- if (fstat(fd, &st) < 0)
- return 0;
- return st.st_size;
-}
-
-static int
-pagesz_flags(uint64_t page_sz)
-{
-	/* as per the mmap() manpage, the page-size flag is the log2 of
-	 * the page size, shifted by MAP_HUGE_SHIFT
- */
- int log2 = rte_log2_u64(page_sz);
- return log2 << RTE_MAP_HUGE_SHIFT;
-}
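-
-/*
- * Illustrative note, not part of the original file: worked example for the
- * helper above. For a 2 MiB page, rte_log2_u64(2 * 1024 * 1024) == 21, so
- * pagesz_flags() returns 21 << MAP_HUGE_SHIFT, which is the kernel's
- * MAP_HUGE_2MB value; for a 1 GiB page it returns 30 << MAP_HUGE_SHIFT
- * (MAP_HUGE_1GB).
- */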
-
-/* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */
-static int lock(int fd, int type)
-{
- int ret;
-
- /* flock may be interrupted */
- do {
- ret = flock(fd, type | LOCK_NB);
- } while (ret && errno == EINTR);
-
- if (ret && errno == EWOULDBLOCK) {
- /* couldn't lock */
- return 0;
- } else if (ret) {
- RTE_LOG(ERR, EAL, "%s(): error calling flock(): %s\n",
- __func__, strerror(errno));
- return -1;
- }
- /* lock was successful */
- return 1;
-}
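-
-/*
- * Illustrative note, not part of the original file: the tri-state return of
- * lock() is what lets hugepage files double as reference counts; the cleanup
- * paths below use it roughly like this:
- */
-#if 0 /* illustrative only */
-	switch (lock(fd, LOCK_EX)) {
-	case 1:		/* exclusive lock taken: no other process holds the page */
-		unlink(path);
-		break;
-	case 0:		/* someone else still maps the page: keep the file */
-		break;
-	default:	/* -1: flock() itself failed */
-		return -1;
-	}
-#endif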
-
-static int
-get_seg_memfd(struct hugepage_info *hi __rte_unused,
- unsigned int list_idx __rte_unused,
- unsigned int seg_idx __rte_unused)
-{
-#ifdef MEMFD_SUPPORTED
- int fd;
- char segname[250]; /* as per manpage, limit is 249 bytes plus null */
-
- int flags = RTE_MFD_HUGETLB | pagesz_flags(hi->hugepage_sz);
-
- if (internal_config.single_file_segments) {
- fd = fd_list[list_idx].memseg_list_fd;
-
- if (fd < 0) {
- snprintf(segname, sizeof(segname), "seg_%i", list_idx);
- fd = memfd_create(segname, flags);
- if (fd < 0) {
- RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n",
- __func__, strerror(errno));
- return -1;
- }
- fd_list[list_idx].memseg_list_fd = fd;
- }
- } else {
- fd = fd_list[list_idx].fds[seg_idx];
-
- if (fd < 0) {
- snprintf(segname, sizeof(segname), "seg_%i-%i",
- list_idx, seg_idx);
- fd = memfd_create(segname, flags);
- if (fd < 0) {
- RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n",
- __func__, strerror(errno));
- return -1;
- }
- fd_list[list_idx].fds[seg_idx] = fd;
- }
- }
- return fd;
-#endif
- return -1;
-}
-
-static int
-get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
- unsigned int list_idx, unsigned int seg_idx)
-{
- int fd;
-
- /* for in-memory mode, we only make it here when we're sure we support
- * memfd, and this is a special case.
- */
- if (internal_config.in_memory)
- return get_seg_memfd(hi, list_idx, seg_idx);
-
- if (internal_config.single_file_segments) {
- /* create a hugepage file path */
- eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx);
-
- fd = fd_list[list_idx].memseg_list_fd;
-
- if (fd < 0) {
- fd = open(path, O_CREAT | O_RDWR, 0600);
- if (fd < 0) {
- RTE_LOG(ERR, EAL, "%s(): open failed: %s\n",
- __func__, strerror(errno));
- return -1;
- }
- /* take out a read lock and keep it indefinitely */
- if (lock(fd, LOCK_SH) < 0) {
- RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n",
- __func__, strerror(errno));
- close(fd);
- return -1;
- }
- fd_list[list_idx].memseg_list_fd = fd;
- }
- } else {
- /* create a hugepage file path */
- eal_get_hugefile_path(path, buflen, hi->hugedir,
- list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
-
- fd = fd_list[list_idx].fds[seg_idx];
-
- if (fd < 0) {
- fd = open(path, O_CREAT | O_RDWR, 0600);
- if (fd < 0) {
- RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n",
- __func__, strerror(errno));
- return -1;
- }
- /* take out a read lock */
- if (lock(fd, LOCK_SH) < 0) {
- RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n",
- __func__, strerror(errno));
- close(fd);
- return -1;
- }
- fd_list[list_idx].fds[seg_idx] = fd;
- }
- }
- return fd;
-}
-
-static int
-resize_hugefile_in_memory(int fd, uint64_t fa_offset,
- uint64_t page_sz, bool grow)
-{
- int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
- FALLOC_FL_KEEP_SIZE;
- int ret;
-
- /* grow or shrink the file */
- ret = fallocate(fd, flags, fa_offset, page_sz);
-
- if (ret < 0) {
- RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
- __func__,
- strerror(errno));
- return -1;
- }
- return 0;
-}
-
-static int
-resize_hugefile_in_filesystem(int fd, uint64_t fa_offset, uint64_t page_sz,
- bool grow)
-{
- bool again = false;
-
- do {
- if (fallocate_supported == 0) {
- /* we cannot deallocate memory if fallocate() is not
- * supported, and hugepage file is already locked at
- * creation, so no further synchronization needed.
- */
-
- if (!grow) {
- RTE_LOG(DEBUG, EAL, "%s(): fallocate not supported, not freeing page back to the system\n",
- __func__);
- return -1;
- }
- uint64_t new_size = fa_offset + page_sz;
- uint64_t cur_size = get_file_size(fd);
-
- /* fallocate isn't supported, fall back to ftruncate */
- if (new_size > cur_size &&
- ftruncate(fd, new_size) < 0) {
- RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
- __func__, strerror(errno));
- return -1;
- }
- } else {
- int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
- FALLOC_FL_KEEP_SIZE;
- int ret;
-
- /*
- * technically, it is perfectly safe for both primary
- * and secondary to grow and shrink the page files:
- * growing the file repeatedly has no effect because
- * a page can only be allocated once, while mmap ensures
- * that secondaries hold on to the page even after the
- * page itself is removed from the filesystem.
- *
- * however, leaving growing/shrinking to the primary
- * tends to expose bugs in fdlist page count handling,
- * so leave this here just in case.
- */
- if (rte_eal_process_type() != RTE_PROC_PRIMARY)
- return 0;
-
- /* grow or shrink the file */
- ret = fallocate(fd, flags, fa_offset, page_sz);
-
- if (ret < 0) {
- if (fallocate_supported == -1 &&
- errno == ENOTSUP) {
- RTE_LOG(ERR, EAL, "%s(): fallocate() not supported, hugepage deallocation will be disabled\n",
- __func__);
- again = true;
- fallocate_supported = 0;
- } else {
- RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
- __func__,
- strerror(errno));
- return -1;
- }
- } else
- fallocate_supported = 1;
- }
- } while (again);
-
- return 0;
-}
-
-static void
-close_hugefile(int fd, char *path, int list_idx)
-{
- /*
- * primary process must unlink the file, but only when not in in-memory
- * mode (as in that case there is no file to unlink).
- */
- if (!internal_config.in_memory &&
- rte_eal_process_type() == RTE_PROC_PRIMARY &&
- unlink(path))
- RTE_LOG(ERR, EAL, "%s(): unlinking '%s' failed: %s\n",
- __func__, path, strerror(errno));
-
- close(fd);
- fd_list[list_idx].memseg_list_fd = -1;
-}
-
-static int
-resize_hugefile(int fd, uint64_t fa_offset, uint64_t page_sz, bool grow)
-{
- /* in-memory mode is a special case, because we can be sure that
- * fallocate() is supported.
- */
- if (internal_config.in_memory)
- return resize_hugefile_in_memory(fd, fa_offset,
- page_sz, grow);
-
- return resize_hugefile_in_filesystem(fd, fa_offset, page_sz,
- grow);
-}
-
-static int
-alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
- struct hugepage_info *hi, unsigned int list_idx,
- unsigned int seg_idx)
-{
-#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
- int cur_socket_id = 0;
-#endif
- uint64_t map_offset;
- rte_iova_t iova;
- void *va;
- char path[PATH_MAX];
- int ret = 0;
- int fd;
- size_t alloc_sz;
- int flags;
- void *new_addr;
-
- alloc_sz = hi->hugepage_sz;
-
- /* these are checked at init, but code analyzers don't know that */
- if (internal_config.in_memory && !anonymous_hugepages_supported) {
- RTE_LOG(ERR, EAL, "Anonymous hugepages not supported, in-memory mode cannot allocate memory\n");
- return -1;
- }
- if (internal_config.in_memory && !memfd_create_supported &&
- internal_config.single_file_segments) {
- RTE_LOG(ERR, EAL, "Single-file segments are not supported without memfd support\n");
- return -1;
- }
-
- /* in-memory without memfd is a special case */
- int mmap_flags;
-
- if (internal_config.in_memory && !memfd_create_supported) {
- const int in_memory_flags = MAP_HUGETLB | MAP_FIXED |
- MAP_PRIVATE | MAP_ANONYMOUS;
- int pagesz_flag;
-
- pagesz_flag = pagesz_flags(alloc_sz);
- fd = -1;
- mmap_flags = in_memory_flags | pagesz_flag;
-
- /* single-file segments codepath will never be active
- * here because in-memory mode is incompatible with the
- * fallback path, and it's stopped at EAL initialization
- * stage.
- */
- map_offset = 0;
- } else {
- /* takes out a read lock on segment or segment list */
- fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
- if (fd < 0) {
- RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n");
- return -1;
- }
-
- if (internal_config.single_file_segments) {
- map_offset = seg_idx * alloc_sz;
- ret = resize_hugefile(fd, map_offset, alloc_sz, true);
- if (ret < 0)
- goto resized;
-
- fd_list[list_idx].count++;
- } else {
- map_offset = 0;
- if (ftruncate(fd, alloc_sz) < 0) {
- RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
- __func__, strerror(errno));
- goto resized;
- }
- if (internal_config.hugepage_unlink &&
- !internal_config.in_memory) {
- if (unlink(path)) {
- RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n",
- __func__, strerror(errno));
- goto resized;
- }
- }
- }
- mmap_flags = MAP_SHARED | MAP_POPULATE | MAP_FIXED;
- }
-
- /*
-	 * map the segment and populate page tables; the kernel fills
- * this segment with zeros if it's a new page.
- */
- va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, mmap_flags, fd,
- map_offset);
-
- if (va == MAP_FAILED) {
- RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__,
- strerror(errno));
- /* mmap failed, but the previous region might have been
- * unmapped anyway. try to remap it
- */
- goto unmapped;
- }
- if (va != addr) {
- RTE_LOG(DEBUG, EAL, "%s(): wrong mmap() address\n", __func__);
- munmap(va, alloc_sz);
- goto resized;
- }
-
-	/* On Linux, hugetlb limits (such as cgroup limits) are
-	 * enforced at fault time rather than at mmap() time, even
-	 * with MAP_POPULATE, and the kernel sends SIGBUS on a
-	 * violation. To avoid being killed, save the stack
-	 * environment here; if SIGBUS is raised, we can jump
-	 * back to this point.
- */
- if (huge_wrap_sigsetjmp()) {
- RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more hugepages of size %uMB\n",
- (unsigned int)(alloc_sz >> 20));
- goto mapped;
- }
-
-	/* we need to trigger a write to the page to force a page fault and
-	 * ensure that the page is accessible to us, but we can't overwrite a
-	 * value that is already there, so read the old value and write it
-	 * back. the kernel populates the page with zeroes initially.
- */
- *(volatile int *)addr = *(volatile int *)addr;
-
- iova = rte_mem_virt2iova(addr);
- if (iova == RTE_BAD_PHYS_ADDR) {
- RTE_LOG(DEBUG, EAL, "%s(): can't get IOVA addr\n",
- __func__);
- goto mapped;
- }
-
-#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
- ret = get_mempolicy(&cur_socket_id, NULL, 0, addr,
- MPOL_F_NODE | MPOL_F_ADDR);
- if (ret < 0) {
- RTE_LOG(DEBUG, EAL, "%s(): get_mempolicy: %s\n",
- __func__, strerror(errno));
- goto mapped;
- } else if (cur_socket_id != socket_id) {
- RTE_LOG(DEBUG, EAL,
- "%s(): allocation happened on wrong socket (wanted %d, got %d)\n",
- __func__, socket_id, cur_socket_id);
- goto mapped;
- }
-#else
- if (rte_socket_count() > 1)
- RTE_LOG(DEBUG, EAL, "%s(): not checking hugepage NUMA node.\n",
- __func__);
-#endif
-
- ms->addr = addr;
- ms->hugepage_sz = alloc_sz;
- ms->len = alloc_sz;
- ms->nchannel = rte_memory_get_nchannel();
- ms->nrank = rte_memory_get_nrank();
- ms->iova = iova;
- ms->socket_id = socket_id;
-
- return 0;
-
-mapped:
- munmap(addr, alloc_sz);
-unmapped:
- flags = MAP_FIXED;
- new_addr = eal_get_virtual_area(addr, &alloc_sz, alloc_sz, 0, flags);
- if (new_addr != addr) {
- if (new_addr != NULL)
- munmap(new_addr, alloc_sz);
- /* we're leaving a hole in our virtual address space. if
- * somebody else maps this hole now, we could accidentally
-		 * overwrite it in the future.
- */
- RTE_LOG(CRIT, EAL, "Can't mmap holes in our virtual address space\n");
- }
- /* roll back the ref count */
- if (internal_config.single_file_segments)
- fd_list[list_idx].count--;
-resized:
- /* some codepaths will return negative fd, so exit early */
- if (fd < 0)
- return -1;
-
- if (internal_config.single_file_segments) {
- resize_hugefile(fd, map_offset, alloc_sz, false);
- /* ignore failure, can't make it any worse */
-
- /* if refcount is at zero, close the file */
- if (fd_list[list_idx].count == 0)
- close_hugefile(fd, path, list_idx);
- } else {
- /* only remove file if we can take out a write lock */
- if (internal_config.hugepage_unlink == 0 &&
- internal_config.in_memory == 0 &&
- lock(fd, LOCK_EX) == 1)
- unlink(path);
- close(fd);
- fd_list[list_idx].fds[seg_idx] = -1;
- }
- return -1;
-}
-
-static int
-free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
- unsigned int list_idx, unsigned int seg_idx)
-{
- uint64_t map_offset;
- char path[PATH_MAX];
- int fd, ret = 0;
- bool exit_early;
-
- /* erase page data */
- memset(ms->addr, 0, ms->len);
-
- if (mmap(ms->addr, ms->len, PROT_READ,
- MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) ==
- MAP_FAILED) {
- RTE_LOG(DEBUG, EAL, "couldn't unmap page\n");
- return -1;
- }
-
- exit_early = false;
-
- /* if we're using anonymous hugepages, nothing to be done */
- if (internal_config.in_memory && !memfd_create_supported)
- exit_early = true;
-
- /* if we've already unlinked the page, nothing needs to be done */
- if (!internal_config.in_memory && internal_config.hugepage_unlink)
- exit_early = true;
-
- if (exit_early) {
- memset(ms, 0, sizeof(*ms));
- return 0;
- }
-
- /* if we are not in single file segments mode, we're going to unmap the
-	 * segment and thus drop the lock on the original fd, but the hugepage
-	 * dir is now locked, so we can take out another lock without races.
- */
- fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
- if (fd < 0)
- return -1;
-
- if (internal_config.single_file_segments) {
- map_offset = seg_idx * ms->len;
- if (resize_hugefile(fd, map_offset, ms->len, false))
- return -1;
-
- if (--(fd_list[list_idx].count) == 0)
- close_hugefile(fd, path, list_idx);
-
- ret = 0;
- } else {
- /* if we're able to take out a write lock, we're the last one
- * holding onto this page.
- */
- if (!internal_config.in_memory) {
- ret = lock(fd, LOCK_EX);
- if (ret >= 0) {
- /* no one else is using this page */
- if (ret == 1)
- unlink(path);
- }
- }
- /* closing fd will drop the lock */
- close(fd);
- fd_list[list_idx].fds[seg_idx] = -1;
- }
-
- memset(ms, 0, sizeof(*ms));
-
- return ret < 0 ? -1 : 0;
-}
-
-struct alloc_walk_param {
- struct hugepage_info *hi;
- struct rte_memseg **ms;
- size_t page_sz;
- unsigned int segs_allocated;
- unsigned int n_segs;
- int socket;
- bool exact;
-};
-static int
-alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
-{
- struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- struct alloc_walk_param *wa = arg;
- struct rte_memseg_list *cur_msl;
- size_t page_sz;
- int cur_idx, start_idx, j, dir_fd = -1;
- unsigned int msl_idx, need, i;
-
- if (msl->page_sz != wa->page_sz)
- return 0;
- if (msl->socket_id != wa->socket)
- return 0;
-
- page_sz = (size_t)msl->page_sz;
-
- msl_idx = msl - mcfg->memsegs;
- cur_msl = &mcfg->memsegs[msl_idx];
-
- need = wa->n_segs;
-
- /* try finding space in memseg list */
- if (wa->exact) {
- /* if we require exact number of pages in a list, find them */
- cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0,
- need);
- if (cur_idx < 0)
- return 0;
- start_idx = cur_idx;
- } else {
- int cur_len;
-
- /* we don't require exact number of pages, so we're going to go
- * for best-effort allocation. that means finding the biggest
- * unused block, and going with that.
- */
- cur_idx = rte_fbarray_find_biggest_free(&cur_msl->memseg_arr,
- 0);
- if (cur_idx < 0)
- return 0;
- start_idx = cur_idx;
- /* adjust the size to possibly be smaller than original
- * request, but do not allow it to be bigger.
- */
- cur_len = rte_fbarray_find_contig_free(&cur_msl->memseg_arr,
- cur_idx);
- need = RTE_MIN(need, (unsigned int)cur_len);
- }
-
- /* do not allow any page allocations during the time we're allocating,
- * because file creation and locking operations are not atomic,
- * and we might be the first or the last ones to use a particular page,
- * so we need to ensure atomicity of every operation.
- *
- * during init, we already hold a write lock, so don't try to take out
- * another one.
- */
- if (wa->hi->lock_descriptor == -1 && !internal_config.in_memory) {
- dir_fd = open(wa->hi->hugedir, O_RDONLY);
- if (dir_fd < 0) {
- RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
- __func__, wa->hi->hugedir, strerror(errno));
- return -1;
- }
- /* blocking writelock */
- if (flock(dir_fd, LOCK_EX)) {
- RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n",
- __func__, wa->hi->hugedir, strerror(errno));
- close(dir_fd);
- return -1;
- }
- }
-
- for (i = 0; i < need; i++, cur_idx++) {
- struct rte_memseg *cur;
- void *map_addr;
-
- cur = rte_fbarray_get(&cur_msl->memseg_arr, cur_idx);
- map_addr = RTE_PTR_ADD(cur_msl->base_va,
- cur_idx * page_sz);
-
- if (alloc_seg(cur, map_addr, wa->socket, wa->hi,
- msl_idx, cur_idx)) {
- RTE_LOG(DEBUG, EAL, "attempted to allocate %i segments, but only %i were allocated\n",
- need, i);
-
- /* if exact number wasn't requested, stop */
- if (!wa->exact)
- goto out;
-
- /* clean up */
- for (j = start_idx; j < cur_idx; j++) {
- struct rte_memseg *tmp;
- struct rte_fbarray *arr =
- &cur_msl->memseg_arr;
-
- tmp = rte_fbarray_get(arr, j);
- rte_fbarray_set_free(arr, j);
-
- /* free_seg may attempt to create a file, which
- * may fail.
- */
- if (free_seg(tmp, wa->hi, msl_idx, j))
- RTE_LOG(DEBUG, EAL, "Cannot free page\n");
- }
- /* clear the list */
- if (wa->ms)
- memset(wa->ms, 0, sizeof(*wa->ms) * wa->n_segs);
-
- if (dir_fd >= 0)
- close(dir_fd);
- return -1;
- }
- if (wa->ms)
- wa->ms[i] = cur;
-
- rte_fbarray_set_used(&cur_msl->memseg_arr, cur_idx);
- }
-out:
- wa->segs_allocated = i;
- if (i > 0)
- cur_msl->version++;
- if (dir_fd >= 0)
- close(dir_fd);
- /* if we didn't allocate any segments, move on to the next list */
- return i > 0;
-}
-
-struct free_walk_param {
- struct hugepage_info *hi;
- struct rte_memseg *ms;
-};
-static int
-free_seg_walk(const struct rte_memseg_list *msl, void *arg)
-{
- struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- struct rte_memseg_list *found_msl;
- struct free_walk_param *wa = arg;
- uintptr_t start_addr, end_addr;
- int msl_idx, seg_idx, ret, dir_fd = -1;
-
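-	/* check whether the segment being freed falls within this memseg
-	 * list's VA range; if not, let the walk move on to the next list.
-	 */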
- start_addr = (uintptr_t) msl->base_va;
- end_addr = start_addr + msl->len;
-
- if ((uintptr_t)wa->ms->addr < start_addr ||
- (uintptr_t)wa->ms->addr >= end_addr)
- return 0;
-
- msl_idx = msl - mcfg->memsegs;
- seg_idx = RTE_PTR_DIFF(wa->ms->addr, start_addr) / msl->page_sz;
-
- /* msl is const */
- found_msl = &mcfg->memsegs[msl_idx];
-
- /* do not allow any page allocations during the time we're freeing,
- * because file creation and locking operations are not atomic,
- * and we might be the first or the last ones to use a particular page,
- * so we need to ensure atomicity of every operation.
- *
- * during init, we already hold a write lock, so don't try to take out
- * another one.
- */
- if (wa->hi->lock_descriptor == -1 && !internal_config.in_memory) {
- dir_fd = open(wa->hi->hugedir, O_RDONLY);
- if (dir_fd < 0) {
- RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
- __func__, wa->hi->hugedir, strerror(errno));
- return -1;
- }
- /* blocking writelock */
- if (flock(dir_fd, LOCK_EX)) {
- RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n",
- __func__, wa->hi->hugedir, strerror(errno));
- close(dir_fd);
- return -1;
- }
- }
-
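-	/* bump the list version so that secondary processes notice the
-	 * change and resynchronize (see sync_existing()).
-	 */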
- found_msl->version++;
-
- rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx);
-
- ret = free_seg(wa->ms, wa->hi, msl_idx, seg_idx);
-
- if (dir_fd >= 0)
- close(dir_fd);
-
- if (ret < 0)
- return -1;
-
- return 1;
-}
-
-int
-eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz,
- int socket, bool exact)
-{
- int i, ret = -1;
-#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
- bool have_numa = false;
- int oldpolicy;
- struct bitmask *oldmask;
-#endif
- struct alloc_walk_param wa;
- struct hugepage_info *hi = NULL;
-
- memset(&wa, 0, sizeof(wa));
-
- /* dynamic allocation not supported in legacy mode */
- if (internal_config.legacy_mem)
- return -1;
-
- for (i = 0; i < (int) RTE_DIM(internal_config.hugepage_info); i++) {
- if (page_sz ==
- internal_config.hugepage_info[i].hugepage_sz) {
- hi = &internal_config.hugepage_info[i];
- break;
- }
- }
- if (!hi) {
- RTE_LOG(ERR, EAL, "%s(): can't find relevant hugepage_info entry\n",
- __func__);
- return -1;
- }
-
-#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
- if (check_numa()) {
- oldmask = numa_allocate_nodemask();
- prepare_numa(&oldpolicy, oldmask, socket);
- have_numa = true;
- }
-#endif
-
- wa.exact = exact;
- wa.hi = hi;
- wa.ms = ms;
- wa.n_segs = n_segs;
- wa.page_sz = page_sz;
- wa.socket = socket;
- wa.segs_allocated = 0;
-
- /* memalloc is locked, so it's safe to use thread-unsafe version */
- ret = rte_memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa);
- if (ret == 0) {
- RTE_LOG(ERR, EAL, "%s(): couldn't find suitable memseg_list\n",
- __func__);
- ret = -1;
- } else if (ret > 0) {
- ret = (int)wa.segs_allocated;
- }
-
-#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
- if (have_numa)
- restore_numa(&oldpolicy, oldmask);
-#endif
- return ret;
-}
-
-struct rte_memseg *
-eal_memalloc_alloc_seg(size_t page_sz, int socket)
-{
- struct rte_memseg *ms;
- if (eal_memalloc_alloc_seg_bulk(&ms, 1, page_sz, socket, true) < 0)
- return NULL;
- /* return pointer to newly allocated memseg */
- return ms;
-}
-
-int
-eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs)
-{
- int seg, ret = 0;
-
- /* dynamic free not supported in legacy mode */
- if (internal_config.legacy_mem)
- return -1;
-
- for (seg = 0; seg < n_segs; seg++) {
- struct rte_memseg *cur = ms[seg];
- struct hugepage_info *hi = NULL;
- struct free_walk_param wa;
- int i, walk_res;
-
- /* if this page is marked as unfreeable, fail */
- if (cur->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) {
- RTE_LOG(DEBUG, EAL, "Page is not allowed to be freed\n");
- ret = -1;
- continue;
- }
-
- memset(&wa, 0, sizeof(wa));
-
- for (i = 0; i < (int)RTE_DIM(internal_config.hugepage_info);
- i++) {
- hi = &internal_config.hugepage_info[i];
- if (cur->hugepage_sz == hi->hugepage_sz)
- break;
- }
- if (i == (int)RTE_DIM(internal_config.hugepage_info)) {
- RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
- ret = -1;
- continue;
- }
-
- wa.ms = cur;
- wa.hi = hi;
-
-		/* memalloc is locked, so it's safe to use thread-unsafe version */
- walk_res = rte_memseg_list_walk_thread_unsafe(free_seg_walk,
- &wa);
- if (walk_res == 1)
- continue;
- if (walk_res == 0)
- RTE_LOG(ERR, EAL, "Couldn't find memseg list\n");
- ret = -1;
- }
- return ret;
-}
-
-int
-eal_memalloc_free_seg(struct rte_memseg *ms)
-{
- /* dynamic free not supported in legacy mode */
- if (internal_config.legacy_mem)
- return -1;
-
- return eal_memalloc_free_seg_bulk(&ms, 1);
-}
-
-static int
-sync_chunk(struct rte_memseg_list *primary_msl,
- struct rte_memseg_list *local_msl, struct hugepage_info *hi,
- unsigned int msl_idx, bool used, int start, int end)
-{
- struct rte_fbarray *l_arr, *p_arr;
- int i, ret, chunk_len, diff_len;
-
- l_arr = &local_msl->memseg_arr;
- p_arr = &primary_msl->memseg_arr;
-
- /* we need to aggregate allocations/deallocations into bigger chunks,
- * as we don't want to spam the user with per-page callbacks.
- *
- * to avoid any potential issues, we also want to trigger
- * deallocation callbacks *before* we actually deallocate
- * memory, so that the user application could wrap up its use
- * before it goes away.
- */
-
- chunk_len = end - start;
-
- /* find how many contiguous pages we can map/unmap for this chunk */
- diff_len = used ?
- rte_fbarray_find_contig_free(l_arr, start) :
- rte_fbarray_find_contig_used(l_arr, start);
-
- /* has to be at least one page */
- if (diff_len < 1)
- return -1;
-
- diff_len = RTE_MIN(chunk_len, diff_len);
-
- /* if we are freeing memory, notify the application */
- if (!used) {
- struct rte_memseg *ms;
- void *start_va;
- size_t len, page_sz;
-
- ms = rte_fbarray_get(l_arr, start);
- start_va = ms->addr;
- page_sz = (size_t)primary_msl->page_sz;
- len = page_sz * diff_len;
-
- eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE,
- start_va, len);
- }
-
- for (i = 0; i < diff_len; i++) {
- struct rte_memseg *p_ms, *l_ms;
- int seg_idx = start + i;
-
- l_ms = rte_fbarray_get(l_arr, seg_idx);
- p_ms = rte_fbarray_get(p_arr, seg_idx);
-
- if (l_ms == NULL || p_ms == NULL)
- return -1;
-
- if (used) {
- ret = alloc_seg(l_ms, p_ms->addr,
- p_ms->socket_id, hi,
- msl_idx, seg_idx);
- if (ret < 0)
- return -1;
- rte_fbarray_set_used(l_arr, seg_idx);
- } else {
- ret = free_seg(l_ms, hi, msl_idx, seg_idx);
- rte_fbarray_set_free(l_arr, seg_idx);
- if (ret < 0)
- return -1;
- }
- }
-
- /* if we just allocated memory, notify the application */
- if (used) {
- struct rte_memseg *ms;
- void *start_va;
- size_t len, page_sz;
-
- ms = rte_fbarray_get(l_arr, start);
- start_va = ms->addr;
- page_sz = (size_t)primary_msl->page_sz;
- len = page_sz * diff_len;
-
- eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC,
- start_va, len);
- }
-
- /* calculate how much we can advance until next chunk */
- diff_len = used ?
- rte_fbarray_find_contig_used(l_arr, start) :
- rte_fbarray_find_contig_free(l_arr, start);
- ret = RTE_MIN(chunk_len, diff_len);
-
- return ret;
-}
-
-static int
-sync_status(struct rte_memseg_list *primary_msl,
- struct rte_memseg_list *local_msl, struct hugepage_info *hi,
- unsigned int msl_idx, bool used)
-{
- struct rte_fbarray *l_arr, *p_arr;
- int p_idx, l_chunk_len, p_chunk_len, ret;
- int start, end;
-
- /* this is a little bit tricky, but the basic idea is - walk both lists
- * and spot any places where there are discrepancies. walking both lists
- * and noting discrepancies in a single go is a hard problem, so we do
- * it in two passes - first we spot any places where allocated segments
- * mismatch (i.e. ensure that everything that's allocated in the primary
- * is also allocated in the secondary), and then we do it by looking at
- * free segments instead.
- *
- * we also need to aggregate changes into chunks, as we have to call
- * callbacks per allocation, not per page.
- */
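-	/* e.g. if the primary has a used run at indices [0..7] but the local
-	 * list only has [0..3] used, the loop below syncs indices 4..7 and
-	 * then resumes scanning from index 8.
-	 */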
- l_arr = &local_msl->memseg_arr;
- p_arr = &primary_msl->memseg_arr;
-
- if (used)
- p_idx = rte_fbarray_find_next_used(p_arr, 0);
- else
- p_idx = rte_fbarray_find_next_free(p_arr, 0);
-
- while (p_idx >= 0) {
- int next_chunk_search_idx;
-
- if (used) {
- p_chunk_len = rte_fbarray_find_contig_used(p_arr,
- p_idx);
- l_chunk_len = rte_fbarray_find_contig_used(l_arr,
- p_idx);
- } else {
- p_chunk_len = rte_fbarray_find_contig_free(p_arr,
- p_idx);
- l_chunk_len = rte_fbarray_find_contig_free(l_arr,
- p_idx);
- }
- /* best case scenario - no differences (or bigger, which will be
- * fixed during next iteration), look for next chunk
- */
- if (l_chunk_len >= p_chunk_len) {
- next_chunk_search_idx = p_idx + p_chunk_len;
- goto next_chunk;
- }
-
- /* if both chunks start at the same point, skip parts we know
- * are identical, and sync the rest. each call to sync_chunk
- * will only sync contiguous segments, so we need to call this
- * until we are sure there are no more differences in this
- * chunk.
- */
- start = p_idx + l_chunk_len;
- end = p_idx + p_chunk_len;
- do {
- ret = sync_chunk(primary_msl, local_msl, hi, msl_idx,
- used, start, end);
- start += ret;
- } while (start < end && ret >= 0);
- /* if ret is negative, something went wrong */
- if (ret < 0)
- return -1;
-
- next_chunk_search_idx = p_idx + p_chunk_len;
-next_chunk:
- /* skip to end of this chunk */
- if (used) {
- p_idx = rte_fbarray_find_next_used(p_arr,
- next_chunk_search_idx);
- } else {
- p_idx = rte_fbarray_find_next_free(p_arr,
- next_chunk_search_idx);
- }
- }
- return 0;
-}
-
-static int
-sync_existing(struct rte_memseg_list *primary_msl,
- struct rte_memseg_list *local_msl, struct hugepage_info *hi,
- unsigned int msl_idx)
-{
- int ret, dir_fd;
-
- /* do not allow any page allocations during the time we're allocating,
- * because file creation and locking operations are not atomic,
- * and we might be the first or the last ones to use a particular page,
- * so we need to ensure atomicity of every operation.
- */
- dir_fd = open(hi->hugedir, O_RDONLY);
- if (dir_fd < 0) {
- RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", __func__,
- hi->hugedir, strerror(errno));
- return -1;
- }
- /* blocking writelock */
- if (flock(dir_fd, LOCK_EX)) {
- RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", __func__,
- hi->hugedir, strerror(errno));
- close(dir_fd);
- return -1;
- }
-
- /* ensure all allocated space is the same in both lists */
- ret = sync_status(primary_msl, local_msl, hi, msl_idx, true);
- if (ret < 0)
- goto fail;
-
- /* ensure all unallocated space is the same in both lists */
- ret = sync_status(primary_msl, local_msl, hi, msl_idx, false);
- if (ret < 0)
- goto fail;
-
- /* update version number */
- local_msl->version = primary_msl->version;
-
- close(dir_fd);
-
- return 0;
-fail:
- close(dir_fd);
- return -1;
-}
-
-static int
-sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused)
-{
- struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- struct rte_memseg_list *primary_msl, *local_msl;
- struct hugepage_info *hi = NULL;
- unsigned int i;
- int msl_idx;
-
- if (msl->external)
- return 0;
-
- msl_idx = msl - mcfg->memsegs;
- primary_msl = &mcfg->memsegs[msl_idx];
- local_msl = &local_memsegs[msl_idx];
-
- for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) {
- uint64_t cur_sz =
- internal_config.hugepage_info[i].hugepage_sz;
- uint64_t msl_sz = primary_msl->page_sz;
- if (msl_sz == cur_sz) {
- hi = &internal_config.hugepage_info[i];
- break;
- }
- }
- if (!hi) {
- RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
- return -1;
- }
-
- /* if versions don't match, synchronize everything */
- if (local_msl->version != primary_msl->version &&
- sync_existing(primary_msl, local_msl, hi, msl_idx))
- return -1;
- return 0;
-}
-
-int
-eal_memalloc_sync_with_primary(void)
-{
- /* nothing to be done in primary */
- if (rte_eal_process_type() == RTE_PROC_PRIMARY)
- return 0;
-
- /* memalloc is locked, so it's safe to call thread-unsafe version */
- if (rte_memseg_list_walk_thread_unsafe(sync_walk, NULL))
- return -1;
- return 0;
-}
-
-static int
-secondary_msl_create_walk(const struct rte_memseg_list *msl,
- void *arg __rte_unused)
-{
- struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- struct rte_memseg_list *primary_msl, *local_msl;
- char name[PATH_MAX];
- int msl_idx, ret;
-
- if (msl->external)
- return 0;
-
- msl_idx = msl - mcfg->memsegs;
- primary_msl = &mcfg->memsegs[msl_idx];
- local_msl = &local_memsegs[msl_idx];
-
- /* create distinct fbarrays for each secondary */
- snprintf(name, RTE_FBARRAY_NAME_LEN, "%s_%i",
- primary_msl->memseg_arr.name, getpid());
-
- ret = rte_fbarray_init(&local_msl->memseg_arr, name,
- primary_msl->memseg_arr.len,
- primary_msl->memseg_arr.elt_sz);
- if (ret < 0) {
- RTE_LOG(ERR, EAL, "Cannot initialize local memory map\n");
- return -1;
- }
- local_msl->base_va = primary_msl->base_va;
- local_msl->len = primary_msl->len;
-
- return 0;
-}
-
-static int
-alloc_list(int list_idx, int len)
-{
- int *data;
- int i;
-
- /* single-file segments mode does not need fd list */
- if (!internal_config.single_file_segments) {
- /* ensure we have space to store fd per each possible segment */
- data = malloc(sizeof(int) * len);
- if (data == NULL) {
- RTE_LOG(ERR, EAL, "Unable to allocate space for file descriptors\n");
- return -1;
- }
- /* set all fd's as invalid */
- for (i = 0; i < len; i++)
- data[i] = -1;
- fd_list[list_idx].fds = data;
- fd_list[list_idx].len = len;
- } else {
- fd_list[list_idx].fds = NULL;
- fd_list[list_idx].len = 0;
- }
-
- fd_list[list_idx].count = 0;
- fd_list[list_idx].memseg_list_fd = -1;
-
- return 0;
-}
-
-static int
-fd_list_create_walk(const struct rte_memseg_list *msl,
- void *arg __rte_unused)
-{
- struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- unsigned int len;
- int msl_idx;
-
- if (msl->external)
- return 0;
-
- msl_idx = msl - mcfg->memsegs;
- len = msl->memseg_arr.len;
-
- return alloc_list(msl_idx, len);
-}
-
-int
-eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd)
-{
- struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
-
- /* single file segments mode doesn't support individual segment fd's */
- if (internal_config.single_file_segments)
- return -ENOTSUP;
-
- /* if list is not allocated, allocate it */
- if (fd_list[list_idx].len == 0) {
- int len = mcfg->memsegs[list_idx].memseg_arr.len;
-
- if (alloc_list(list_idx, len) < 0)
- return -ENOMEM;
- }
- fd_list[list_idx].fds[seg_idx] = fd;
-
- return 0;
-}
-
-int
-eal_memalloc_set_seg_list_fd(int list_idx, int fd)
-{
- /* non-single file segment mode doesn't support segment list fd's */
- if (!internal_config.single_file_segments)
- return -ENOTSUP;
-
- fd_list[list_idx].memseg_list_fd = fd;
-
- return 0;
-}
-
-int
-eal_memalloc_get_seg_fd(int list_idx, int seg_idx)
-{
- int fd;
-
- if (internal_config.in_memory || internal_config.no_hugetlbfs) {
-#ifndef MEMFD_SUPPORTED
- /* in in-memory or no-huge mode, we rely on memfd support */
- return -ENOTSUP;
-#endif
- /* memfd supported, but hugetlbfs memfd may not be */
- if (!internal_config.no_hugetlbfs && !memfd_create_supported)
- return -ENOTSUP;
- }
-
- if (internal_config.single_file_segments) {
- fd = fd_list[list_idx].memseg_list_fd;
- } else if (fd_list[list_idx].len == 0) {
- /* list not initialized */
- fd = -1;
- } else {
- fd = fd_list[list_idx].fds[seg_idx];
- }
- if (fd < 0)
- return -ENODEV;
- return fd;
-}
-
-static int
-test_memfd_create(void)
-{
-#ifdef MEMFD_SUPPORTED
- unsigned int i;
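-	/* every branch inside the loop below returns, so in practice only
-	 * the first configured hugepage size is probed.
-	 */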
- for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
- uint64_t pagesz = internal_config.hugepage_info[i].hugepage_sz;
- int pagesz_flag = pagesz_flags(pagesz);
- int flags;
-
- flags = pagesz_flag | RTE_MFD_HUGETLB;
- int fd = memfd_create("test", flags);
- if (fd < 0) {
- /* we failed - let memalloc know this isn't working */
- if (errno == EINVAL) {
- memfd_create_supported = 0;
- return 0; /* not supported */
- }
-
-			/* we got some other error - something's wrong */
- return -1; /* error */
- }
- close(fd);
- return 1; /* supported */
- }
-#endif
- return 0; /* not supported */
-}
-
-int
-eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset)
-{
- struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
-
- if (internal_config.in_memory || internal_config.no_hugetlbfs) {
-#ifndef MEMFD_SUPPORTED
- /* in in-memory or no-huge mode, we rely on memfd support */
- return -ENOTSUP;
-#endif
- /* memfd supported, but hugetlbfs memfd may not be */
- if (!internal_config.no_hugetlbfs && !memfd_create_supported)
- return -ENOTSUP;
- }
-
- if (internal_config.single_file_segments) {
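-		/* in single-file segments mode the fd covers the entire list,
-		 * so callers also need the segment's byte offset in the file.
-		 */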
- size_t pgsz = mcfg->memsegs[list_idx].page_sz;
-
- /* segment not active? */
- if (fd_list[list_idx].memseg_list_fd < 0)
- return -ENOENT;
- *offset = pgsz * seg_idx;
- } else {
- /* fd_list not initialized? */
- if (fd_list[list_idx].len == 0)
- return -ENODEV;
-
- /* segment not active? */
- if (fd_list[list_idx].fds[seg_idx] < 0)
- return -ENOENT;
- *offset = 0;
- }
- return 0;
-}
-
-int
-eal_memalloc_init(void)
-{
- if (rte_eal_process_type() == RTE_PROC_SECONDARY)
- if (rte_memseg_list_walk(secondary_msl_create_walk, NULL) < 0)
- return -1;
- if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
- internal_config.in_memory) {
- int mfd_res = test_memfd_create();
-
- if (mfd_res < 0) {
- RTE_LOG(ERR, EAL, "Unable to check if memfd is supported\n");
- return -1;
- }
- if (mfd_res == 1)
- RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n");
- else
- RTE_LOG(INFO, EAL, "Using memfd is not supported, falling back to anonymous hugepages\n");
-
-		/* in in-memory mode, single-file segments mode is only
-		 * supported if hugetlbfs works with memfd_create; the check
-		 * above has already determined whether it does.
-		 */
- if (internal_config.single_file_segments &&
- mfd_res != 1) {
- RTE_LOG(ERR, EAL, "Single-file segments mode cannot be used without memfd support\n");
- return -1;
- }
- /* this cannot ever happen but better safe than sorry */
- if (!anonymous_hugepages_supported) {
- RTE_LOG(ERR, EAL, "Using anonymous memory is not supported\n");
- return -1;
- }
- }
-
- /* initialize all of the fd lists */
- if (rte_memseg_list_walk(fd_list_create_walk, NULL))
- return -1;
- return 0;
-}
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation.
- * Copyright(c) 2013 6WIND S.A.
- */
-
-#include <errno.h>
-#include <fcntl.h>
-#include <stdarg.h>
-#include <stdbool.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <inttypes.h>
-#include <string.h>
-#include <sys/mman.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/queue.h>
-#include <sys/file.h>
-#include <sys/resource.h>
-#include <unistd.h>
-#include <limits.h>
-#include <sys/ioctl.h>
-#include <sys/time.h>
-#include <signal.h>
-#include <setjmp.h>
-#ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */
-#include <linux/memfd.h>
-#define MEMFD_SUPPORTED
-#endif
-#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
-#include <numa.h>
-#include <numaif.h>
-#endif
-
-#include <rte_errno.h>
-#include <rte_log.h>
-#include <rte_memory.h>
-#include <rte_launch.h>
-#include <rte_eal.h>
-#include <rte_per_lcore.h>
-#include <rte_lcore.h>
-#include <rte_common.h>
-#include <rte_string_fns.h>
-
-#include "eal_private.h"
-#include "eal_memalloc.h"
-#include "eal_memcfg.h"
-#include "eal_internal_cfg.h"
-#include "eal_filesystem.h"
-#include "eal_hugepages.h"
-#include "eal_options.h"
-
-#define PFN_MASK_SIZE 8
-
-/**
- * @file
- * Huge page mapping under linux
- *
- * To reserve a large contiguous amount of memory, we use the hugepage
- * feature of Linux, which requires hugetlbfs to be mounted. This code
- * creates many files in the hugetlbfs mount directory (one per page) and
- * maps them into virtual memory. For each page, we retrieve its
- * physical address and remap it in order to obtain a virtually
- * contiguous zone as well as a physically contiguous zone.
- */
-
-static int phys_addrs_available = -1;
-
-#define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space"
-
-uint64_t eal_get_baseaddr(void)
-{
- /*
-	 * The Linux kernel uses a fairly high address as the starting address
-	 * for serving mmap calls. If there are addressing limitations and the
-	 * IOVA mode is VA, this starting address is likely too high for such
-	 * devices. However, it is possible to use a lower address in the
- * process virtual address space as with 64 bits there is a lot of
- * available space.
- *
- * Current known limitations are 39 or 40 bits. Setting the starting
- * address at 4GB implies there are 508GB or 1020GB for mapping the
- * available hugepages. This is likely enough for most systems, although
- * a device with addressing limitations should call
- * rte_mem_check_dma_mask for ensuring all memory is within supported
- * range.
- */
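-	/*
-	 * For example, a driver for a device limited to 39-bit addressing
-	 * could verify reachability with rte_mem_check_dma_mask(39), which
-	 * returns 0 only when all allocated memory fits within the mask.
-	 */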
- return 0x100000000ULL;
-}
-
-/*
- * Get physical address of any mapped virtual address in the current process.
- */
-phys_addr_t
-rte_mem_virt2phy(const void *virtaddr)
-{
- int fd, retval;
- uint64_t page, physaddr;
- unsigned long virt_pfn;
- int page_size;
- off_t offset;
-
- if (phys_addrs_available == 0)
- return RTE_BAD_IOVA;
-
- /* standard page size */
- page_size = getpagesize();
-
- fd = open("/proc/self/pagemap", O_RDONLY);
- if (fd < 0) {
- RTE_LOG(INFO, EAL, "%s(): cannot open /proc/self/pagemap: %s\n",
- __func__, strerror(errno));
- return RTE_BAD_IOVA;
- }
-
- virt_pfn = (unsigned long)virtaddr / page_size;
- offset = sizeof(uint64_t) * virt_pfn;
- if (lseek(fd, offset, SEEK_SET) == (off_t) -1) {
- RTE_LOG(INFO, EAL, "%s(): seek error in /proc/self/pagemap: %s\n",
- __func__, strerror(errno));
- close(fd);
- return RTE_BAD_IOVA;
- }
-
- retval = read(fd, &page, PFN_MASK_SIZE);
- close(fd);
- if (retval < 0) {
- RTE_LOG(INFO, EAL, "%s(): cannot read /proc/self/pagemap: %s\n",
- __func__, strerror(errno));
- return RTE_BAD_IOVA;
- } else if (retval != PFN_MASK_SIZE) {
- RTE_LOG(INFO, EAL, "%s(): read %d bytes from /proc/self/pagemap "
- "but expected %d:\n",
- __func__, retval, PFN_MASK_SIZE);
- return RTE_BAD_IOVA;
- }
-
- /*
-	 * the pfn (page frame number) is stored in bits 0-54 (see
-	 * pagemap.txt in the Linux kernel documentation)
- */
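-	/* worked example, assuming 4K pages: virtaddr 0x7f3a2b4c gives
-	 * virt_pfn 0x7f3a2 and in-page offset 0xb4c; if the pagemap entry
-	 * holds PFN 0x123456, then physaddr = 0x123456 * 4096 + 0xb4c
-	 * = 0x123456b4c.
-	 */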
- if ((page & 0x7fffffffffffffULL) == 0)
- return RTE_BAD_IOVA;
-
- physaddr = ((page & 0x7fffffffffffffULL) * page_size)
- + ((unsigned long)virtaddr % page_size);
-
- return physaddr;
-}
-
-rte_iova_t
-rte_mem_virt2iova(const void *virtaddr)
-{
- if (rte_eal_iova_mode() == RTE_IOVA_VA)
- return (uintptr_t)virtaddr;
- return rte_mem_virt2phy(virtaddr);
-}
-
-/*
- * For each hugepage in hugepg_tbl, fill the physaddr value. We find
- * it by browsing the /proc/self/pagemap special file.
- */
-static int
-find_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
-{
- unsigned int i;
- phys_addr_t addr;
-
- for (i = 0; i < hpi->num_pages[0]; i++) {
- addr = rte_mem_virt2phy(hugepg_tbl[i].orig_va);
- if (addr == RTE_BAD_PHYS_ADDR)
- return -1;
- hugepg_tbl[i].physaddr = addr;
- }
- return 0;
-}
-
-/*
- * For each hugepage in hugepg_tbl, fill the physaddr value sequentially.
- */
-static int
-set_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
-{
- unsigned int i;
- static phys_addr_t addr;
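-	/* note: addr is static, so the fake physical addresses keep
-	 * increasing across calls for successive hugepage sizes.
-	 */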
-
- for (i = 0; i < hpi->num_pages[0]; i++) {
- hugepg_tbl[i].physaddr = addr;
- addr += hugepg_tbl[i].size;
- }
- return 0;
-}
-
-/*
- * Check whether address-space layout randomization is enabled in
- * the kernel. This is important for multi-process, as it can prevent
- * two processes from mapping data to the same virtual address.
- * Returns:
- * 0 - address space randomization disabled
- * 1/2 - address space randomization enabled
- * negative error code on error
- */
-static int
-aslr_enabled(void)
-{
- char c;
- int retval, fd = open(RANDOMIZE_VA_SPACE_FILE, O_RDONLY);
- if (fd < 0)
- return -errno;
- retval = read(fd, &c, 1);
- close(fd);
- if (retval < 0)
- return -errno;
- if (retval == 0)
- return -EIO;
- switch (c) {
- case '0' : return 0;
- case '1' : return 1;
- case '2' : return 2;
- default: return -EINVAL;
- }
-}
-
-static sigjmp_buf huge_jmpenv;
-
-static void huge_sigbus_handler(int signo __rte_unused)
-{
- siglongjmp(huge_jmpenv, 1);
-}
-
-/* Put sigsetjmp into a wrapper function to avoid a compile error: any non-volatile,
- * non-static local variable in the stack frame calling sigsetjmp might be
- * clobbered by a call to longjmp.
- */
-static int huge_wrap_sigsetjmp(void)
-{
- return sigsetjmp(huge_jmpenv, 1);
-}
-
-#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
-/* Callback for numa library. */
-void numa_error(char *where)
-{
- RTE_LOG(ERR, EAL, "%s failed: %s\n", where, strerror(errno));
-}
-#endif
-
-/*
- * Mmap all hugepages of the hugepage table: first open a file in
- * hugetlbfs, then mmap() hugepage_sz bytes of it. The virtual address is
- * stored in hugepg_tbl[i].orig_va; the pages are remapped into their
- * final, physically contiguous VA layout later (see remap_segment()).
- */
-static unsigned
-map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
- uint64_t *essential_memory __rte_unused)
-{
- int fd;
- unsigned i;
- void *virtaddr;
-#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
- int node_id = -1;
- int essential_prev = 0;
- int oldpolicy;
- struct bitmask *oldmask = NULL;
- bool have_numa = true;
- unsigned long maxnode = 0;
-
- /* Check if kernel supports NUMA. */
- if (numa_available() != 0) {
- RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n");
- have_numa = false;
- }
-
- if (have_numa) {
- RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
- oldmask = numa_allocate_nodemask();
- if (get_mempolicy(&oldpolicy, oldmask->maskp,
- oldmask->size + 1, 0, 0) < 0) {
- RTE_LOG(ERR, EAL,
- "Failed to get current mempolicy: %s. "
- "Assuming MPOL_DEFAULT.\n", strerror(errno));
- oldpolicy = MPOL_DEFAULT;
- }
- for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
- if (internal_config.socket_mem[i])
- maxnode = i + 1;
- }
-#endif
-
- for (i = 0; i < hpi->num_pages[0]; i++) {
- struct hugepage_file *hf = &hugepg_tbl[i];
- uint64_t hugepage_sz = hpi->hugepage_sz;
-
-#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
- if (maxnode) {
- unsigned int j;
-
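-			/* pick a node that still needs its requested
-			 * ("essential") memory; once all requests are met,
-			 * round-robin across the requested nodes.
-			 */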
- for (j = 0; j < maxnode; j++)
- if (essential_memory[j])
- break;
-
- if (j == maxnode) {
- node_id = (node_id + 1) % maxnode;
- while (!internal_config.socket_mem[node_id]) {
- node_id++;
- node_id %= maxnode;
- }
- essential_prev = 0;
- } else {
- node_id = j;
- essential_prev = essential_memory[j];
-
- if (essential_memory[j] < hugepage_sz)
- essential_memory[j] = 0;
- else
- essential_memory[j] -= hugepage_sz;
- }
-
- RTE_LOG(DEBUG, EAL,
- "Setting policy MPOL_PREFERRED for socket %d\n",
- node_id);
- numa_set_preferred(node_id);
- }
-#endif
-
- hf->file_id = i;
- hf->size = hugepage_sz;
- eal_get_hugefile_path(hf->filepath, sizeof(hf->filepath),
- hpi->hugedir, hf->file_id);
- hf->filepath[sizeof(hf->filepath) - 1] = '\0';
-
- /* try to create hugepage file */
- fd = open(hf->filepath, O_CREAT | O_RDWR, 0600);
- if (fd < 0) {
- RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
- strerror(errno));
- goto out;
- }
-
- /* map the segment, and populate page tables,
- * the kernel fills this segment with zeros. we don't care where
- * this gets mapped - we already have contiguous memory areas
- * ready for us to map into.
- */
- virtaddr = mmap(NULL, hugepage_sz, PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_POPULATE, fd, 0);
- if (virtaddr == MAP_FAILED) {
- RTE_LOG(DEBUG, EAL, "%s(): mmap failed: %s\n", __func__,
- strerror(errno));
- close(fd);
- goto out;
- }
-
- hf->orig_va = virtaddr;
-
-		/* In Linux, hugetlb limits (e.g. from cgroups) are
-		 * enforced at fault time rather than at mmap() time,
-		 * even with MAP_POPULATE. The kernel then sends a
-		 * SIGBUS signal. To avoid being killed, save the stack
-		 * environment here; if SIGBUS happens, we can jump
-		 * back to it.
- */
- if (huge_wrap_sigsetjmp()) {
- RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more "
- "hugepages of size %u MB\n",
- (unsigned int)(hugepage_sz / 0x100000));
- munmap(virtaddr, hugepage_sz);
- close(fd);
- unlink(hugepg_tbl[i].filepath);
-#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
- if (maxnode)
- essential_memory[node_id] =
- essential_prev;
-#endif
- goto out;
- }
- *(int *)virtaddr = 0;
-
- /* set shared lock on the file. */
- if (flock(fd, LOCK_SH) < 0) {
- RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n",
- __func__, strerror(errno));
- close(fd);
- goto out;
- }
-
- close(fd);
- }
-
-out:
-#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
- if (maxnode) {
- RTE_LOG(DEBUG, EAL,
- "Restoring previous memory policy: %d\n", oldpolicy);
- if (oldpolicy == MPOL_DEFAULT) {
- numa_set_localalloc();
- } else if (set_mempolicy(oldpolicy, oldmask->maskp,
- oldmask->size + 1) < 0) {
- RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n",
- strerror(errno));
- numa_set_localalloc();
- }
- }
- if (oldmask != NULL)
- numa_free_cpumask(oldmask);
-#endif
- return i;
-}
-
-/*
- * Parse /proc/self/numa_maps to get the NUMA socket ID for each huge
- * page.
- */
-static int
-find_numasocket(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
-{
- int socket_id;
- char *end, *nodestr;
- unsigned i, hp_count = 0;
- uint64_t virt_addr;
- char buf[BUFSIZ];
- char hugedir_str[PATH_MAX];
- FILE *f;
-
- f = fopen("/proc/self/numa_maps", "r");
- if (f == NULL) {
- RTE_LOG(NOTICE, EAL, "NUMA support not available"
- " consider that all memory is in socket_id 0\n");
- return 0;
- }
-
- snprintf(hugedir_str, sizeof(hugedir_str),
- "%s/%s", hpi->hugedir, eal_get_hugefile_prefix());
-
- /* parse numa map */
- while (fgets(buf, sizeof(buf), f) != NULL) {
-
-		/* ignore mappings that are not hugepages */
- if (strstr(buf, " huge ") == NULL &&
- strstr(buf, hugedir_str) == NULL)
- continue;
-
- /* get zone addr */
- virt_addr = strtoull(buf, &end, 16);
- if (virt_addr == 0 || end == buf) {
- RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
- goto error;
- }
-
- /* get node id (socket id) */
- nodestr = strstr(buf, " N");
- if (nodestr == NULL) {
- RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
- goto error;
- }
- nodestr += 2;
- end = strstr(nodestr, "=");
- if (end == NULL) {
- RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
- goto error;
- }
- end[0] = '\0';
- end = NULL;
-
- socket_id = strtoul(nodestr, &end, 0);
- if ((nodestr[0] == '\0') || (end == NULL) || (*end != '\0')) {
- RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
- goto error;
- }
-
- /* if we find this page in our mappings, set socket_id */
- for (i = 0; i < hpi->num_pages[0]; i++) {
- void *va = (void *)(unsigned long)virt_addr;
- if (hugepg_tbl[i].orig_va == va) {
- hugepg_tbl[i].socket_id = socket_id;
- hp_count++;
-#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
- RTE_LOG(DEBUG, EAL,
- "Hugepage %s is on socket %d\n",
- hugepg_tbl[i].filepath, socket_id);
-#endif
- }
- }
- }
-
- if (hp_count < hpi->num_pages[0])
- goto error;
-
- fclose(f);
- return 0;
-
-error:
- fclose(f);
- return -1;
-}
-
-static int
-cmp_physaddr(const void *a, const void *b)
-{
-#ifndef RTE_ARCH_PPC_64
- const struct hugepage_file *p1 = a;
- const struct hugepage_file *p2 = b;
-#else
- /* PowerPC needs memory sorted in reverse order from x86 */
- const struct hugepage_file *p1 = b;
- const struct hugepage_file *p2 = a;
-#endif
- if (p1->physaddr < p2->physaddr)
- return -1;
- else if (p1->physaddr > p2->physaddr)
- return 1;
- else
- return 0;
-}
-
-/*
- * Uses mmap to create a shared memory area for storage of data.
- * Used in this file to store the hugepage file map on disk.
- */
-static void *
-create_shared_memory(const char *filename, const size_t mem_size)
-{
- void *retval;
- int fd;
-
- /* if no shared files mode is used, create anonymous memory instead */
- if (internal_config.no_shconf) {
- retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE,
- MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
- if (retval == MAP_FAILED)
- return NULL;
- return retval;
- }
-
- fd = open(filename, O_CREAT | O_RDWR, 0600);
- if (fd < 0)
- return NULL;
- if (ftruncate(fd, mem_size) < 0) {
- close(fd);
- return NULL;
- }
- retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
- close(fd);
- if (retval == MAP_FAILED)
- return NULL;
- return retval;
-}
-
-/*
- * this copies *active* hugepages from one hugepage table to another.
- * destination is typically the shared memory.
- */
-static int
-copy_hugepages_to_shared_mem(struct hugepage_file *dst, int dest_size,
-	const struct hugepage_file *src, int src_size)
-{
- int src_pos, dst_pos = 0;
-
- for (src_pos = 0; src_pos < src_size; src_pos++) {
- if (src[src_pos].orig_va != NULL) {
- /* error on overflow attempt */
- if (dst_pos == dest_size)
- return -1;
- memcpy(&dst[dst_pos], &src[src_pos], sizeof(struct hugepage_file));
- dst_pos++;
- }
- }
- return 0;
-}
-
-static int
-unlink_hugepage_files(struct hugepage_file *hugepg_tbl,
- unsigned num_hp_info)
-{
- unsigned socket, size;
- int page, nrpages = 0;
-
- /* get total number of hugepages */
- for (size = 0; size < num_hp_info; size++)
- for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++)
- nrpages +=
- internal_config.hugepage_info[size].num_pages[socket];
-
- for (page = 0; page < nrpages; page++) {
- struct hugepage_file *hp = &hugepg_tbl[page];
-
- if (hp->orig_va != NULL && unlink(hp->filepath)) {
- RTE_LOG(WARNING, EAL, "%s(): Removing %s failed: %s\n",
- __func__, hp->filepath, strerror(errno));
- }
- }
- return 0;
-}
-
-/*
- * unmaps hugepages that are not going to be used. since we originally allocate
- * ALL hugepages (not just those we need), additional unmapping needs to be done.
- */
-static int
-unmap_unneeded_hugepages(struct hugepage_file *hugepg_tbl,
- struct hugepage_info *hpi,
- unsigned num_hp_info)
-{
- unsigned socket, size;
- int page, nrpages = 0;
-
- /* get total number of hugepages */
- for (size = 0; size < num_hp_info; size++)
- for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++)
- nrpages += internal_config.hugepage_info[size].num_pages[socket];
-
- for (size = 0; size < num_hp_info; size++) {
- for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) {
- unsigned pages_found = 0;
-
- /* traverse until we have unmapped all the unused pages */
- for (page = 0; page < nrpages; page++) {
- struct hugepage_file *hp = &hugepg_tbl[page];
-
- /* find a page that matches the criteria */
- if ((hp->size == hpi[size].hugepage_sz) &&
- (hp->socket_id == (int) socket)) {
-
- /* if we skipped enough pages, unmap the rest */
- if (pages_found == hpi[size].num_pages[socket]) {
- uint64_t unmap_len;
-
- unmap_len = hp->size;
-
-					/* unmap the page and remove its backing file */
- munmap(hp->orig_va,
- (size_t)unmap_len);
-
- hp->orig_va = NULL;
- if (unlink(hp->filepath) == -1) {
- RTE_LOG(ERR, EAL, "%s(): Removing %s failed: %s\n",
- __func__, hp->filepath, strerror(errno));
- return -1;
- }
- } else {
-					/* this page is needed - keep it and skip */
- pages_found++;
- }
-
- } /* match page */
- } /* foreach page */
- } /* foreach socket */
- } /* foreach pagesize */
-
- return 0;
-}
-
-static int
-remap_segment(struct hugepage_file *hugepages, int seg_start, int seg_end)
-{
- struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- struct rte_memseg_list *msl;
- struct rte_fbarray *arr;
- int cur_page, seg_len;
- unsigned int msl_idx;
- int ms_idx;
- uint64_t page_sz;
- size_t memseg_len;
- int socket_id;
-
- page_sz = hugepages[seg_start].size;
- socket_id = hugepages[seg_start].socket_id;
- seg_len = seg_end - seg_start;
-
- RTE_LOG(DEBUG, EAL, "Attempting to map %" PRIu64 "M on socket %i\n",
- (seg_len * page_sz) >> 20ULL, socket_id);
-
- /* find free space in memseg lists */
- for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
- bool empty;
- msl = &mcfg->memsegs[msl_idx];
- arr = &msl->memseg_arr;
-
- if (msl->page_sz != page_sz)
- continue;
- if (msl->socket_id != socket_id)
- continue;
-
- /* leave space for a hole if array is not empty */
- empty = arr->count == 0;
- ms_idx = rte_fbarray_find_next_n_free(arr, 0,
- seg_len + (empty ? 0 : 1));
-
- /* memseg list is full? */
- if (ms_idx < 0)
- continue;
-
- /* leave some space between memsegs, they are not IOVA
- * contiguous, so they shouldn't be VA contiguous either.
- */
- if (!empty)
- ms_idx++;
- break;
- }
- if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
- RTE_LOG(ERR, EAL, "Could not find space for memseg. Please increase %s and/or %s in configuration.\n",
- RTE_STR(CONFIG_RTE_MAX_MEMSEG_PER_TYPE),
- RTE_STR(CONFIG_RTE_MAX_MEM_PER_TYPE));
- return -1;
- }
-
-#ifdef RTE_ARCH_PPC_64
- /* for PPC64 we go through the list backwards */
- for (cur_page = seg_end - 1; cur_page >= seg_start;
- cur_page--, ms_idx++) {
-#else
- for (cur_page = seg_start; cur_page < seg_end; cur_page++, ms_idx++) {
-#endif
- struct hugepage_file *hfile = &hugepages[cur_page];
- struct rte_memseg *ms = rte_fbarray_get(arr, ms_idx);
- void *addr;
- int fd;
-
- fd = open(hfile->filepath, O_RDWR);
- if (fd < 0) {
- RTE_LOG(ERR, EAL, "Could not open '%s': %s\n",
- hfile->filepath, strerror(errno));
- return -1;
- }
- /* set shared lock on the file. */
- if (flock(fd, LOCK_SH) < 0) {
- RTE_LOG(DEBUG, EAL, "Could not lock '%s': %s\n",
- hfile->filepath, strerror(errno));
- close(fd);
- return -1;
- }
- memseg_len = (size_t)page_sz;
- addr = RTE_PTR_ADD(msl->base_va, ms_idx * memseg_len);
-
- /* we know this address is already mmapped by memseg list, so
- * using MAP_FIXED here is safe
- */
- addr = mmap(addr, page_sz, PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, 0);
- if (addr == MAP_FAILED) {
- RTE_LOG(ERR, EAL, "Couldn't remap '%s': %s\n",
- hfile->filepath, strerror(errno));
- close(fd);
- return -1;
- }
-
- /* we have a new address, so unmap previous one */
-#ifndef RTE_ARCH_64
- /* in 32-bit legacy mode, we have already unmapped the page */
- if (!internal_config.legacy_mem)
- munmap(hfile->orig_va, page_sz);
-#else
- munmap(hfile->orig_va, page_sz);
-#endif
-
- hfile->orig_va = NULL;
- hfile->final_va = addr;
-
- /* rewrite physical addresses in IOVA as VA mode */
- if (rte_eal_iova_mode() == RTE_IOVA_VA)
- hfile->physaddr = (uintptr_t)addr;
-
- /* set up memseg data */
- ms->addr = addr;
- ms->hugepage_sz = page_sz;
- ms->len = memseg_len;
- ms->iova = hfile->physaddr;
- ms->socket_id = hfile->socket_id;
- ms->nchannel = rte_memory_get_nchannel();
- ms->nrank = rte_memory_get_nrank();
-
- rte_fbarray_set_used(arr, ms_idx);
-
- /* store segment fd internally */
- if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0)
- RTE_LOG(ERR, EAL, "Could not store segment fd: %s\n",
- rte_strerror(rte_errno));
- }
- RTE_LOG(DEBUG, EAL, "Allocated %" PRIu64 "M on socket %i\n",
- (seg_len * page_sz) >> 20, socket_id);
- return 0;
-}
-
-static uint64_t
-get_mem_amount(uint64_t page_sz, uint64_t max_mem)
-{
- uint64_t area_sz, max_pages;
-
- /* limit to RTE_MAX_MEMSEG_PER_LIST pages or RTE_MAX_MEM_MB_PER_LIST */
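-	/* e.g. assuming the default config of 8192 segments and 32768 MB per
-	 * list, 2 MB pages yield min(8192 * 2 MB, 32 GB) = 16 GB per list.
-	 */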
- max_pages = RTE_MAX_MEMSEG_PER_LIST;
- max_mem = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20, max_mem);
-
- area_sz = RTE_MIN(page_sz * max_pages, max_mem);
-
- /* make sure the list isn't smaller than the page size */
- area_sz = RTE_MAX(area_sz, page_sz);
-
- return RTE_ALIGN(area_sz, page_sz);
-}
-
-static int
-free_memseg_list(struct rte_memseg_list *msl)
-{
- if (rte_fbarray_destroy(&msl->memseg_arr)) {
- RTE_LOG(ERR, EAL, "Cannot destroy memseg list\n");
- return -1;
- }
- memset(msl, 0, sizeof(*msl));
- return 0;
-}
-
-#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i"
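-/* e.g. the first list of 2 MB pages on socket 0 is named "memseg-2048k-0-0" */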
-static int
-alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz,
- int n_segs, int socket_id, int type_msl_idx)
-{
- char name[RTE_FBARRAY_NAME_LEN];
-
- snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id,
- type_msl_idx);
- if (rte_fbarray_init(&msl->memseg_arr, name, n_segs,
- sizeof(struct rte_memseg))) {
- RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n",
- rte_strerror(rte_errno));
- return -1;
- }
-
- msl->page_sz = page_sz;
- msl->socket_id = socket_id;
- msl->base_va = NULL;
- msl->heap = 1; /* mark it as a heap segment */
-
- RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n",
- (size_t)page_sz >> 10, socket_id);
-
- return 0;
-}
-
-static int
-alloc_va_space(struct rte_memseg_list *msl)
-{
- uint64_t page_sz;
- size_t mem_sz;
- void *addr;
- int flags = 0;
-
- page_sz = msl->page_sz;
- mem_sz = page_sz * msl->memseg_arr.len;
-
- addr = eal_get_virtual_area(msl->base_va, &mem_sz, page_sz, 0, flags);
- if (addr == NULL) {
- if (rte_errno == EADDRNOTAVAIL)
- RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - "
- "please use '--" OPT_BASE_VIRTADDR "' option\n",
- (unsigned long long)mem_sz, msl->base_va);
- else
- RTE_LOG(ERR, EAL, "Cannot reserve memory\n");
- return -1;
- }
- msl->base_va = addr;
- msl->len = mem_sz;
-
- return 0;
-}
-
-/*
- * Our VA space is not preallocated yet, so preallocate it here. We need to know
- * how many segments there are in order to map all pages into one address space,
- * and leave appropriate holes between segments so that rte_malloc does not
- * concatenate them into one big segment.
- *
- * we also need to unmap original pages to free up address space.
- */
-static int __rte_unused
-prealloc_segments(struct hugepage_file *hugepages, int n_pages)
-{
- struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- int cur_page, seg_start_page, end_seg, new_memseg;
- unsigned int hpi_idx, socket, i;
- int n_contig_segs, n_segs;
- int msl_idx;
-
- /* before we preallocate segments, we need to free up our VA space.
- * we're not removing files, and we already have information about
- * PA-contiguousness, so it is safe to unmap everything.
- */
- for (cur_page = 0; cur_page < n_pages; cur_page++) {
- struct hugepage_file *hpi = &hugepages[cur_page];
- munmap(hpi->orig_va, hpi->size);
- hpi->orig_va = NULL;
- }
-
-	/* we do not know in advance how many page sizes and sockets were
-	 * discovered, so loop over all of them
- */
- for (hpi_idx = 0; hpi_idx < internal_config.num_hugepage_sizes;
- hpi_idx++) {
- uint64_t page_sz =
- internal_config.hugepage_info[hpi_idx].hugepage_sz;
-
- for (i = 0; i < rte_socket_count(); i++) {
- struct rte_memseg_list *msl;
-
- socket = rte_socket_id_by_idx(i);
- n_contig_segs = 0;
- n_segs = 0;
- seg_start_page = -1;
-
- for (cur_page = 0; cur_page < n_pages; cur_page++) {
- struct hugepage_file *prev, *cur;
- int prev_seg_start_page = -1;
-
- cur = &hugepages[cur_page];
- prev = cur_page == 0 ? NULL :
- &hugepages[cur_page - 1];
-
- new_memseg = 0;
- end_seg = 0;
-
- if (cur->size == 0)
- end_seg = 1;
- else if (cur->socket_id != (int) socket)
- end_seg = 1;
- else if (cur->size != page_sz)
- end_seg = 1;
- else if (cur_page == 0)
- new_memseg = 1;
-#ifdef RTE_ARCH_PPC_64
-				/* On the PPC64 architecture, mmap always starts
-				 * at higher addresses and works downwards. Here,
- * physical addresses are in descending order.
- */
- else if ((prev->physaddr - cur->physaddr) !=
- cur->size)
- new_memseg = 1;
-#else
- else if ((cur->physaddr - prev->physaddr) !=
- cur->size)
- new_memseg = 1;
-#endif
- if (new_memseg) {
- /* if we're already inside a segment,
- * new segment means end of current one
- */
- if (seg_start_page != -1) {
- end_seg = 1;
- prev_seg_start_page =
- seg_start_page;
- }
- seg_start_page = cur_page;
- }
-
- if (end_seg) {
- if (prev_seg_start_page != -1) {
- /* we've found a new segment */
- n_contig_segs++;
- n_segs += cur_page -
- prev_seg_start_page;
- } else if (seg_start_page != -1) {
- /* we didn't find new segment,
- * but did end current one
- */
- n_contig_segs++;
- n_segs += cur_page -
- seg_start_page;
- seg_start_page = -1;
- continue;
- } else {
- /* we're skipping this page */
- continue;
- }
- }
- /* segment continues */
- }
- /* check if we missed last segment */
- if (seg_start_page != -1) {
- n_contig_segs++;
- n_segs += cur_page - seg_start_page;
- }
-
- /* if no segments were found, do not preallocate */
- if (n_segs == 0)
- continue;
-
- /* we now have total number of pages that we will
- * allocate for this segment list. add separator pages
- * to the total count, and preallocate VA space.
- */
- n_segs += n_contig_segs - 1;
-
- /* now, preallocate VA space for these segments */
-
- /* first, find suitable memseg list for this */
- for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS;
- msl_idx++) {
- msl = &mcfg->memsegs[msl_idx];
-
- if (msl->base_va != NULL)
- continue;
- break;
- }
- if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
- RTE_LOG(ERR, EAL, "Not enough space in memseg lists, please increase %s\n",
- RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
- return -1;
- }
-
- /* now, allocate fbarray itself */
- if (alloc_memseg_list(msl, page_sz, n_segs, socket,
- msl_idx) < 0)
- return -1;
-
- /* finally, allocate VA space */
- if (alloc_va_space(msl) < 0)
- return -1;
- }
- }
- return 0;
-}
-
-/*
- * We cannot reallocate memseg lists on the fly because PPC64 stores pages
- * backwards, therefore we have to process the entire memseg first before
- * remapping it into memseg list VA space.
- */
-static int
-remap_needed_hugepages(struct hugepage_file *hugepages, int n_pages)
-{
- int cur_page, seg_start_page, new_memseg, ret;
-
- seg_start_page = 0;
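-	/* at this point hugepages[] is sorted by physical address (see
-	 * cmp_physaddr()), so a segment ends wherever the socket, page size
-	 * or physical contiguity changes.
-	 */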
- for (cur_page = 0; cur_page < n_pages; cur_page++) {
- struct hugepage_file *prev, *cur;
-
- new_memseg = 0;
-
- cur = &hugepages[cur_page];
- prev = cur_page == 0 ? NULL : &hugepages[cur_page - 1];
-
- /* if size is zero, no more pages left */
- if (cur->size == 0)
- break;
-
- if (cur_page == 0)
- new_memseg = 1;
- else if (cur->socket_id != prev->socket_id)
- new_memseg = 1;
- else if (cur->size != prev->size)
- new_memseg = 1;
-#ifdef RTE_ARCH_PPC_64
-		/* On the PPC64 architecture, mmap always starts at higher
-		 * addresses and works downwards. Here, physical addresses are in
- * descending order.
- */
- else if ((prev->physaddr - cur->physaddr) != cur->size)
- new_memseg = 1;
-#else
- else if ((cur->physaddr - prev->physaddr) != cur->size)
- new_memseg = 1;
-#endif
-
- if (new_memseg) {
- /* if this isn't the first time, remap segment */
- if (cur_page != 0) {
- ret = remap_segment(hugepages, seg_start_page,
- cur_page);
- if (ret != 0)
- return -1;
- }
- /* remember where we started */
- seg_start_page = cur_page;
- }
- /* continuation of previous memseg */
- }
- /* we were stopped, but we didn't remap the last segment, do it now */
- if (cur_page != 0) {
- ret = remap_segment(hugepages, seg_start_page,
- cur_page);
- if (ret != 0)
- return -1;
- }
- return 0;
-}
-
-__rte_unused /* function is unused on 32-bit builds */
-static inline uint64_t
-get_socket_mem_size(int socket)
-{
- uint64_t size = 0;
- unsigned i;
-
-	for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
- struct hugepage_info *hpi = &internal_config.hugepage_info[i];
- size += hpi->hugepage_sz * hpi->num_pages[socket];
- }
-
- return size;
-}
-
-/*
- * This function is a NUMA-aware equivalent of calc_num_pages.
- * It takes in the list of hugepage sizes and the
- * number of pages thereof, and calculates the best number of
- * pages of each size to fulfill the request for <memory> ram
- */
-static int
-calc_num_pages_per_socket(uint64_t * memory,
- struct hugepage_info *hp_info,
- struct hugepage_info *hp_used,
- unsigned num_hp_info)
-{
- unsigned socket, j, i = 0;
- unsigned requested, available;
- int total_num_pages = 0;
- uint64_t remaining_mem, cur_mem;
- uint64_t total_mem = internal_config.memory;
-
- if (num_hp_info == 0)
- return -1;
-
- /* if specific memory amounts per socket weren't requested */
- if (internal_config.force_sockets == 0) {
- size_t total_size;
-#ifdef RTE_ARCH_64
- int cpu_per_socket[RTE_MAX_NUMA_NODES];
- size_t default_size;
- unsigned lcore_id;
-
- /* Compute number of cores per socket */
- memset(cpu_per_socket, 0, sizeof(cpu_per_socket));
- RTE_LCORE_FOREACH(lcore_id) {
- cpu_per_socket[rte_lcore_to_socket_id(lcore_id)]++;
- }
-
- /*
- * Automatically spread requested memory amongst detected sockets according
- * to number of cores from cpu mask present on each socket
- */
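-		/* e.g. -m 4096 with 6 cores on socket 0 and 2 cores on
-		 * socket 1 initially assigns 3072 MB and 1024 MB
-		 * respectively, capped by per-socket availability below.
-		 */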
- total_size = internal_config.memory;
- for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) {
-
- /* Set memory amount per socket */
- default_size = (internal_config.memory * cpu_per_socket[socket])
- / rte_lcore_count();
-
- /* Limit to maximum available memory on socket */
- default_size = RTE_MIN(default_size, get_socket_mem_size(socket));
-
- /* Update sizes */
- memory[socket] = default_size;
- total_size -= default_size;
- }
-
- /*
- * If some memory is remaining, try to allocate it by getting all
- * available memory from sockets, one after the other
- */
- for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) {
- /* take whatever is available */
- default_size = RTE_MIN(get_socket_mem_size(socket) - memory[socket],
- total_size);
-
- /* Update sizes */
- memory[socket] += default_size;
- total_size -= default_size;
- }
-#else
- /* in 32-bit mode, allocate all of the memory only on master
- * lcore socket
- */
- total_size = internal_config.memory;
- for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0;
- socket++) {
- struct rte_config *cfg = rte_eal_get_configuration();
- unsigned int master_lcore_socket;
-
- master_lcore_socket =
- rte_lcore_to_socket_id(cfg->master_lcore);
-
- if (master_lcore_socket != socket)
- continue;
-
- /* Update sizes */
- memory[socket] = total_size;
- break;
- }
-#endif
- }
-
- for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) {
- /* skips if the memory on specific socket wasn't requested */
		for (i = 0; i < num_hp_info && memory[socket] != 0; i++) {
- strlcpy(hp_used[i].hugedir, hp_info[i].hugedir,
- sizeof(hp_used[i].hugedir));
- hp_used[i].num_pages[socket] = RTE_MIN(
- memory[socket] / hp_info[i].hugepage_sz,
- hp_info[i].num_pages[socket]);
-
- cur_mem = hp_used[i].num_pages[socket] *
- hp_used[i].hugepage_sz;
-
- memory[socket] -= cur_mem;
- total_mem -= cur_mem;
-
- total_num_pages += hp_used[i].num_pages[socket];
-
- /* check if we have met all memory requests */
- if (memory[socket] == 0)
- break;
-
-			/* if we've used up all the pages at this size, move
-			 * on to the next size */
- if (hp_used[i].num_pages[socket] == hp_info[i].num_pages[socket])
- continue;
-			/* At this point we know that pages of this size remain, but each
-			 * one is bigger than the memory still requested, so let's see if we
-			 * can get enough from other page sizes.
- */
- remaining_mem = 0;
- for (j = i+1; j < num_hp_info; j++)
- remaining_mem += hp_info[j].hugepage_sz *
- hp_info[j].num_pages[socket];
-
-			/* is there enough memory in other sizes? if not, take one more
-			 * page of this size and stop */
-			if (remaining_mem < memory[socket]) {
- cur_mem = RTE_MIN(memory[socket],
- hp_info[i].hugepage_sz);
- memory[socket] -= cur_mem;
- total_mem -= cur_mem;
- hp_used[i].num_pages[socket]++;
- total_num_pages++;
-				break; /* we are done with this socket */
- }
- }
- /* if we didn't satisfy all memory requirements per socket */
- if (memory[socket] > 0 &&
- internal_config.socket_mem[socket] != 0) {
- /* to prevent icc errors */
- requested = (unsigned) (internal_config.socket_mem[socket] /
- 0x100000);
- available = requested -
- ((unsigned) (memory[socket] / 0x100000));
- RTE_LOG(ERR, EAL, "Not enough memory available on socket %u! "
- "Requested: %uMB, available: %uMB\n", socket,
- requested, available);
- return -1;
- }
- }
-
- /* if we didn't satisfy total memory requirements */
- if (total_mem > 0) {
- requested = (unsigned) (internal_config.memory / 0x100000);
- available = requested - (unsigned) (total_mem / 0x100000);
- RTE_LOG(ERR, EAL, "Not enough memory available! Requested: %uMB,"
- " available: %uMB\n", requested, available);
- return -1;
- }
- return total_num_pages;
-}
-
-static inline size_t
-eal_get_hugepage_mem_size(void)
-{
- uint64_t size = 0;
- unsigned i, j;
-
- for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
- struct hugepage_info *hpi = &internal_config.hugepage_info[i];
- if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0) {
- for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
- size += hpi->hugepage_sz * hpi->num_pages[j];
- }
- }
- }
-
- return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX;
-}
-
-static struct sigaction huge_action_old;
-static int huge_need_recover;
-
-static void
-huge_register_sigbus(void)
-{
- sigset_t mask;
- struct sigaction action;
-
- sigemptyset(&mask);
- sigaddset(&mask, SIGBUS);
- action.sa_flags = 0;
- action.sa_mask = mask;
- action.sa_handler = huge_sigbus_handler;
-
- huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
-}
-
-static void
-huge_recover_sigbus(void)
-{
- if (huge_need_recover) {
- sigaction(SIGBUS, &huge_action_old, NULL);
- huge_need_recover = 0;
- }
-}
-
-/*
- * Prepare the physical memory mapping: fill the configuration structure
- * with this information and return 0 on success. The steps are:
- * 1. map N huge pages in separate files in hugetlbfs
- * 2. find associated physical addr
- * 3. find associated NUMA socket ID
- * 4. sort all huge pages by physical address
- * 5. remap these N huge pages in the correct order
- * 6. unmap the first mapping
- * 7. fill memsegs in configuration with contiguous zones
- */
-static int
-eal_legacy_hugepage_init(void)
-{
- struct rte_mem_config *mcfg;
- struct hugepage_file *hugepage = NULL, *tmp_hp = NULL;
- struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
- struct rte_fbarray *arr;
- struct rte_memseg *ms;
-
- uint64_t memory[RTE_MAX_NUMA_NODES];
-
- unsigned hp_offset;
- int i, j;
- int nr_hugefiles, nr_hugepages = 0;
- void *addr;
-
- memset(used_hp, 0, sizeof(used_hp));
-
- /* get pointer to global configuration */
- mcfg = rte_eal_get_configuration()->mem_config;
-
- /* hugetlbfs can be disabled */
- if (internal_config.no_hugetlbfs) {
- void *prealloc_addr;
- size_t mem_sz;
- struct rte_memseg_list *msl;
- int n_segs, cur_seg, fd, flags;
-#ifdef MEMFD_SUPPORTED
- int memfd;
-#endif
- uint64_t page_sz;
-
- /* nohuge mode is legacy mode */
- internal_config.legacy_mem = 1;
-
- /* nohuge mode is single-file segments mode */
- internal_config.single_file_segments = 1;
-
- /* create a memseg list */
- msl = &mcfg->memsegs[0];
-
- page_sz = RTE_PGSIZE_4K;
- n_segs = internal_config.memory / page_sz;
-
- if (rte_fbarray_init(&msl->memseg_arr, "nohugemem", n_segs,
- sizeof(struct rte_memseg))) {
- RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n");
- return -1;
- }
-
- /* set up parameters for anonymous mmap */
- fd = -1;
- flags = MAP_PRIVATE | MAP_ANONYMOUS;
-
-#ifdef MEMFD_SUPPORTED
- /* create a memfd and store it in the segment fd table */
- memfd = memfd_create("nohuge", 0);
- if (memfd < 0) {
- RTE_LOG(DEBUG, EAL, "Cannot create memfd: %s\n",
- strerror(errno));
- RTE_LOG(DEBUG, EAL, "Falling back to anonymous map\n");
- } else {
- /* we got an fd - now resize it */
- if (ftruncate(memfd, internal_config.memory) < 0) {
- RTE_LOG(ERR, EAL, "Cannot resize memfd: %s\n",
- strerror(errno));
- RTE_LOG(ERR, EAL, "Falling back to anonymous map\n");
- close(memfd);
- } else {
- /* creating memfd-backed file was successful.
- * we want changes to memfd to be visible to
- * other processes (such as vhost backend), so
- * map it as shared memory.
- */
- RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n");
- fd = memfd;
- flags = MAP_SHARED;
- }
- }
-#endif
- /* preallocate address space for the memory, so that it can be
- * fit into the DMA mask.
- */
- mem_sz = internal_config.memory;
- prealloc_addr = eal_get_virtual_area(
- NULL, &mem_sz, page_sz, 0, 0);
- if (prealloc_addr == NULL) {
- RTE_LOG(ERR, EAL,
- "%s: reserving memory area failed: "
- "%s\n",
- __func__, strerror(errno));
- return -1;
- }
- addr = mmap(prealloc_addr, mem_sz, PROT_READ | PROT_WRITE,
- flags | MAP_FIXED, fd, 0);
- if (addr == MAP_FAILED || addr != prealloc_addr) {
- RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__,
- strerror(errno));
- munmap(prealloc_addr, mem_sz);
- return -1;
- }
- msl->base_va = addr;
- msl->page_sz = page_sz;
- msl->socket_id = 0;
- msl->len = mem_sz;
- msl->heap = 1;
-
- /* we're in single-file segments mode, so only the segment list
- * fd needs to be set up.
- */
- if (fd != -1) {
- if (eal_memalloc_set_seg_list_fd(0, fd) < 0) {
- RTE_LOG(ERR, EAL, "Cannot set up segment list fd\n");
- /* not a serious error, proceed */
- }
- }
-
- /* populate memsegs. each memseg is one page long */
- for (cur_seg = 0; cur_seg < n_segs; cur_seg++) {
- arr = &msl->memseg_arr;
-
- ms = rte_fbarray_get(arr, cur_seg);
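- /* in IOVA as VA mode, the page's IOVA is simply its VA */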
- if (rte_eal_iova_mode() == RTE_IOVA_VA)
- ms->iova = (uintptr_t)addr;
- else
- ms->iova = RTE_BAD_IOVA;
- ms->addr = addr;
- ms->hugepage_sz = page_sz;
- ms->socket_id = 0;
- ms->len = page_sz;
-
- rte_fbarray_set_used(arr, cur_seg);
-
- addr = RTE_PTR_ADD(addr, (size_t)page_sz);
- }
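- /* now that IOVAs are assigned, check that they fit within the
- * configured DMA mask, if any
- */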
- if (mcfg->dma_maskbits &&
- rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) {
- RTE_LOG(ERR, EAL,
- "%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask.\n",
- __func__);
- if (rte_eal_iova_mode() == RTE_IOVA_VA &&
- rte_eal_using_phys_addrs())
- RTE_LOG(ERR, EAL,
- "%s(): Please try initializing EAL with --iova-mode=pa parameter.\n",
- __func__);
- goto fail;
- }
- return 0;
- }
-
- /* calculate total number of hugepages available. at this point we haven't
- * yet started sorting them so they all are on socket 0 */
- for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
- /* meanwhile, also initialize hugepage sizes in used_hp */
- used_hp[i].hugepage_sz = internal_config.hugepage_info[i].hugepage_sz;
-
- nr_hugepages += internal_config.hugepage_info[i].num_pages[0];
- }
-
- /*
- * allocate a memory area for hugepage table.
- * this isn't shared memory yet. because we need to do some
- * processing on these pages, shared memory will be created
- * at a later stage.
- */
- tmp_hp = malloc(nr_hugepages * sizeof(struct hugepage_file));
- if (tmp_hp == NULL)
- goto fail;
-
- memset(tmp_hp, 0, nr_hugepages * sizeof(struct hugepage_file));
-
- hp_offset = 0; /* where we start the current page size entries */
-
- huge_register_sigbus();
-
- /* make a copy of socket_mem, needed for balanced allocation. */
- for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
- memory[i] = internal_config.socket_mem[i];
-
- /* map all hugepages and sort them */
- for (i = 0; i < (int)internal_config.num_hugepage_sizes; i++) {
- unsigned pages_old, pages_new;
- struct hugepage_info *hpi;
-
- /*
- * we don't yet mark hugepages as used at this stage, so
- * we just map all hugepages available to the system;
- * all hugepages are still located on socket 0
- */
- hpi = &internal_config.hugepage_info[i];
-
- if (hpi->num_pages[0] == 0)
- continue;
-
- /* map all hugepages available */
- pages_old = hpi->num_pages[0];
- pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, memory);
- if (pages_new < pages_old) {
- RTE_LOG(DEBUG, EAL,
- "%d not %d hugepages of size %u MB allocated\n",
- pages_new, pages_old,
- (unsigned)(hpi->hugepage_sz / 0x100000));
-
- int pages = pages_old - pages_new;
-
- nr_hugepages -= pages;
- hpi->num_pages[0] = pages_new;
- if (pages_new == 0)
- continue;
- }
-
- if (rte_eal_using_phys_addrs() &&
- rte_eal_iova_mode() != RTE_IOVA_VA) {
- /* find physical addresses for each hugepage */
- if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
- RTE_LOG(DEBUG, EAL, "Failed to find phys addr "
- "for %u MB pages\n",
- (unsigned int)(hpi->hugepage_sz / 0x100000));
- goto fail;
- }
- } else {
- /* set physical addresses for each hugepage */
- if (set_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
- RTE_LOG(DEBUG, EAL, "Failed to set phys addr "
- "for %u MB pages\n",
- (unsigned int)(hpi->hugepage_sz / 0x100000));
- goto fail;
- }
- }
-
- if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0) {
- RTE_LOG(DEBUG, EAL, "Failed to find NUMA socket for %u MB pages\n",
- (unsigned)(hpi->hugepage_sz / 0x100000));
- goto fail;
- }
-
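- /* sort pages of this size by physical address (step 4 above) */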
- qsort(&tmp_hp[hp_offset], hpi->num_pages[0],
- sizeof(struct hugepage_file), cmp_physaddr);
-
- /* we have processed all hugepages of this size, so advance the offset */
- hp_offset += hpi->num_pages[0];
- }
-
- huge_recover_sigbus();
-
- if (internal_config.memory == 0 && internal_config.force_sockets == 0)
- internal_config.memory = eal_get_hugepage_mem_size();
-
- nr_hugefiles = nr_hugepages;
-
- /* clean out the numbers of pages */
- for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++)
- for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
- internal_config.hugepage_info[i].num_pages[j] = 0;
-
- /* get hugepages for each socket */
- for (i = 0; i < nr_hugefiles; i++) {
- int socket = tmp_hp[i].socket_id;
-
- /* find a hugepage info with right size and increment num_pages */
- const int nb_hpsizes = RTE_MIN(MAX_HUGEPAGE_SIZES,
- (int)internal_config.num_hugepage_sizes);
- for (j = 0; j < nb_hpsizes; j++) {
- if (tmp_hp[i].size ==
- internal_config.hugepage_info[j].hugepage_sz) {
- internal_config.hugepage_info[j].num_pages[socket]++;
- }
- }
- }
-
- /* make a copy of socket_mem, needed for number of pages calculation */
- for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
- memory[i] = internal_config.socket_mem[i];
-
- /* calculate final number of pages */
- nr_hugepages = calc_num_pages_per_socket(memory,
- internal_config.hugepage_info, used_hp,
- internal_config.num_hugepage_sizes);
-
- /* error if not enough memory available */
- if (nr_hugepages < 0)
- goto fail;
-
- /* reporting in! */
- for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
- for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
- if (used_hp[i].num_pages[j] > 0) {
- RTE_LOG(DEBUG, EAL,
- "Requesting %u pages of size %uMB"
- " from socket %i\n",
- used_hp[i].num_pages[j],
- (unsigned)
- (used_hp[i].hugepage_sz / 0x100000),
- j);
- }
- }
- }
-
- /* create shared memory */
- hugepage = create_shared_memory(eal_hugepage_data_path(),
- nr_hugefiles * sizeof(struct hugepage_file));
-
- if (hugepage == NULL) {
- RTE_LOG(ERR, EAL, "Failed to create shared memory!\n");
- goto fail;
- }
- memset(hugepage, 0, nr_hugefiles * sizeof(struct hugepage_file));
-
- /*
- * unmap pages that we won't need (looks at used_hp).
- * also, sets final_va to NULL on pages that were unmapped.
- */
- if (unmap_unneeded_hugepages(tmp_hp, used_hp,
- internal_config.num_hugepage_sizes) < 0) {
- RTE_LOG(ERR, EAL, "Unmapping and locking hugepages failed!\n");
- goto fail;
- }
-
- /*
- * copy the hugepage data from the malloc'd array to the actual
- * shared memory. this procedure only copies those hugepages that
- * have a non-NULL orig_va. has overflow protection.
- */
- if (copy_hugepages_to_shared_mem(hugepage, nr_hugefiles,
- tmp_hp, nr_hugefiles) < 0) {
- RTE_LOG(ERR, EAL, "Copying tables to shared memory failed!\n");
- goto fail;
- }
-
-#ifndef RTE_ARCH_64
- /* for legacy 32-bit mode, we did not preallocate VA space, so do it */
- if (internal_config.legacy_mem &&
- prealloc_segments(hugepage, nr_hugefiles)) {
- RTE_LOG(ERR, EAL, "Could not preallocate VA space for hugepages\n");
- goto fail;
- }
-#endif
-
- /* remap all pages we do need into memseg list VA space, so that those
- * pages become first-class citizens in DPDK memory subsystem
- */
- if (remap_needed_hugepages(hugepage, nr_hugefiles)) {
- RTE_LOG(ERR, EAL, "Couldn't remap hugepage files into memseg lists\n");
- goto fail;
- }
-
- /* free the hugepage backing files */
- if (internal_config.hugepage_unlink &&
- unlink_hugepage_files(tmp_hp, internal_config.num_hugepage_sizes) < 0) {
- RTE_LOG(ERR, EAL, "Unlinking hugepage files failed!\n");
- goto fail;
- }
-
- /* free the temporary hugepage table */
- free(tmp_hp);
- tmp_hp = NULL;
-
- munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));
- hugepage = NULL;
-
- /* we're not going to allocate more pages, so release VA space for
- * unused memseg lists
- */
- for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
- struct rte_memseg_list *msl = &mcfg->memsegs[i];
- size_t mem_sz;
-
- /* skip inactive lists */
- if (msl->base_va == NULL)
- continue;
- /* skip lists where there is at least one page allocated */
- if (msl->memseg_arr.count > 0)
- continue;
- /* this is an unused list, deallocate it */
- mem_sz = msl->len;
- munmap(msl->base_va, mem_sz);
- msl->base_va = NULL;
- msl->heap = 0;
-
- /* destroy backing fbarray */
- rte_fbarray_destroy(&msl->memseg_arr);
- }
-
- if (mcfg->dma_maskbits &&
- rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) {
- RTE_LOG(ERR, EAL,
- "%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask.\n",
- __func__);
- goto fail;
- }
-
- return 0;
-
-fail:
- huge_recover_sigbus();
- free(tmp_hp);
- if (hugepage != NULL)
- munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));
-
- return -1;
-}
-
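-/* memseg list walk callback: count pages of the matching size,
- * accumulating per-socket totals into the hugepage_info argument
- */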
-static int __rte_unused
-hugepage_count_walk(const struct rte_memseg_list *msl, void *arg)
-{
- struct hugepage_info *hpi = arg;
-
- if (msl->page_sz != hpi->hugepage_sz)
- return 0;
-
- hpi->num_pages[msl->socket_id] += msl->memseg_arr.len;
- return 0;
-}
-
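-/* memory allocation validator callback: always reject the allocation,
- * so that a socket limit registered with it cannot be exceeded
- */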
-static int
-limits_callback(int socket_id, size_t cur_limit, size_t new_len)
-{
- RTE_SET_USED(socket_id);
- RTE_SET_USED(cur_limit);
- RTE_SET_USED(new_len);
- return -1;
-}
-
-static int
-eal_hugepage_init(void)
-{
- struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
- uint64_t memory[RTE_MAX_NUMA_NODES];
- int hp_sz_idx, socket_id;
-
- memset(used_hp, 0, sizeof(used_hp));
-
- for (hp_sz_idx = 0;
- hp_sz_idx < (int) internal_config.num_hugepage_sizes;
- hp_sz_idx++) {
-#ifndef RTE_ARCH_64
- struct hugepage_info dummy;
- unsigned int i;
-#endif
- /* also initialize used_hp hugepage sizes in used_hp */
- struct hugepage_info *hpi;
- hpi = &internal_config.hugepage_info[hp_sz_idx];
- used_hp[hp_sz_idx].hugepage_sz = hpi->hugepage_sz;
-
-#ifndef RTE_ARCH_64
- /* for 32-bit, limit number of pages on socket to whatever we've
- * preallocated, as we cannot allocate more.
- */
- memset(&dummy, 0, sizeof(dummy));
- dummy.hugepage_sz = hpi->hugepage_sz;
- if (rte_memseg_list_walk(hugepage_count_walk, &dummy) < 0)
- return -1;
-
- for (i = 0; i < RTE_DIM(dummy.num_pages); i++) {
- hpi->num_pages[i] = RTE_MIN(hpi->num_pages[i],
- dummy.num_pages[i]);
- }
-#endif
- }
-
- /* make a copy of socket_mem, needed for balanced allocation. */
- for (hp_sz_idx = 0; hp_sz_idx < RTE_MAX_NUMA_NODES; hp_sz_idx++)
- memory[hp_sz_idx] = internal_config.socket_mem[hp_sz_idx];
-
- /* calculate final number of pages */
- if (calc_num_pages_per_socket(memory,
- internal_config.hugepage_info, used_hp,
- internal_config.num_hugepage_sizes) < 0)
- return -1;
-
- for (hp_sz_idx = 0;
- hp_sz_idx < (int)internal_config.num_hugepage_sizes;
- hp_sz_idx++) {
- for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES;
- socket_id++) {
- struct rte_memseg **pages;
- struct hugepage_info *hpi = &used_hp[hp_sz_idx];
- unsigned int num_pages = hpi->num_pages[socket_id];
- unsigned int num_pages_alloc;
-
- if (num_pages == 0)
- continue;
-
- RTE_LOG(DEBUG, EAL, "Allocating %u pages of size %" PRIu64 "M on socket %i\n",
- num_pages, hpi->hugepage_sz >> 20, socket_id);
-
- /* we may not be able to allocate all pages in one go,
- * because we break up our memory map into multiple
- * memseg lists. therefore, try allocating multiple
- * times and see if we can get the desired number of
- * pages from multiple allocations.
- */
-
- num_pages_alloc = 0;
- do {
- int i, cur_pages, needed;
-
- needed = num_pages - num_pages_alloc;
-
- pages = malloc(sizeof(*pages) * needed);
- if (pages == NULL)
- return -1;
-
- /* do not request exact number of pages */
- cur_pages = eal_memalloc_alloc_seg_bulk(pages,
- needed, hpi->hugepage_sz,
- socket_id, false);
- if (cur_pages <= 0) {
- free(pages);
- return -1;
- }
-
- /* mark preallocated pages as unfreeable */
- for (i = 0; i < cur_pages; i++) {
- struct rte_memseg *ms = pages[i];
- ms->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
- }
- free(pages);
-
- num_pages_alloc += cur_pages;
- } while (num_pages_alloc != num_pages);
- }
- }
- /* if socket limits were specified, set them */
- if (internal_config.force_socket_limits) {
- unsigned int i;
- for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
- uint64_t limit = internal_config.socket_limit[i];
- if (limit == 0)
- continue;
- if (rte_mem_alloc_validator_register("socket-limit",
- limits_callback, i, limit))
- RTE_LOG(ERR, EAL, "Failed to register socket limits validator callback\n");
- }
- }
- return 0;
-}
-
-/*
- * uses fstat to report the size of a file on disk
- */
-static off_t
-getFileSize(int fd)
-{
- struct stat st;
- if (fstat(fd, &st) < 0)
- return 0;
- return st.st_size;
-}
-
-/*
- * This creates the memory mappings in the secondary process to match that of
- * the server process. It goes through each memory segment in the DPDK runtime
- * configuration and finds the hugepages which form that segment, mapping them
- * in order to form a contiguous block in the virtual memory space
- */
-static int
-eal_legacy_hugepage_attach(void)
-{
- struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- struct hugepage_file *hp = NULL;
- unsigned int num_hp = 0;
- unsigned int i = 0;
- unsigned int cur_seg;
- off_t size = 0;
- int fd, fd_hugepage = -1;
-
- if (aslr_enabled() > 0) {
- RTE_LOG(WARNING, EAL, "WARNING: Address Space Layout Randomization "
- "(ASLR) is enabled in the kernel.\n");
- RTE_LOG(WARNING, EAL, " This may cause issues with mapping memory "
- "into secondary processes\n");
- }
-
- fd_hugepage = open(eal_hugepage_data_path(), O_RDONLY);
- if (fd_hugepage < 0) {
- RTE_LOG(ERR, EAL, "Could not open %s\n",
- eal_hugepage_data_path());
- goto error;
- }
-
- size = getFileSize(fd_hugepage);
- hp = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd_hugepage, 0);
- if (hp == MAP_FAILED) {
- RTE_LOG(ERR, EAL, "Could not mmap %s\n",
- eal_hugepage_data_path());
- goto error;
- }
-
- num_hp = size / sizeof(struct hugepage_file);
- RTE_LOG(DEBUG, EAL, "Analysing %u files\n", num_hp);
-
- /* map all segments into memory to make sure we get the addrs. the
- * segments themselves are already in memseg list (which is shared and
- * has its VA space already preallocated), so we just need to map
- * everything into correct addresses.
- */
- for (i = 0; i < num_hp; i++) {
- struct hugepage_file *hf = &hp[i];
- size_t map_sz = hf->size;
- void *map_addr = hf->final_va;
- int msl_idx, ms_idx;
- struct rte_memseg_list *msl;
- struct rte_memseg *ms;
-
- /* if size is zero, no more pages left */
- if (map_sz == 0)
- break;
-
- fd = open(hf->filepath, O_RDWR);
- if (fd < 0) {
- RTE_LOG(ERR, EAL, "Could not open %s: %s\n",
- hf->filepath, strerror(errno));
- goto error;
- }
-
- map_addr = mmap(map_addr, map_sz, PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_FIXED, fd, 0);
- if (map_addr == MAP_FAILED) {
- RTE_LOG(ERR, EAL, "Could not map %s: %s\n",
- hf->filepath, strerror(errno));
- goto fd_error;
- }
-
- /* set shared lock on the file. */
- if (flock(fd, LOCK_SH) < 0) {
- RTE_LOG(DEBUG, EAL, "%s(): Locking file failed: %s\n",
- __func__, strerror(errno));
- goto mmap_error;
- }
-
- /* find segment data */
- msl = rte_mem_virt2memseg_list(map_addr);
- if (msl == NULL) {
- RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg list\n",
- __func__);
- goto mmap_error;
- }
- ms = rte_mem_virt2memseg(map_addr, msl);
- if (ms == NULL) {
- RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg\n",
- __func__);
- goto mmap_error;
- }
-
- msl_idx = msl - mcfg->memsegs;
- ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
- if (ms_idx < 0) {
- RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg idx\n",
- __func__);
- goto mmap_error;
- }
-
- /* store segment fd internally */
- if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0)
- RTE_LOG(ERR, EAL, "Could not store segment fd: %s\n",
- rte_strerror(rte_errno));
- }
- /* unmap the hugepage config file, since we are done using it */
- munmap(hp, size);
- close(fd_hugepage);
- return 0;
-
-mmap_error:
- munmap(hp[i].final_va, hp[i].size);
-fd_error:
- close(fd);
-error:
- /* unwind mmap's done so far */
- for (cur_seg = 0; cur_seg < i; cur_seg++)
- munmap(hp[cur_seg].final_va, hp[cur_seg].size);
-
- if (hp != NULL && hp != MAP_FAILED)
- munmap(hp, size);
- if (fd_hugepage >= 0)
- close(fd_hugepage);
- return -1;
-}
-
-static int
-eal_hugepage_attach(void)
-{
- if (eal_memalloc_sync_with_primary()) {
- RTE_LOG(ERR, EAL, "Could not map memory from primary process\n");
- if (aslr_enabled() > 0)
- RTE_LOG(ERR, EAL, "It is recommended to disable ASLR in the kernel and retry running both primary and secondary processes\n");
- return -1;
- }
- return 0;
-}
-
-int
-rte_eal_hugepage_init(void)
-{
- return internal_config.legacy_mem ?
- eal_legacy_hugepage_init() :
- eal_hugepage_init();
-}
-
-int
-rte_eal_hugepage_attach(void)
-{
- return internal_config.legacy_mem ?
- eal_legacy_hugepage_attach() :
- eal_hugepage_attach();
-}
-
-int
-rte_eal_using_phys_addrs(void)
-{
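- /* detect this once: physical addresses are usable only if hugepages
- * are enabled and rte_mem_virt2phy() yields a valid address
- */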
- if (phys_addrs_available == -1) {
- uint64_t tmp = 0;
-
- if (rte_eal_has_hugepages() != 0 &&
- rte_mem_virt2phy(&tmp) != RTE_BAD_PHYS_ADDR)
- phys_addrs_available = 1;
- else
- phys_addrs_available = 0;
- }
- return phys_addrs_available;
-}
-
-static int __rte_unused
-memseg_primary_init_32(void)
-{
- struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- int active_sockets, hpi_idx, msl_idx = 0;
- unsigned int socket_id, i;
- struct rte_memseg_list *msl;
- uint64_t extra_mem_per_socket, total_extra_mem, total_requested_mem;
- uint64_t max_mem;
-
- /* no-huge does not need this at all */
- if (internal_config.no_hugetlbfs)
- return 0;
-
- /* this is a giant hack, but desperate times call for desperate
- * measures. in legacy 32-bit mode, we cannot preallocate VA space,
- * because having upwards of 2 gigabytes of VA space already mapped will
- * interfere with our ability to map and sort hugepages.
- *
- * therefore, in legacy 32-bit mode, we will be initializing memseg
- * lists much later - in eal_memory.c, right after we unmap all the
- * unneeded pages. this will not affect secondary processes, as those
- * should be able to mmap the space without (too many) problems.
- */
- if (internal_config.legacy_mem)
- return 0;
-
- /* 32-bit mode is a very special case. we cannot know in advance where
- * the user will want to allocate their memory, so we have to do some
- * heuristics.
- */
- active_sockets = 0;
- total_requested_mem = 0;
- if (internal_config.force_sockets)
- for (i = 0; i < rte_socket_count(); i++) {
- uint64_t mem;
-
- socket_id = rte_socket_id_by_idx(i);
- mem = internal_config.socket_mem[socket_id];
-
- if (mem == 0)
- continue;
-
- active_sockets++;
- total_requested_mem += mem;
- }
- else
- total_requested_mem = internal_config.memory;
-
- max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
- if (total_requested_mem > max_mem) {
- RTE_LOG(ERR, EAL, "Invalid parameters: 32-bit process can at most use %uM of memory\n",
- (unsigned int)(max_mem >> 20));
- return -1;
- }
- total_extra_mem = max_mem - total_requested_mem;
- extra_mem_per_socket = active_sockets == 0 ? total_extra_mem :
- total_extra_mem / active_sockets;
-
- /* the allocation logic is a little bit convoluted, but here's how it
- * works, in a nutshell:
- * - if user hasn't specified on which sockets to allocate memory via
- * --socket-mem, we allocate all of our memory on master core socket.
- * - if user has specified sockets to allocate memory on, there may be
- * some "unused" memory left (e.g. if user has specified --socket-mem
- * such that not all memory adds up to 2 gigabytes), so add it to all
- * sockets that are in use equally.
- *
- * page sizes are sorted by size in descending order, so we can safely
- * assume that we dispense with bigger page sizes first.
- */
-
- /* create memseg lists */
- for (i = 0; i < rte_socket_count(); i++) {
- int hp_sizes = (int) internal_config.num_hugepage_sizes;
- uint64_t max_socket_mem, cur_socket_mem;
- unsigned int master_lcore_socket;
- struct rte_config *cfg = rte_eal_get_configuration();
- bool skip;
-
- socket_id = rte_socket_id_by_idx(i);
-
-#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
- /* we can still sort pages by socket in legacy mode */
- if (!internal_config.legacy_mem && socket_id > 0)
- break;
-#endif
-
- /* if we didn't specifically request memory on this socket */
- skip = active_sockets != 0 &&
- internal_config.socket_mem[socket_id] == 0;
- /* ...or if we didn't specifically request memory on *any*
- * socket, and this is not master lcore
- */
- master_lcore_socket = rte_lcore_to_socket_id(cfg->master_lcore);
- skip |= active_sockets == 0 && socket_id != master_lcore_socket;
-
- if (skip) {
- RTE_LOG(DEBUG, EAL, "Will not preallocate memory on socket %u\n",
- socket_id);
- continue;
- }
-
- /* max amount of memory on this socket */
- max_socket_mem = (active_sockets != 0 ?
- internal_config.socket_mem[socket_id] :
- internal_config.memory) +
- extra_mem_per_socket;
- cur_socket_mem = 0;
-
- for (hpi_idx = 0; hpi_idx < hp_sizes; hpi_idx++) {
- uint64_t max_pagesz_mem, cur_pagesz_mem = 0;
- uint64_t hugepage_sz;
- struct hugepage_info *hpi;
- int type_msl_idx, max_segs, total_segs = 0;
-
- hpi = &internal_config.hugepage_info[hpi_idx];
- hugepage_sz = hpi->hugepage_sz;
-
- /* check if pages are actually available */
- if (hpi->num_pages[socket_id] == 0)
- continue;
-
- max_segs = RTE_MAX_MEMSEG_PER_TYPE;
- max_pagesz_mem = max_socket_mem - cur_socket_mem;
-
- /* make it multiple of page size */
- max_pagesz_mem = RTE_ALIGN_FLOOR(max_pagesz_mem,
- hugepage_sz);
-
- RTE_LOG(DEBUG, EAL, "Attempting to preallocate "
- "%" PRIu64 "M on socket %i\n",
- max_pagesz_mem >> 20, socket_id);
-
- type_msl_idx = 0;
- while (cur_pagesz_mem < max_pagesz_mem &&
- total_segs < max_segs) {
- uint64_t cur_mem;
- unsigned int n_segs;
-
- if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
- RTE_LOG(ERR, EAL,
- "No more space in memseg lists, please increase %s\n",
- RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
- return -1;
- }
-
- msl = &mcfg->memsegs[msl_idx];
-
- cur_mem = get_mem_amount(hugepage_sz,
- max_pagesz_mem);
- n_segs = cur_mem / hugepage_sz;
-
- if (alloc_memseg_list(msl, hugepage_sz, n_segs,
- socket_id, type_msl_idx)) {
- /* failing to allocate a memseg list is
- * a serious error.
- */
- RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n");
- return -1;
- }
-
- if (alloc_va_space(msl)) {
- /* if we couldn't allocate VA space, we
- * can try with smaller page sizes.
- */
- RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list, retrying with different page size\n");
- /* deallocate memseg list */
- if (free_memseg_list(msl))
- return -1;
- break;
- }
-
- total_segs += msl->memseg_arr.len;
- cur_pagesz_mem = total_segs * hugepage_sz;
- type_msl_idx++;
- msl_idx++;
- }
- cur_socket_mem += cur_pagesz_mem;
- }
- if (cur_socket_mem == 0) {
- RTE_LOG(ERR, EAL, "Cannot allocate VA space on socket %u\n",
- socket_id);
- return -1;
- }
- }
-
- return 0;
-}
-
-static int __rte_unused
-memseg_primary_init(void)
-{
- struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- struct memtype {
- uint64_t page_sz;
- int socket_id;
- } *memtypes = NULL;
- int i, hpi_idx, msl_idx, ret = -1; /* fail unless told to succeed */
- struct rte_memseg_list *msl;
- uint64_t max_mem, max_mem_per_type;
- unsigned int max_seglists_per_type;
- unsigned int n_memtypes, cur_type;
-
- /* no-huge does not need this at all */
- if (internal_config.no_hugetlbfs)
- return 0;
-
- /*
- * figuring out amount of memory we're going to have is a long and very
- * involved process. the basic element we're operating with is a memory
- * type, defined as a combination of NUMA node ID and page size (so that
- * e.g. 2 sockets with 2 page sizes yield 4 memory types in total).
- *
- * deciding amount of memory going towards each memory type is a
- * balancing act between maximum segments per type, maximum memory per
- * type, and number of detected NUMA nodes. the goal is to make sure
- * each memory type gets at least one memseg list.
- *
- * the total amount of memory is limited by RTE_MAX_MEM_MB value.
- *
- * the total amount of memory per type is limited by either
- * RTE_MAX_MEM_MB_PER_TYPE, or by RTE_MAX_MEM_MB divided by the number
- * of detected NUMA nodes. additionally, maximum number of segments per
- * type is also limited by RTE_MAX_MEMSEG_PER_TYPE. this is because for
- * smaller page sizes, it can take hundreds of thousands of segments to
- * reach the above specified per-type memory limits.
- *
- * additionally, each type may have multiple memseg lists associated
- * with it, each limited by either RTE_MAX_MEM_MB_PER_LIST for bigger
- * page sizes, or RTE_MAX_MEMSEG_PER_LIST segments for smaller ones.
- *
- * the number of memseg lists per type is decided based on the above
- * limits, and also taking number of detected NUMA nodes, to make sure
- * that we don't run out of memseg lists before we populate all NUMA
- * nodes with memory.
- *
- * we do this in three stages. first, we collect the number of types.
- * then, we figure out memory constraints and populate the list of
- * would-be memseg lists. then, we go ahead and allocate the memseg
- * lists.
- */
-
- /* create space for mem types */
- n_memtypes = internal_config.num_hugepage_sizes * rte_socket_count();
- memtypes = calloc(n_memtypes, sizeof(*memtypes));
- if (memtypes == NULL) {
- RTE_LOG(ERR, EAL, "Cannot allocate space for memory types\n");
- return -1;
- }
-
- /* populate mem types */
- cur_type = 0;
- for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes;
- hpi_idx++) {
- struct hugepage_info *hpi;
- uint64_t hugepage_sz;
-
- hpi = &internal_config.hugepage_info[hpi_idx];
- hugepage_sz = hpi->hugepage_sz;
-
- for (i = 0; i < (int) rte_socket_count(); i++, cur_type++) {
- int socket_id = rte_socket_id_by_idx(i);
-
-#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
- /* we can still sort pages by socket in legacy mode */
- if (!internal_config.legacy_mem && socket_id > 0)
- break;
-#endif
- memtypes[cur_type].page_sz = hugepage_sz;
- memtypes[cur_type].socket_id = socket_id;
-
- RTE_LOG(DEBUG, EAL, "Detected memory type: "
- "socket_id:%u hugepage_sz:%" PRIu64 "\n",
- socket_id, hugepage_sz);
- }
- }
- /* number of memtypes could have been lower due to no NUMA support */
- n_memtypes = cur_type;
-
- /* set up limits for types */
- max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
- max_mem_per_type = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20,
- max_mem / n_memtypes);
- /*
- * limit maximum number of segment lists per type to ensure there's
- * space for memseg lists for all NUMA nodes with all page sizes
- */
- max_seglists_per_type = RTE_MAX_MEMSEG_LISTS / n_memtypes;
-
- if (max_seglists_per_type == 0) {
- RTE_LOG(ERR, EAL, "Cannot accommodate all memory types, please increase %s\n",
- RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
- goto out;
- }
-
- /* go through all mem types and create segment lists */
- msl_idx = 0;
- for (cur_type = 0; cur_type < n_memtypes; cur_type++) {
- unsigned int cur_seglist, n_seglists, n_segs;
- unsigned int max_segs_per_type, max_segs_per_list;
- struct memtype *type = &memtypes[cur_type];
- uint64_t max_mem_per_list, pagesz;
- int socket_id;
-
- pagesz = type->page_sz;
- socket_id = type->socket_id;
-
- /*
- * we need to create segment lists for this type. we must take
- * into account the following things:
- *
- * 1. total amount of memory we can use for this memory type
- * 2. total amount of memory per memseg list allowed
- * 3. number of segments needed to fit the amount of memory
- * 4. number of segments allowed per type
- * 5. number of segments allowed per memseg list
- * 6. number of memseg lists we are allowed to take up
- */
-
- /* calculate how much segments we will need in total */
- max_segs_per_type = max_mem_per_type / pagesz;
- /* limit number of segments to maximum allowed per type */
- max_segs_per_type = RTE_MIN(max_segs_per_type,
- (unsigned int)RTE_MAX_MEMSEG_PER_TYPE);
- /* limit number of segments to maximum allowed per list */
- max_segs_per_list = RTE_MIN(max_segs_per_type,
- (unsigned int)RTE_MAX_MEMSEG_PER_LIST);
-
- /* calculate how much memory we can have per segment list */
- max_mem_per_list = RTE_MIN(max_segs_per_list * pagesz,
- (uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20);
-
- /* calculate how many segments each segment list will have */
- n_segs = RTE_MIN(max_segs_per_list, max_mem_per_list / pagesz);
-
- /* calculate how many segment lists we can have */
- n_seglists = RTE_MIN(max_segs_per_type / n_segs,
- max_mem_per_type / max_mem_per_list);
-
- /* limit number of segment lists according to our maximum */
- n_seglists = RTE_MIN(n_seglists, max_seglists_per_type);
-
- RTE_LOG(DEBUG, EAL, "Creating %i segment lists: "
- "n_segs:%i socket_id:%i hugepage_sz:%" PRIu64 "\n",
- n_seglists, n_segs, socket_id, pagesz);
-
- /* create all segment lists */
- for (cur_seglist = 0; cur_seglist < n_seglists; cur_seglist++) {
- if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
- RTE_LOG(ERR, EAL,
- "No more space in memseg lists, please increase %s\n",
- RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
- goto out;
- }
- msl = &mcfg->memsegs[msl_idx++];
-
- if (alloc_memseg_list(msl, pagesz, n_segs,
- socket_id, cur_seglist))
- goto out;
-
- if (alloc_va_space(msl)) {
- RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
- goto out;
- }
- }
- }
- /* we're successful */
- ret = 0;
-out:
- free(memtypes);
- return ret;
-}
-
-static int
-memseg_secondary_init(void)
-{
- struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- int msl_idx = 0;
- struct rte_memseg_list *msl;
-
- for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
-
- msl = &mcfg->memsegs[msl_idx];
-
- /* skip empty memseg lists */
- if (msl->memseg_arr.len == 0)
- continue;
-
- if (rte_fbarray_attach(&msl->memseg_arr)) {
- RTE_LOG(ERR, EAL, "Cannot attach to primary process memseg lists\n");
- return -1;
- }
-
- /* preallocate VA space */
- if (alloc_va_space(msl)) {
- RTE_LOG(ERR, EAL, "Cannot preallocate VA space for hugepage memory\n");
- return -1;
- }
- }
-
- return 0;
-}
-
-int
-rte_eal_memseg_init(void)
-{
- /* increase rlimit to maximum */
- struct rlimit lim;
-
- if (getrlimit(RLIMIT_NOFILE, &lim) == 0) {
- /* set limit to maximum */
- lim.rlim_cur = lim.rlim_max;
-
- if (setrlimit(RLIMIT_NOFILE, &lim) < 0) {
- RTE_LOG(DEBUG, EAL, "Setting maximum number of open files failed: %s\n",
- strerror(errno));
- } else {
- RTE_LOG(DEBUG, EAL, "Setting maximum number of open files to %"
- PRIu64 "\n",
- (uint64_t)lim.rlim_cur);
- }
- } else {
- RTE_LOG(ERR, EAL, "Cannot get current resource limits\n");
- }
-#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
- if (!internal_config.legacy_mem && rte_socket_count() > 1) {
- RTE_LOG(WARNING, EAL, "DPDK is running on a NUMA system, but is compiled without NUMA support.\n");
- RTE_LOG(WARNING, EAL, "This will have adverse consequences for performance and usability.\n");
- RTE_LOG(WARNING, EAL, "Please use --"OPT_LEGACY_MEM" option, or recompile with NUMA support.\n");
- }
-#endif
-
- return rte_eal_process_type() == RTE_PROC_PRIMARY ?
-#ifndef RTE_ARCH_64
- memseg_primary_init_32() :
-#else
- memseg_primary_init() :
-#endif
- memseg_secondary_init();
-}
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
- */
-
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <unistd.h>
-#include <pthread.h>
-#include <sched.h>
-#include <sys/queue.h>
-#include <sys/syscall.h>
-
-#include <rte_debug.h>
-#include <rte_atomic.h>
-#include <rte_launch.h>
-#include <rte_log.h>
-#include <rte_memory.h>
-#include <rte_per_lcore.h>
-#include <rte_eal.h>
-#include <rte_lcore.h>
-
-#include "eal_private.h"
-#include "eal_thread.h"
-
-RTE_DEFINE_PER_LCORE(unsigned, _lcore_id) = LCORE_ID_ANY;
-RTE_DEFINE_PER_LCORE(unsigned, _socket_id) = (unsigned)SOCKET_ID_ANY;
-RTE_DEFINE_PER_LCORE(rte_cpuset_t, _cpuset);
-
-/*
- * Send a message to a slave lcore identified by slave_id to call a
- * function f with argument arg. Once the execution is done, the
- * remote lcore switches to the FINISHED state.
- */
-int
-rte_eal_remote_launch(int (*f)(void *), void *arg, unsigned slave_id)
-{
- int n;
- char c = 0;
- int m2s = lcore_config[slave_id].pipe_master2slave[1];
- int s2m = lcore_config[slave_id].pipe_slave2master[0];
-
- if (lcore_config[slave_id].state != WAIT)
- return -EBUSY;
-
- lcore_config[slave_id].f = f;
- lcore_config[slave_id].arg = arg;
-
- /* send message */
- n = 0;
- while (n == 0 || (n < 0 && errno == EINTR))
- n = write(m2s, &c, 1);
- if (n < 0)
- rte_panic("cannot write on configuration pipe\n");
-
- /* wait ack */
- do {
- n = read(s2m, &c, 1);
- } while (n < 0 && errno == EINTR);
-
- if (n <= 0)
- rte_panic("cannot read on configuration pipe\n");
-
- return 0;
-}
-
-/* set affinity for current EAL thread */
-static int
-eal_thread_set_affinity(void)
-{
- unsigned lcore_id = rte_lcore_id();
-
- /* acquire system unique id */
- rte_gettid();
-
- /* update EAL thread core affinity */
- return rte_thread_set_affinity(&lcore_config[lcore_id].cpuset);
-}
-
-void eal_thread_init_master(unsigned lcore_id)
-{
- /* set the lcore ID in per-lcore memory area */
- RTE_PER_LCORE(_lcore_id) = lcore_id;
-
- /* set CPU affinity */
- if (eal_thread_set_affinity() < 0)
- rte_panic("cannot set affinity\n");
-}
-
-/* main loop of threads */
-__attribute__((noreturn)) void *
-eal_thread_loop(__attribute__((unused)) void *arg)
-{
- char c;
- int n, ret;
- unsigned lcore_id;
- pthread_t thread_id;
- int m2s, s2m;
- char cpuset[RTE_CPU_AFFINITY_STR_LEN];
-
- thread_id = pthread_self();
-
- /* retrieve our lcore_id from the configuration structure */
- RTE_LCORE_FOREACH_SLAVE(lcore_id) {
- if (thread_id == lcore_config[lcore_id].thread_id)
- break;
- }
- if (lcore_id == RTE_MAX_LCORE)
- rte_panic("cannot retrieve lcore id\n");
-
- m2s = lcore_config[lcore_id].pipe_master2slave[0];
- s2m = lcore_config[lcore_id].pipe_slave2master[1];
-
- /* set the lcore ID in per-lcore memory area */
- RTE_PER_LCORE(_lcore_id) = lcore_id;
-
- /* set CPU affinity */
- if (eal_thread_set_affinity() < 0)
- rte_panic("cannot set affinity\n");
-
- ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset));
-
- RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%zx;cpuset=[%s%s])\n",
- lcore_id, (uintptr_t)thread_id, cpuset, ret == 0 ? "" : "...");
-
- /* read on our pipe to get commands */
- while (1) {
- void *fct_arg;
-
- /* wait command */
- do {
- n = read(m2s, &c, 1);
- } while (n < 0 && errno == EINTR);
-
- if (n <= 0)
- rte_panic("cannot read on configuration pipe\n");
-
- lcore_config[lcore_id].state = RUNNING;
-
- /* send ack */
- n = 0;
- while (n == 0 || (n < 0 && errno == EINTR))
- n = write(s2m, &c, 1);
- if (n < 0)
- rte_panic("cannot write on configuration pipe\n");
-
- if (lcore_config[lcore_id].f == NULL)
- rte_panic("NULL function pointer\n");
-
- /* call the function and store the return value */
- fct_arg = lcore_config[lcore_id].arg;
- ret = lcore_config[lcore_id].f(fct_arg);
- lcore_config[lcore_id].ret = ret;
- rte_wmb();
-
- /* when a service core returns, it should go directly to WAIT
- * state, because the application will not lcore_wait() for it.
- */
- if (lcore_config[lcore_id].core_role == ROLE_SERVICE)
- lcore_config[lcore_id].state = WAIT;
- else
- lcore_config[lcore_id].state = FINISHED;
- }
-
- /* never reached */
- /* pthread_exit(NULL); */
- /* return NULL; */
-}
-
-/* get the tid of the calling thread, using the gettid() syscall */
-int rte_sys_gettid(void)
-{
- return (int)syscall(SYS_gettid);
-}
-
-int rte_thread_setname(pthread_t id, const char *name)
-{
- int ret = ENOSYS;
-#if defined(__GLIBC__) && defined(__GLIBC_PREREQ)
-#if __GLIBC_PREREQ(2, 12)
- ret = pthread_setname_np(id, name);
-#endif
-#endif
- RTE_SET_USED(id);
- RTE_SET_USED(name);
- return -ret;
-}
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation.
- * Copyright(c) 2012-2013 6WIND S.A.
- */
-
-#include <string.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <inttypes.h>
-#include <sys/mman.h>
-#include <sys/queue.h>
-#include <pthread.h>
-#include <errno.h>
-
-#include <rte_common.h>
-#include <rte_log.h>
-#include <rte_cycles.h>
-#include <rte_lcore.h>
-#include <rte_memory.h>
-#include <rte_eal.h>
-#include <rte_debug.h>
-
-#include "eal_private.h"
-#include "eal_internal_cfg.h"
-
-enum timer_source eal_timer_source = EAL_TIMER_HPET;
-
-#ifdef RTE_LIBEAL_USE_HPET
-
-#define DEV_HPET "/dev/hpet"
-
-/* Maximum number of counters. */
-#define HPET_TIMER_NUM 3
-
-/* General capabilities register */
-#define CLK_PERIOD_SHIFT 32 /* Clock period shift. */
-#define CLK_PERIOD_MASK 0xffffffff00000000ULL /* Clock period mask. */
-
-/**
- * HPET timer registers. From the Intel IA-PC HPET (High Precision Event
- * Timers) Specification.
- */
-struct eal_hpet_regs {
- /* Memory-mapped, software visible registers */
- uint64_t capabilities; /**< RO General Capabilities Register. */
- uint64_t reserved0; /**< Reserved for future use. */
- uint64_t config; /**< RW General Configuration Register. */
- uint64_t reserved1; /**< Reserved for future use. */
- uint64_t isr; /**< RW Clear General Interrupt Status. */
- uint64_t reserved2[25]; /**< Reserved for future use. */
- union {
- uint64_t counter; /**< RW Main Counter Value Register. */
- struct {
- uint32_t counter_l; /**< RW Main Counter Low. */
- uint32_t counter_h; /**< RW Main Counter High. */
- };
- };
- uint64_t reserved3; /**< Reserved for future use. */
- struct {
- uint64_t config; /**< RW Timer Config and Capability Reg. */
- uint64_t comp; /**< RW Timer Comparator Value Register. */
- uint64_t fsb; /**< RW FSB Interrupt Route Register. */
- uint64_t reserved4; /**< Reserved for future use. */
- } timers[HPET_TIMER_NUM]; /**< Set of HPET timers. */
-};
-
-/* Mmap'd hpet registers */
-static volatile struct eal_hpet_regs *eal_hpet = NULL;
-
-/* Period at which the HPET counter increments in
- * femtoseconds (10^-15 seconds). */
-static uint32_t eal_hpet_resolution_fs = 0;
-
-/* Frequency of the HPET counter in Hz */
-static uint64_t eal_hpet_resolution_hz = 0;
-
-/* Incremented 4 times during one full 32-bit hpet count */
-static uint32_t eal_hpet_msb;
-
-static pthread_t msb_inc_thread_id;
-
-/*
- * This function runs on a specific thread to update a global variable
- * tracking the MSB of the HPET counter (unfortunately, we need
- * this because the hpet counter is 32 bits by default under linux).
- */
-static void *
-hpet_msb_inc(__attribute__((unused)) void *arg)
-{
- uint32_t t;
-
- while (1) {
- t = (eal_hpet->counter_l >> 30);
- if (t != (eal_hpet_msb & 3))
- eal_hpet_msb++;
- sleep(10);
- }
- return NULL;
-}
-
-uint64_t
-rte_get_hpet_hz(void)
-{
- if (internal_config.no_hpet)
- rte_panic("Error, HPET called, but no HPET present\n");
-
- return eal_hpet_resolution_hz;
-}
-
-uint64_t
-rte_get_hpet_cycles(void)
-{
- uint32_t t, msb;
- uint64_t ret;
-
- if (internal_config.no_hpet)
- rte_panic("Error, HPET called, but no HPET present\n");
-
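- /* reconstruct a 64-bit value from the 32-bit hardware counter and
- * the software-maintained MSB (see hpet_msb_inc())
- */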
- t = eal_hpet->counter_l;
- msb = eal_hpet_msb;
- ret = (msb + 2 - (t >> 30)) / 4;
- ret <<= 32;
- ret += t;
- return ret;
-}
-
-#endif
-
-#ifdef RTE_LIBEAL_USE_HPET
-/*
- * Open and mmap /dev/hpet (high precision event timer) that will
- * provide our time reference.
- */
-int
-rte_eal_hpet_init(int make_default)
-{
- int fd, ret;
-
- if (internal_config.no_hpet) {
- RTE_LOG(NOTICE, EAL, "HPET is disabled\n");
- return -1;
- }
-
- fd = open(DEV_HPET, O_RDONLY);
- if (fd < 0) {
- RTE_LOG(ERR, EAL, "ERROR: Cannot open "DEV_HPET": %s!\n",
- strerror(errno));
- internal_config.no_hpet = 1;
- return -1;
- }
- eal_hpet = mmap(NULL, 1024, PROT_READ, MAP_SHARED, fd, 0);
- if (eal_hpet == MAP_FAILED) {
- RTE_LOG(ERR, EAL, "ERROR: Cannot mmap "DEV_HPET"!\n"
- "Please enable CONFIG_HPET_MMAP in your kernel configuration "
- "to allow HPET support.\n"
- "To run without using HPET, set CONFIG_RTE_LIBEAL_USE_HPET=n "
- "in your build configuration or use '--no-hpet' EAL flag.\n");
- close(fd);
- internal_config.no_hpet = 1;
- return -1;
- }
- close(fd);
-
- eal_hpet_resolution_fs = (uint32_t)((eal_hpet->capabilities &
- CLK_PERIOD_MASK) >>
- CLK_PERIOD_SHIFT);
-
- eal_hpet_resolution_hz = (1000ULL*1000ULL*1000ULL*1000ULL*1000ULL) /
- (uint64_t)eal_hpet_resolution_fs;
-
- RTE_LOG(INFO, EAL, "HPET frequency is ~%"PRIu64" kHz\n",
- eal_hpet_resolution_hz/1000);
-
- eal_hpet_msb = (eal_hpet->counter_l >> 30);
-
- /* create a thread that will increment a global variable for
- * msb (hpet is 32 bits by default under linux) */
- ret = rte_ctrl_thread_create(&msb_inc_thread_id, "hpet-msb-inc", NULL,
- hpet_msb_inc, NULL);
- if (ret != 0) {
- RTE_LOG(ERR, EAL, "ERROR: Cannot create HPET timer thread!\n");
- internal_config.no_hpet = 1;
- return -1;
- }
-
- if (make_default)
- eal_timer_source = EAL_TIMER_HPET;
- return 0;
-}
-#endif
-
-uint64_t
-get_tsc_freq(void)
-{
-#ifdef CLOCK_MONOTONIC_RAW
-#define NS_PER_SEC 1E9
-#define CYC_PER_10MHZ 1E7
-
- struct timespec sleeptime = {.tv_nsec = NS_PER_SEC / 10 }; /* 1/10 second */
-
- struct timespec t_start, t_end;
- uint64_t tsc_hz;
-
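- /* estimate the TSC frequency from the number of TSC cycles elapsed
- * over a ~100ms nanosleep, timed with CLOCK_MONOTONIC_RAW
- */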
- if (clock_gettime(CLOCK_MONOTONIC_RAW, &t_start) == 0) {
- uint64_t ns, end, start = rte_rdtsc();
- nanosleep(&sleeptime, NULL);
- clock_gettime(CLOCK_MONOTONIC_RAW, &t_end);
- end = rte_rdtsc();
- ns = ((t_end.tv_sec - t_start.tv_sec) * NS_PER_SEC);
- ns += (t_end.tv_nsec - t_start.tv_nsec);
-
- double secs = (double)ns/NS_PER_SEC;
- tsc_hz = (uint64_t)((end - start)/secs);
- /* Round to the nearest 10MHz. 1E7 ~ 10MHz */
- return RTE_ALIGN_MUL_NEAR(tsc_hz, CYC_PER_10MHZ);
- }
-#endif
- return 0;
-}
-
-int
-rte_eal_timer_init(void)
-{
-
- eal_timer_source = EAL_TIMER_TSC;
-
- set_tsc_freq();
- return 0;
-}
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2018 Intel Corporation
- */
-
-#include <inttypes.h>
-#include <string.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <sys/ioctl.h>
-
-#include <rte_errno.h>
-#include <rte_log.h>
-#include <rte_memory.h>
-#include <rte_eal_memconfig.h>
-#include <rte_vfio.h>
-
-#include "eal_filesystem.h"
-#include "eal_memcfg.h"
-#include "eal_vfio.h"
-#include "eal_private.h"
-
-#ifdef VFIO_PRESENT
-
-#define VFIO_MEM_EVENT_CLB_NAME "vfio_mem_event_clb"
-
-/* hot plug/unplug of VFIO groups may cause all DMA maps to be dropped. we can
- * recreate the mappings for DPDK segments, but we cannot do so for memory that
- * was registered by the user themselves, so we need to store the user mappings
- * somewhere, to recreate them later.
- */
-#define VFIO_MAX_USER_MEM_MAPS 256
-struct user_mem_map {
- uint64_t addr;
- uint64_t iova;
- uint64_t len;
-};
-
-struct user_mem_maps {
- rte_spinlock_recursive_t lock;
- int n_maps;
- struct user_mem_map maps[VFIO_MAX_USER_MEM_MAPS];
-};
-
-struct vfio_config {
- int vfio_enabled;
- int vfio_container_fd;
- int vfio_active_groups;
- const struct vfio_iommu_type *vfio_iommu_type;
- struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
- struct user_mem_maps mem_maps;
-};
-
-/* per-process VFIO config */
-static struct vfio_config vfio_cfgs[VFIO_MAX_CONTAINERS];
-static struct vfio_config *default_vfio_cfg = &vfio_cfgs[0];
-
-static int vfio_type1_dma_map(int);
-static int vfio_type1_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
-static int vfio_spapr_dma_map(int);
-static int vfio_spapr_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
-static int vfio_noiommu_dma_map(int);
-static int vfio_noiommu_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
-static int vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr,
- uint64_t iova, uint64_t len, int do_map);
-
-/* IOMMU types we support */
-static const struct vfio_iommu_type iommu_types[] = {
- /* x86 IOMMU, otherwise known as type 1 */
- {
- .type_id = RTE_VFIO_TYPE1,
- .name = "Type 1",
- .dma_map_func = &vfio_type1_dma_map,
- .dma_user_map_func = &vfio_type1_dma_mem_map
- },
- /* ppc64 IOMMU, otherwise known as spapr */
- {
- .type_id = RTE_VFIO_SPAPR,
- .name = "sPAPR",
- .dma_map_func = &vfio_spapr_dma_map,
- .dma_user_map_func = &vfio_spapr_dma_mem_map
- },
- /* IOMMU-less mode */
- {
- .type_id = RTE_VFIO_NOIOMMU,
- .name = "No-IOMMU",
- .dma_map_func = &vfio_noiommu_dma_map,
- .dma_user_map_func = &vfio_noiommu_dma_mem_map
- },
-};
-
-static int
-is_null_map(const struct user_mem_map *map)
-{
- return map->addr == 0 && map->iova == 0 && map->len == 0;
-}
-
-/* we may need to merge user mem maps together in case of user mapping/unmapping
- * chunks of memory, so we'll need a comparator function to sort segments.
- */
-static int
-user_mem_map_cmp(const void *a, const void *b)
-{
- const struct user_mem_map *umm_a = a;
- const struct user_mem_map *umm_b = b;
-
- /* move null entries to end */
- if (is_null_map(umm_a))
- return 1;
- if (is_null_map(umm_b))
- return -1;
-
- /* sort by iova first */
- if (umm_a->iova < umm_b->iova)
- return -1;
- if (umm_a->iova > umm_b->iova)
- return 1;
-
- if (umm_a->addr < umm_b->addr)
- return -1;
- if (umm_a->addr > umm_b->addr)
- return 1;
-
- if (umm_a->len < umm_b->len)
- return -1;
- if (umm_a->len > umm_b->len)
- return 1;
-
- return 0;
-}
-
-/* adjust user map entry. this may result in shortening an existing map, or in
- * splitting an existing map in two pieces.
- */
-static void
-adjust_map(struct user_mem_map *src, struct user_mem_map *end,
- uint64_t remove_va_start, uint64_t remove_len)
-{
- /* if va start is same as start address, we're simply moving start */
- if (remove_va_start == src->addr) {
- src->addr += remove_len;
- src->iova += remove_len;
- src->len -= remove_len;
- } else if (remove_va_start + remove_len == src->addr + src->len) {
- /* we're shrinking mapping from the end */
- src->len -= remove_len;
- } else {
- /* we're blowing a hole in the middle */
- struct user_mem_map tmp;
- uint64_t total_len = src->len;
-
- /* adjust source segment length */
- src->len = remove_va_start - src->addr;
-
- /* create temporary segment in the middle */
- tmp.addr = src->addr + src->len;
- tmp.iova = src->iova + src->len;
- tmp.len = remove_len;
-
- /* populate end segment - this one we will be keeping */
- end->addr = tmp.addr + tmp.len;
- end->iova = tmp.iova + tmp.len;
- end->len = total_len - src->len - tmp.len;
- }
-}
-
-/* try merging two maps into one, return 1 if succeeded */
-static int
-merge_map(struct user_mem_map *left, struct user_mem_map *right)
-{
- if (left->addr + left->len != right->addr)
- return 0;
- if (left->iova + left->len != right->iova)
- return 0;
-
- left->len += right->len;
-
- memset(right, 0, sizeof(*right));
-
- return 1;
-}
-
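-/* find an existing user mem map that fully contains both the specified VA
- * range and the specified IOVA range
- */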
-static struct user_mem_map *
-find_user_mem_map(struct user_mem_maps *user_mem_maps, uint64_t addr,
- uint64_t iova, uint64_t len)
-{
- uint64_t va_end = addr + len;
- uint64_t iova_end = iova + len;
- int i;
-
- for (i = 0; i < user_mem_maps->n_maps; i++) {
- struct user_mem_map *map = &user_mem_maps->maps[i];
- uint64_t map_va_end = map->addr + map->len;
- uint64_t map_iova_end = map->iova + map->len;
-
- /* check start VA */
- if (addr < map->addr || addr >= map_va_end)
- continue;
- /* check if VA end is within boundaries */
- if (va_end <= map->addr || va_end > map_va_end)
- continue;
-
- /* check start IOVA */
- if (iova < map->iova || iova >= map_iova_end)
- continue;
- /* check if IOVA end is within boundaries */
- if (iova_end <= map->iova || iova_end > map_iova_end)
- continue;
-
- /* we've found our map */
- return map;
- }
- return NULL;
-}
-
-/* this will sort all user maps, and merge/compact any adjacent maps */
-static void
-compact_user_maps(struct user_mem_maps *user_mem_maps)
-{
- int i, n_merged, cur_idx;
-
- qsort(user_mem_maps->maps, user_mem_maps->n_maps,
- sizeof(user_mem_maps->maps[0]), user_mem_map_cmp);
-
- /* we'll go over the list backwards when merging */
- n_merged = 0;
- for (i = user_mem_maps->n_maps - 2; i >= 0; i--) {
- struct user_mem_map *l, *r;
-
- l = &user_mem_maps->maps[i];
- r = &user_mem_maps->maps[i + 1];
-
- if (is_null_map(l) || is_null_map(r))
- continue;
-
- if (merge_map(l, r))
- n_merged++;
- }
-
- /* the entries are still sorted, but now they have holes in them, so
- * walk through the list and remove the holes
- */
- if (n_merged > 0) {
- cur_idx = 0;
- for (i = 0; i < user_mem_maps->n_maps; i++) {
- if (!is_null_map(&user_mem_maps->maps[i])) {
- struct user_mem_map *src, *dst;
-
- src = &user_mem_maps->maps[i];
- dst = &user_mem_maps->maps[cur_idx++];
-
- if (src != dst) {
- memcpy(dst, src, sizeof(*src));
- memset(src, 0, sizeof(*src));
- }
- }
- }
- user_mem_maps->n_maps = cur_idx;
- }
-}
-
-static int
-vfio_open_group_fd(int iommu_group_num)
-{
- int vfio_group_fd;
- char filename[PATH_MAX];
- struct rte_mp_msg mp_req, *mp_rep;
- struct rte_mp_reply mp_reply = {0};
- struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
- struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
-
- /* if primary, try to open the group */
- if (internal_config.process_type == RTE_PROC_PRIMARY) {
- /* try regular group format */
- snprintf(filename, sizeof(filename),
- VFIO_GROUP_FMT, iommu_group_num);
- vfio_group_fd = open(filename, O_RDWR);
- if (vfio_group_fd < 0) {
- /* if file not found, it's not an error */
- if (errno != ENOENT) {
- RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename,
- strerror(errno));
- return -1;
- }
-
- /* special case: try no-IOMMU path as well */
- snprintf(filename, sizeof(filename),
- VFIO_NOIOMMU_GROUP_FMT,
- iommu_group_num);
- vfio_group_fd = open(filename, O_RDWR);
- if (vfio_group_fd < 0) {
- if (errno != ENOENT) {
- RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename,
- strerror(errno));
- return -1;
- }
- return 0;
- }
- /* noiommu group found */
- }
-
- return vfio_group_fd;
- }
- /* if we're in a secondary process, request group fd from the primary
- * process via mp channel.
- */
- p->req = SOCKET_REQ_GROUP;
- p->group_num = iommu_group_num;
- strcpy(mp_req.name, EAL_VFIO_MP);
- mp_req.len_param = sizeof(*p);
- mp_req.num_fds = 0;
-
- vfio_group_fd = -1;
- if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
- mp_reply.nb_received == 1) {
- mp_rep = &mp_reply.msgs[0];
- p = (struct vfio_mp_param *)mp_rep->param;
- if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
- vfio_group_fd = mp_rep->fds[0];
- } else if (p->result == SOCKET_NO_FD) {
- RTE_LOG(ERR, EAL, " bad VFIO group fd\n");
- vfio_group_fd = 0;
- }
- }
-
- free(mp_reply.msgs);
- if (vfio_group_fd < 0)
- RTE_LOG(ERR, EAL, " cannot request group fd\n");
- return vfio_group_fd;
-}
-
-static struct vfio_config *
-get_vfio_cfg_by_group_num(int iommu_group_num)
-{
- struct vfio_config *vfio_cfg;
- int i, j;
-
- for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
- vfio_cfg = &vfio_cfgs[i];
- for (j = 0; j < VFIO_MAX_GROUPS; j++) {
- if (vfio_cfg->vfio_groups[j].group_num ==
- iommu_group_num)
- return vfio_cfg;
- }
- }
-
- return NULL;
-}
-
-static int
-vfio_get_group_fd(struct vfio_config *vfio_cfg,
- int iommu_group_num)
-{
- int i;
- int vfio_group_fd;
- struct vfio_group *cur_grp;
-
- /* check if we already have the group descriptor open */
- for (i = 0; i < VFIO_MAX_GROUPS; i++)
- if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num)
- return vfio_cfg->vfio_groups[i].fd;
-
- /* Let's first see if there is room for a new group */
- if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) {
- RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
- return -1;
- }
-
- /* Now let's get an index for the new group */
- for (i = 0; i < VFIO_MAX_GROUPS; i++)
- if (vfio_cfg->vfio_groups[i].group_num == -1) {
- cur_grp = &vfio_cfg->vfio_groups[i];
- break;
- }
-
- /* This should not happen */
- if (i == VFIO_MAX_GROUPS) {
- RTE_LOG(ERR, EAL, "No VFIO group free slot found\n");
- return -1;
- }
-
- vfio_group_fd = vfio_open_group_fd(iommu_group_num);
- if (vfio_group_fd < 0) {
- RTE_LOG(ERR, EAL, "Failed to open group %d\n", iommu_group_num);
- return -1;
- }
-
- cur_grp->group_num = iommu_group_num;
- cur_grp->fd = vfio_group_fd;
- vfio_cfg->vfio_active_groups++;
-
- return vfio_group_fd;
-}
-
-static struct vfio_config *
-get_vfio_cfg_by_group_fd(int vfio_group_fd)
-{
- struct vfio_config *vfio_cfg;
- int i, j;
-
- for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
- vfio_cfg = &vfio_cfgs[i];
- for (j = 0; j < VFIO_MAX_GROUPS; j++)
- if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
- return vfio_cfg;
- }
-
- return NULL;
-}
-
-static struct vfio_config *
-get_vfio_cfg_by_container_fd(int container_fd)
-{
- int i;
-
- if (container_fd == RTE_VFIO_DEFAULT_CONTAINER_FD)
- return default_vfio_cfg;
-
- for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
- if (vfio_cfgs[i].vfio_container_fd == container_fd)
- return &vfio_cfgs[i];
- }
-
- return NULL;
-}
-
-int
-rte_vfio_get_group_fd(int iommu_group_num)
-{
- struct vfio_config *vfio_cfg;
-
- /* get the vfio_config it belongs to */
- vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
- vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
-
- return vfio_get_group_fd(vfio_cfg, iommu_group_num);
-}
-
-static int
-get_vfio_group_idx(int vfio_group_fd)
-{
- struct vfio_config *vfio_cfg;
- int i, j;
-
- for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
- vfio_cfg = &vfio_cfgs[i];
- for (j = 0; j < VFIO_MAX_GROUPS; j++)
- if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
- return j;
- }
-
- return -1;
-}
-
-static void
-vfio_group_device_get(int vfio_group_fd)
-{
- struct vfio_config *vfio_cfg;
- int i;
-
- vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
- if (vfio_cfg == NULL) {
- RTE_LOG(ERR, EAL, " invalid group fd!\n");
- return;
- }
-
- i = get_vfio_group_idx(vfio_group_fd);
- if (i < 0 || i > (VFIO_MAX_GROUPS - 1))
- RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i);
- else
- vfio_cfg->vfio_groups[i].devices++;
-}
-
-static void
-vfio_group_device_put(int vfio_group_fd)
-{
- struct vfio_config *vfio_cfg;
- int i;
-
- vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
- if (vfio_cfg == NULL) {
- RTE_LOG(ERR, EAL, " invalid group fd!\n");
- return;
- }
-
- i = get_vfio_group_idx(vfio_group_fd);
- if (i < 0 || i > (VFIO_MAX_GROUPS - 1))
- RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i);
- else
- vfio_cfg->vfio_groups[i].devices--;
-}
-
-static int
-vfio_group_device_count(int vfio_group_fd)
-{
- struct vfio_config *vfio_cfg;
- int i;
-
- vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
- if (vfio_cfg == NULL) {
- RTE_LOG(ERR, EAL, " invalid group fd!\n");
- return -1;
- }
-
- i = get_vfio_group_idx(vfio_group_fd);
- if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) {
- RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i);
- return -1;
- }
-
- return vfio_cfg->vfio_groups[i].devices;
-}
-
-static void
-vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len,
- void *arg __rte_unused)
-{
- rte_iova_t iova_start, iova_expected;
- struct rte_memseg_list *msl;
- struct rte_memseg *ms;
- size_t cur_len = 0;
- uint64_t va_start;
-
- msl = rte_mem_virt2memseg_list(addr);
-
- /* in IOVA as VA mode, there is no need to track IOVA addresses separately */
- if (rte_eal_iova_mode() == RTE_IOVA_VA && msl->external == 0) {
- uint64_t vfio_va = (uint64_t)(uintptr_t)addr;
- if (type == RTE_MEM_EVENT_ALLOC)
- vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va,
- len, 1);
- else
- vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va,
- len, 0);
- return;
- }
-
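- /* on ppc64, the sPAPR IOMMU DMA window is recomputed from segments
- * currently marked as used, so temporarily mark the affected segments
- * free here; they are marked used again once remapping is done
- */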
-#ifdef RTE_ARCH_PPC_64
- ms = rte_mem_virt2memseg(addr, msl);
- while (cur_len < len) {
- int idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
-
- rte_fbarray_set_free(&msl->memseg_arr, idx);
- cur_len += ms->len;
- ++ms;
- }
- cur_len = 0;
-#endif
- /* memsegs are contiguous in memory */
- ms = rte_mem_virt2memseg(addr, msl);
-
- /*
- * This memory is not guaranteed to be contiguous, but it still could
- * be, or it could have some small contiguous chunks. Since the number
- * of VFIO mappings is limited, and VFIO appears to not concatenate
- * adjacent mappings, we have to do this ourselves.
- *
- * So, find contiguous chunks, then map them.
- */
- va_start = ms->addr_64;
- iova_start = iova_expected = ms->iova;
- while (cur_len < len) {
- bool new_contig_area = ms->iova != iova_expected;
- bool last_seg = (len - cur_len) == ms->len;
- bool skip_last = false;
-
- /* only do mappings when current contiguous area ends */
- if (new_contig_area) {
- if (type == RTE_MEM_EVENT_ALLOC)
- vfio_dma_mem_map(default_vfio_cfg, va_start,
- iova_start,
- iova_expected - iova_start, 1);
- else
- vfio_dma_mem_map(default_vfio_cfg, va_start,
- iova_start,
- iova_expected - iova_start, 0);
- va_start = ms->addr_64;
- iova_start = ms->iova;
- }
- /* some memory segments may have invalid IOVA */
- if (ms->iova == RTE_BAD_IOVA) {
- RTE_LOG(DEBUG, EAL, "Memory segment at %p has bad IOVA, skipping\n",
- ms->addr);
- skip_last = true;
- }
- iova_expected = ms->iova + ms->len;
- cur_len += ms->len;
- ++ms;
-
- /*
- * don't count previous segment, and don't attempt to
- * dereference a potentially invalid pointer.
- */
- if (skip_last && !last_seg) {
- iova_expected = iova_start = ms->iova;
- va_start = ms->addr_64;
- } else if (!skip_last && last_seg) {
- /* this is the last segment and we're not skipping */
- if (type == RTE_MEM_EVENT_ALLOC)
- vfio_dma_mem_map(default_vfio_cfg, va_start,
- iova_start,
- iova_expected - iova_start, 1);
- else
- vfio_dma_mem_map(default_vfio_cfg, va_start,
- iova_start,
- iova_expected - iova_start, 0);
- }
- }
-#ifdef RTE_ARCH_PPC_64
- cur_len = 0;
- ms = rte_mem_virt2memseg(addr, msl);
- while (cur_len < len) {
- int idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
-
- rte_fbarray_set_used(&msl->memseg_arr, idx);
- cur_len += ms->len;
- ++ms;
- }
-#endif
-}
-
-static int
-vfio_sync_default_container(void)
-{
- struct rte_mp_msg mp_req, *mp_rep;
- struct rte_mp_reply mp_reply = {0};
- struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
- struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
- int iommu_type_id;
- unsigned int i;
-
- /* cannot be called from primary */
- if (rte_eal_process_type() != RTE_PROC_SECONDARY)
- return -1;
-
- /* default container fd should have been opened in rte_vfio_enable() */
- if (!default_vfio_cfg->vfio_enabled ||
- default_vfio_cfg->vfio_container_fd < 0) {
- RTE_LOG(ERR, EAL, "VFIO support is not initialized\n");
- return -1;
- }
-
- /* find default container's IOMMU type */
- p->req = SOCKET_REQ_IOMMU_TYPE;
- strcpy(mp_req.name, EAL_VFIO_MP);
- mp_req.len_param = sizeof(*p);
- mp_req.num_fds = 0;
-
- iommu_type_id = -1;
- if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
- mp_reply.nb_received == 1) {
- mp_rep = &mp_reply.msgs[0];
- p = (struct vfio_mp_param *)mp_rep->param;
- if (p->result == SOCKET_OK)
- iommu_type_id = p->iommu_type_id;
- }
- free(mp_reply.msgs);
- if (iommu_type_id < 0) {
- RTE_LOG(ERR, EAL, "Could not get IOMMU type for default container\n");
- return -1;
- }
-
- /* we now have an fd for default container, as well as its IOMMU type.
- * now, set up default VFIO container config to match.
- */
- for (i = 0; i < RTE_DIM(iommu_types); i++) {
- const struct vfio_iommu_type *t = &iommu_types[i];
- if (t->type_id != iommu_type_id)
- continue;
-
- /* we found our IOMMU type */
- default_vfio_cfg->vfio_iommu_type = t;
-
- return 0;
- }
- RTE_LOG(ERR, EAL, "Could not find IOMMU type id (%i)\n",
- iommu_type_id);
- return -1;
-}
-
-int
-rte_vfio_clear_group(int vfio_group_fd)
-{
- int i;
- struct vfio_config *vfio_cfg;
-
- vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
- if (vfio_cfg == NULL) {
- RTE_LOG(ERR, EAL, " invalid group fd!\n");
- return -1;
- }
-
- i = get_vfio_group_idx(vfio_group_fd);
- if (i < 0)
- return -1;
- vfio_cfg->vfio_groups[i].group_num = -1;
- vfio_cfg->vfio_groups[i].fd = -1;
- vfio_cfg->vfio_groups[i].devices = 0;
- vfio_cfg->vfio_active_groups--;
-
- return 0;
-}
-
-int
-rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
- int *vfio_dev_fd, struct vfio_device_info *device_info)
-{
- struct vfio_group_status group_status = {
- .argsz = sizeof(group_status)
- };
- struct vfio_config *vfio_cfg;
- struct user_mem_maps *user_mem_maps;
- int vfio_container_fd;
- int vfio_group_fd;
- int iommu_group_num;
- int i, ret;
-
- /* get group number */
- ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num);
- if (ret == 0) {
- RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n",
- dev_addr);
- return 1;
- }
-
- /* if negative, something failed */
- if (ret < 0)
- return -1;
-
- /* get the actual group fd */
- vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num);
- if (vfio_group_fd < 0)
- return -1;
-
- /* if group_fd == 0, that means the device isn't managed by VFIO */
- if (vfio_group_fd == 0) {
- RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n",
- dev_addr);
- return 1;
- }
-
-	/* check if the group is viable (meaning all devices in the IOMMU
-	 * group are either bound to VFIO or not bound to anything)
-	 */
- ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status);
- if (ret) {
- RTE_LOG(ERR, EAL, " %s cannot get group status, "
- "error %i (%s)\n", dev_addr, errno, strerror(errno));
- close(vfio_group_fd);
- rte_vfio_clear_group(vfio_group_fd);
- return -1;
- } else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
- RTE_LOG(ERR, EAL, " %s VFIO group is not viable! "
- "Not all devices in IOMMU group bound to VFIO or unbound\n",
- dev_addr);
- close(vfio_group_fd);
- rte_vfio_clear_group(vfio_group_fd);
- return -1;
- }
-
- /* get the vfio_config it belongs to */
- vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
- vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
- vfio_container_fd = vfio_cfg->vfio_container_fd;
- user_mem_maps = &vfio_cfg->mem_maps;
-
- /* check if group does not have a container yet */
- if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) {
-
- /* add group to a container */
- ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER,
- &vfio_container_fd);
- if (ret) {
- RTE_LOG(ERR, EAL, " %s cannot add VFIO group to container, "
- "error %i (%s)\n", dev_addr, errno, strerror(errno));
- close(vfio_group_fd);
- rte_vfio_clear_group(vfio_group_fd);
- return -1;
- }
-
- /*
- * pick an IOMMU type and set up DMA mappings for container
- *
- * needs to be done only once, only when first group is
- * assigned to a container and only in primary process.
- * Note this can happen several times with the hotplug
- * functionality.
- */
- if (internal_config.process_type == RTE_PROC_PRIMARY &&
- vfio_cfg->vfio_active_groups == 1 &&
- vfio_group_device_count(vfio_group_fd) == 0) {
- const struct vfio_iommu_type *t;
-
- /* select an IOMMU type which we will be using */
- t = vfio_set_iommu_type(vfio_container_fd);
- if (!t) {
- RTE_LOG(ERR, EAL,
- " %s failed to select IOMMU type\n",
- dev_addr);
- close(vfio_group_fd);
- rte_vfio_clear_group(vfio_group_fd);
- return -1;
- }
- /* lock memory hotplug before mapping and release it
- * after registering callback, to prevent races
- */
- rte_mcfg_mem_read_lock();
- if (vfio_cfg == default_vfio_cfg)
- ret = t->dma_map_func(vfio_container_fd);
- else
- ret = 0;
- if (ret) {
- RTE_LOG(ERR, EAL,
- " %s DMA remapping failed, error %i (%s)\n",
- dev_addr, errno, strerror(errno));
- close(vfio_group_fd);
- rte_vfio_clear_group(vfio_group_fd);
- rte_mcfg_mem_read_unlock();
- return -1;
- }
-
- vfio_cfg->vfio_iommu_type = t;
-
- /* re-map all user-mapped segments */
- rte_spinlock_recursive_lock(&user_mem_maps->lock);
-
- /* this IOMMU type may not support DMA mapping, but
- * if we have mappings in the list - that means we have
- * previously mapped something successfully, so we can
- * be sure that DMA mapping is supported.
- */
- for (i = 0; i < user_mem_maps->n_maps; i++) {
- struct user_mem_map *map;
- map = &user_mem_maps->maps[i];
-
- ret = t->dma_user_map_func(
- vfio_container_fd,
- map->addr, map->iova, map->len,
- 1);
- if (ret) {
- RTE_LOG(ERR, EAL, "Couldn't map user memory for DMA: "
- "va: 0x%" PRIx64 " "
- "iova: 0x%" PRIx64 " "
-						"len: 0x%" PRIx64 "\n",
- map->addr, map->iova,
- map->len);
- rte_spinlock_recursive_unlock(
- &user_mem_maps->lock);
- rte_mcfg_mem_read_unlock();
- return -1;
- }
- }
- rte_spinlock_recursive_unlock(&user_mem_maps->lock);
-
- /* register callback for mem events */
- if (vfio_cfg == default_vfio_cfg)
- ret = rte_mem_event_callback_register(
- VFIO_MEM_EVENT_CLB_NAME,
- vfio_mem_event_callback, NULL);
- else
- ret = 0;
- /* unlock memory hotplug */
- rte_mcfg_mem_read_unlock();
-
- if (ret && rte_errno != ENOTSUP) {
- RTE_LOG(ERR, EAL, "Could not install memory event callback for VFIO\n");
- return -1;
- }
- if (ret)
- RTE_LOG(DEBUG, EAL, "Memory event callbacks not supported\n");
- else
- RTE_LOG(DEBUG, EAL, "Installed memory event callback for VFIO\n");
- }
- } else if (rte_eal_process_type() != RTE_PROC_PRIMARY &&
- vfio_cfg == default_vfio_cfg &&
- vfio_cfg->vfio_iommu_type == NULL) {
-		/* if we're not a primary process, we do not set up the VFIO
-		 * container because it has already been set up by the primary
-		 * process. instead, we simply ask the primary which IOMMU type
-		 * is in use, and set the VFIO config up appropriately.
-		 */
- ret = vfio_sync_default_container();
- if (ret < 0) {
- RTE_LOG(ERR, EAL, "Could not sync default VFIO container\n");
- close(vfio_group_fd);
- rte_vfio_clear_group(vfio_group_fd);
- return -1;
- }
- /* we have successfully initialized VFIO, notify user */
- const struct vfio_iommu_type *t =
- default_vfio_cfg->vfio_iommu_type;
- RTE_LOG(NOTICE, EAL, " using IOMMU type %d (%s)\n",
- t->type_id, t->name);
- }
-
- /* get a file descriptor for the device */
- *vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr);
- if (*vfio_dev_fd < 0) {
- /* if we cannot get a device fd, this implies a problem with
- * the VFIO group or the container not having IOMMU configured.
- */
-
- RTE_LOG(WARNING, EAL, "Getting a vfio_dev_fd for %s failed\n",
- dev_addr);
- close(vfio_group_fd);
- rte_vfio_clear_group(vfio_group_fd);
- return -1;
- }
-
- /* test and setup the device */
- ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info);
- if (ret) {
- RTE_LOG(ERR, EAL, " %s cannot get device info, "
- "error %i (%s)\n", dev_addr, errno,
- strerror(errno));
- close(*vfio_dev_fd);
- close(vfio_group_fd);
- rte_vfio_clear_group(vfio_group_fd);
- return -1;
- }
- vfio_group_device_get(vfio_group_fd);
-
- return 0;
-}
-
-int
-rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
- int vfio_dev_fd)
-{
- struct vfio_group_status group_status = {
- .argsz = sizeof(group_status)
- };
- struct vfio_config *vfio_cfg;
- int vfio_group_fd;
- int iommu_group_num;
- int ret;
-
- /* we don't want any DMA mapping messages to come while we're detaching
- * VFIO device, because this might be the last device and we might need
- * to unregister the callback.
- */
- rte_mcfg_mem_read_lock();
-
- /* get group number */
- ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num);
- if (ret <= 0) {
- RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver\n",
- dev_addr);
- /* This is an error at this point. */
- ret = -1;
- goto out;
- }
-
- /* get the actual group fd */
- vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num);
- if (vfio_group_fd <= 0) {
- RTE_LOG(INFO, EAL, "rte_vfio_get_group_fd failed for %s\n",
- dev_addr);
- ret = -1;
- goto out;
- }
-
- /* get the vfio_config it belongs to */
- vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
- vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
-
-	/* At this point we have an active group. Closing it will detach the
-	 * group from the container. If this is the last active group, the
-	 * VFIO kernel code will unset the container and release the IOMMU
-	 * mappings.
-	 */
-
- /* Closing a device */
- if (close(vfio_dev_fd) < 0) {
- RTE_LOG(INFO, EAL, "Error when closing vfio_dev_fd for %s\n",
- dev_addr);
- ret = -1;
- goto out;
- }
-
-	/* A VFIO group can have several devices attached. The group should
-	 * be closed only when no devices remain.
-	 */
- vfio_group_device_put(vfio_group_fd);
- if (!vfio_group_device_count(vfio_group_fd)) {
-
- if (close(vfio_group_fd) < 0) {
- RTE_LOG(INFO, EAL, "Error when closing vfio_group_fd for %s\n",
- dev_addr);
- ret = -1;
- goto out;
- }
-
- if (rte_vfio_clear_group(vfio_group_fd) < 0) {
- RTE_LOG(INFO, EAL, "Error when clearing group for %s\n",
- dev_addr);
- ret = -1;
- goto out;
- }
- }
-
- /* if there are no active device groups, unregister the callback to
- * avoid spurious attempts to map/unmap memory from VFIO.
- */
- if (vfio_cfg == default_vfio_cfg && vfio_cfg->vfio_active_groups == 0 &&
- rte_eal_process_type() != RTE_PROC_SECONDARY)
- rte_mem_event_callback_unregister(VFIO_MEM_EVENT_CLB_NAME,
- NULL);
-
- /* success */
- ret = 0;
-
-out:
- rte_mcfg_mem_read_unlock();
- return ret;
-}
-
-int
-rte_vfio_enable(const char *modname)
-{
- /* initialize group list */
- int i, j;
- int vfio_available;
-
- rte_spinlock_recursive_t lock = RTE_SPINLOCK_RECURSIVE_INITIALIZER;
-
- for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
- vfio_cfgs[i].vfio_container_fd = -1;
- vfio_cfgs[i].vfio_active_groups = 0;
- vfio_cfgs[i].vfio_iommu_type = NULL;
- vfio_cfgs[i].mem_maps.lock = lock;
-
- for (j = 0; j < VFIO_MAX_GROUPS; j++) {
- vfio_cfgs[i].vfio_groups[j].fd = -1;
- vfio_cfgs[i].vfio_groups[j].group_num = -1;
- vfio_cfgs[i].vfio_groups[j].devices = 0;
- }
- }
-
- /* inform the user that we are probing for VFIO */
- RTE_LOG(INFO, EAL, "Probing VFIO support...\n");
-
- /* check if vfio module is loaded */
- vfio_available = rte_eal_check_module(modname);
-
- /* return error directly */
- if (vfio_available == -1) {
- RTE_LOG(INFO, EAL, "Could not get loaded module details!\n");
- return -1;
- }
-
- /* return 0 if VFIO modules not loaded */
- if (vfio_available == 0) {
- RTE_LOG(DEBUG, EAL, "VFIO modules not loaded, "
- "skipping VFIO support...\n");
- return 0;
- }
-
- if (internal_config.process_type == RTE_PROC_PRIMARY) {
- /* open a new container */
- default_vfio_cfg->vfio_container_fd =
- rte_vfio_get_container_fd();
- } else {
- /* get the default container from the primary process */
- default_vfio_cfg->vfio_container_fd =
- vfio_get_default_container_fd();
- }
-
- /* check if we have VFIO driver enabled */
- if (default_vfio_cfg->vfio_container_fd != -1) {
- RTE_LOG(NOTICE, EAL, "VFIO support initialized\n");
- default_vfio_cfg->vfio_enabled = 1;
- } else {
- RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n");
- }
-
- return 0;
-}
-
-int
-rte_vfio_is_enabled(const char *modname)
-{
- const int mod_available = rte_eal_check_module(modname) > 0;
- return default_vfio_cfg->vfio_enabled && mod_available;
-}
-
-int
-vfio_get_default_container_fd(void)
-{
- struct rte_mp_msg mp_req, *mp_rep;
- struct rte_mp_reply mp_reply = {0};
- struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
- struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
-
- if (default_vfio_cfg->vfio_enabled)
- return default_vfio_cfg->vfio_container_fd;
-
- if (internal_config.process_type == RTE_PROC_PRIMARY) {
-		/* a secondary process would request the container fd from
-		 * the primary; we are the primary process, so just fail here
-		 */
- return -1;
- }
-
- p->req = SOCKET_REQ_DEFAULT_CONTAINER;
- strcpy(mp_req.name, EAL_VFIO_MP);
- mp_req.len_param = sizeof(*p);
- mp_req.num_fds = 0;
-
- if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
- mp_reply.nb_received == 1) {
- mp_rep = &mp_reply.msgs[0];
- p = (struct vfio_mp_param *)mp_rep->param;
- if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
- free(mp_reply.msgs);
- return mp_rep->fds[0];
- }
- }
-
- free(mp_reply.msgs);
- RTE_LOG(ERR, EAL, " cannot request default container fd\n");
- return -1;
-}
-
-int
-vfio_get_iommu_type(void)
-{
- if (default_vfio_cfg->vfio_iommu_type == NULL)
- return -1;
-
- return default_vfio_cfg->vfio_iommu_type->type_id;
-}
-
-const struct vfio_iommu_type *
-vfio_set_iommu_type(int vfio_container_fd)
-{
- unsigned idx;
- for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
- const struct vfio_iommu_type *t = &iommu_types[idx];
-
- int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
- t->type_id);
- if (!ret) {
- RTE_LOG(NOTICE, EAL, " using IOMMU type %d (%s)\n",
- t->type_id, t->name);
- return t;
- }
- /* not an error, there may be more supported IOMMU types */
- RTE_LOG(DEBUG, EAL, " set IOMMU type %d (%s) failed, "
- "error %i (%s)\n", t->type_id, t->name, errno,
- strerror(errno));
- }
- /* if we didn't find a suitable IOMMU type, fail */
- return NULL;
-}
-
-int
-vfio_has_supported_extensions(int vfio_container_fd)
-{
- int ret;
- unsigned idx, n_extensions = 0;
- for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
- const struct vfio_iommu_type *t = &iommu_types[idx];
-
- ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION,
- t->type_id);
- if (ret < 0) {
- RTE_LOG(ERR, EAL, " could not get IOMMU type, "
- "error %i (%s)\n", errno,
- strerror(errno));
- close(vfio_container_fd);
- return -1;
- } else if (ret == 1) {
- /* we found a supported extension */
- n_extensions++;
- }
- RTE_LOG(DEBUG, EAL, " IOMMU type %d (%s) is %s\n",
- t->type_id, t->name,
- ret ? "supported" : "not supported");
- }
-
- /* if we didn't find any supported IOMMU types, fail */
- if (!n_extensions) {
- close(vfio_container_fd);
- return -1;
- }
-
- return 0;
-}
-
-int
-rte_vfio_get_container_fd(void)
-{
- int ret, vfio_container_fd;
- struct rte_mp_msg mp_req, *mp_rep;
- struct rte_mp_reply mp_reply = {0};
- struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
- struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
-
- /* if we're in a primary process, try to open the container */
- if (internal_config.process_type == RTE_PROC_PRIMARY) {
- vfio_container_fd = open(VFIO_CONTAINER_PATH, O_RDWR);
- if (vfio_container_fd < 0) {
- RTE_LOG(ERR, EAL, " cannot open VFIO container, "
- "error %i (%s)\n", errno, strerror(errno));
- return -1;
- }
-
- /* check VFIO API version */
- ret = ioctl(vfio_container_fd, VFIO_GET_API_VERSION);
- if (ret != VFIO_API_VERSION) {
- if (ret < 0)
- RTE_LOG(ERR, EAL, " could not get VFIO API version, "
- "error %i (%s)\n", errno, strerror(errno));
- else
- RTE_LOG(ERR, EAL, " unsupported VFIO API version!\n");
- close(vfio_container_fd);
- return -1;
- }
-
- ret = vfio_has_supported_extensions(vfio_container_fd);
- if (ret) {
- RTE_LOG(ERR, EAL, " no supported IOMMU "
- "extensions found!\n");
- return -1;
- }
-
- return vfio_container_fd;
- }
- /*
- * if we're in a secondary process, request container fd from the
- * primary process via mp channel
- */
- p->req = SOCKET_REQ_CONTAINER;
- strcpy(mp_req.name, EAL_VFIO_MP);
- mp_req.len_param = sizeof(*p);
- mp_req.num_fds = 0;
-
- vfio_container_fd = -1;
- if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
- mp_reply.nb_received == 1) {
- mp_rep = &mp_reply.msgs[0];
- p = (struct vfio_mp_param *)mp_rep->param;
- if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
- vfio_container_fd = mp_rep->fds[0];
- free(mp_reply.msgs);
- return vfio_container_fd;
- }
- }
-
- free(mp_reply.msgs);
- RTE_LOG(ERR, EAL, " cannot request container fd\n");
- return -1;
-}
-
-int
-rte_vfio_get_group_num(const char *sysfs_base,
- const char *dev_addr, int *iommu_group_num)
-{
- char linkname[PATH_MAX];
- char filename[PATH_MAX];
- char *tok[16], *group_tok, *end;
- int ret;
-
- memset(linkname, 0, sizeof(linkname));
- memset(filename, 0, sizeof(filename));
-
- /* try to find out IOMMU group for this device */
- snprintf(linkname, sizeof(linkname),
- "%s/%s/iommu_group", sysfs_base, dev_addr);
-
- ret = readlink(linkname, filename, sizeof(filename));
-
- /* if the link doesn't exist, no VFIO for us */
- if (ret < 0)
- return 0;
-
- ret = rte_strsplit(filename, sizeof(filename),
- tok, RTE_DIM(tok), '/');
-
- if (ret <= 0) {
- RTE_LOG(ERR, EAL, " %s cannot get IOMMU group\n", dev_addr);
- return -1;
- }
-
- /* IOMMU group is always the last token */
- errno = 0;
- group_tok = tok[ret - 1];
- end = group_tok;
- *iommu_group_num = strtol(group_tok, &end, 10);
- if ((end != group_tok && *end != '\0') || errno != 0) {
- RTE_LOG(ERR, EAL, " %s error parsing IOMMU number!\n", dev_addr);
- return -1;
- }
-
- return 1;
-}
-
-static int
-type1_map_contig(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
- size_t len, void *arg)
-{
- int *vfio_container_fd = arg;
-
- if (msl->external)
- return 0;
-
- return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova,
- len, 1);
-}
-
-static int
-type1_map(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
- void *arg)
-{
- int *vfio_container_fd = arg;
-
- /* skip external memory that isn't a heap */
- if (msl->external && !msl->heap)
- return 0;
-
- /* skip any segments with invalid IOVA addresses */
- if (ms->iova == RTE_BAD_IOVA)
- return 0;
-
- /* if IOVA mode is VA, we've already mapped the internal segments */
- if (!msl->external && rte_eal_iova_mode() == RTE_IOVA_VA)
- return 0;
-
- return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova,
- ms->len, 1);
-}
-
-static int
-vfio_type1_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
- uint64_t len, int do_map)
-{
- struct vfio_iommu_type1_dma_map dma_map;
- struct vfio_iommu_type1_dma_unmap dma_unmap;
- int ret;
-
- if (do_map != 0) {
- memset(&dma_map, 0, sizeof(dma_map));
- dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
- dma_map.vaddr = vaddr;
- dma_map.size = len;
- dma_map.iova = iova;
- dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
- VFIO_DMA_MAP_FLAG_WRITE;
-
- ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
- if (ret) {
-			/* in case the mapping was already done, the kernel
-			 * will return EEXIST
-			 */
-			if (errno == EEXIST) {
-				RTE_LOG(DEBUG, EAL,
-					" Memory segment is already mapped,"
-					" skipping\n");
- } else {
- RTE_LOG(ERR, EAL,
- " cannot set up DMA remapping,"
- " error %i (%s)\n",
- errno, strerror(errno));
- return -1;
- }
- }
- } else {
- memset(&dma_unmap, 0, sizeof(dma_unmap));
- dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
- dma_unmap.size = len;
- dma_unmap.iova = iova;
-
- ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
- &dma_unmap);
- if (ret) {
- RTE_LOG(ERR, EAL, " cannot clear DMA remapping, error %i (%s)\n",
- errno, strerror(errno));
- return -1;
- }
- }
-
- return 0;
-}
-
-static int
-vfio_type1_dma_map(int vfio_container_fd)
-{
- if (rte_eal_iova_mode() == RTE_IOVA_VA) {
- /* with IOVA as VA mode, we can get away with mapping contiguous
- * chunks rather than going page-by-page.
- */
- int ret = rte_memseg_contig_walk(type1_map_contig,
- &vfio_container_fd);
- if (ret)
- return ret;
-		/* we have to continue the walk because we've skipped the
-		 * external segments during the contig walk above.
-		 */
- }
- return rte_memseg_walk(type1_map, &vfio_container_fd);
-}
-
-static int
-vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
- uint64_t len, int do_map)
-{
- struct vfio_iommu_type1_dma_map dma_map;
- struct vfio_iommu_type1_dma_unmap dma_unmap;
- int ret;
- struct vfio_iommu_spapr_register_memory reg = {
- .argsz = sizeof(reg),
- .flags = 0
- };
- reg.vaddr = (uintptr_t) vaddr;
- reg.size = len;
-
- if (do_map != 0) {
- ret = ioctl(vfio_container_fd,
-			VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
- if (ret) {
- RTE_LOG(ERR, EAL, " cannot register vaddr for IOMMU, "
- "error %i (%s)\n", errno, strerror(errno));
- return -1;
- }
-
- memset(&dma_map, 0, sizeof(dma_map));
- dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
- dma_map.vaddr = vaddr;
- dma_map.size = len;
- dma_map.iova = iova;
- dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
- VFIO_DMA_MAP_FLAG_WRITE;
-
- ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
- if (ret) {
-			/* in case the mapping was already done, the kernel
-			 * will return EBUSY
-			 */
-			if (errno == EBUSY) {
-				RTE_LOG(DEBUG, EAL,
-					" Memory segment is already mapped,"
-					" skipping\n");
- } else {
- RTE_LOG(ERR, EAL,
- " cannot set up DMA remapping,"
- " error %i (%s)\n", errno,
- strerror(errno));
- return -1;
- }
- }
-
- } else {
- memset(&dma_unmap, 0, sizeof(dma_unmap));
- dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
- dma_unmap.size = len;
- dma_unmap.iova = iova;
-
- ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
- &dma_unmap);
- if (ret) {
- RTE_LOG(ERR, EAL, " cannot clear DMA remapping, error %i (%s)\n",
- errno, strerror(errno));
- return -1;
- }
-
- ret = ioctl(vfio_container_fd,
-			VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
- if (ret) {
- RTE_LOG(ERR, EAL, " cannot unregister vaddr for IOMMU, error %i (%s)\n",
- errno, strerror(errno));
- return -1;
- }
- }
-
- return 0;
-}
-
-static int
-vfio_spapr_map_walk(const struct rte_memseg_list *msl,
- const struct rte_memseg *ms, void *arg)
-{
- int *vfio_container_fd = arg;
-
- /* skip external memory that isn't a heap */
- if (msl->external && !msl->heap)
- return 0;
-
- /* skip any segments with invalid IOVA addresses */
- if (ms->iova == RTE_BAD_IOVA)
- return 0;
-
- return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova,
- ms->len, 1);
-}
-
-static int
-vfio_spapr_unmap_walk(const struct rte_memseg_list *msl,
- const struct rte_memseg *ms, void *arg)
-{
- int *vfio_container_fd = arg;
-
- /* skip external memory that isn't a heap */
- if (msl->external && !msl->heap)
- return 0;
-
- /* skip any segments with invalid IOVA addresses */
- if (ms->iova == RTE_BAD_IOVA)
- return 0;
-
- return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova,
- ms->len, 0);
-}
-
-struct spapr_walk_param {
- uint64_t window_size;
- uint64_t hugepage_sz;
-};
-
-static int
-vfio_spapr_window_size_walk(const struct rte_memseg_list *msl,
- const struct rte_memseg *ms, void *arg)
-{
- struct spapr_walk_param *param = arg;
- uint64_t max = ms->iova + ms->len;
-
- /* skip external memory that isn't a heap */
- if (msl->external && !msl->heap)
- return 0;
-
- /* skip any segments with invalid IOVA addresses */
- if (ms->iova == RTE_BAD_IOVA)
- return 0;
-
- if (max > param->window_size) {
- param->hugepage_sz = ms->hugepage_sz;
- param->window_size = max;
- }
-
- return 0;
-}
-
-static int
-vfio_spapr_create_new_dma_window(int vfio_container_fd,
-		struct vfio_iommu_spapr_tce_create *create)
-{
- struct vfio_iommu_spapr_tce_remove remove = {
- .argsz = sizeof(remove),
- };
- struct vfio_iommu_spapr_tce_info info = {
- .argsz = sizeof(info),
- };
- int ret;
-
- /* query spapr iommu info */
- ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
- if (ret) {
- RTE_LOG(ERR, EAL, " cannot get iommu info, "
- "error %i (%s)\n", errno, strerror(errno));
- return -1;
- }
-
-	/* remove the default 32-bit DMA window */
- remove.start_addr = info.dma32_window_start;
- ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
- if (ret) {
- RTE_LOG(ERR, EAL, " cannot remove default DMA window, "
- "error %i (%s)\n", errno, strerror(errno));
- return -1;
- }
-
- /* create new DMA window */
- ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, create);
- if (ret) {
-#ifdef VFIO_IOMMU_SPAPR_INFO_DDW
- /* try possible page_shift and levels for workaround */
- uint32_t levels;
-
- for (levels = create->levels + 1;
- ret && levels <= info.ddw.levels; levels++) {
- create->levels = levels;
- ret = ioctl(vfio_container_fd,
- VFIO_IOMMU_SPAPR_TCE_CREATE, create);
- }
-#endif
- if (ret) {
- RTE_LOG(ERR, EAL, " cannot create new DMA window, "
- "error %i (%s)\n", errno, strerror(errno));
- return -1;
- }
- }
-
- if (create->start_addr != 0) {
- RTE_LOG(ERR, EAL, " DMA window start address != 0\n");
- return -1;
- }
-
- return 0;
-}
-
-static int
-vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
- uint64_t len, int do_map)
-{
- struct spapr_walk_param param;
- struct vfio_iommu_spapr_tce_create create = {
- .argsz = sizeof(create),
- };
- struct vfio_config *vfio_cfg;
- struct user_mem_maps *user_mem_maps;
- int i, ret = 0;
-
- vfio_cfg = get_vfio_cfg_by_container_fd(vfio_container_fd);
- if (vfio_cfg == NULL) {
- RTE_LOG(ERR, EAL, " invalid container fd!\n");
- return -1;
- }
-
- user_mem_maps = &vfio_cfg->mem_maps;
- rte_spinlock_recursive_lock(&user_mem_maps->lock);
-
- /* check if window size needs to be adjusted */
-	memset(&param, 0, sizeof(param));
-
- /* we're inside a callback so use thread-unsafe version */
- if (rte_memseg_walk_thread_unsafe(vfio_spapr_window_size_walk,
-			&param) < 0) {
- RTE_LOG(ERR, EAL, "Could not get window size\n");
- ret = -1;
- goto out;
- }
-
- /* also check user maps */
- for (i = 0; i < user_mem_maps->n_maps; i++) {
- uint64_t max = user_mem_maps->maps[i].iova +
- user_mem_maps->maps[i].len;
- param.window_size = RTE_MAX(param.window_size, max);
- }
-
- /* sPAPR requires window size to be a power of 2 */
- create.window_size = rte_align64pow2(param.window_size);
- create.page_shift = __builtin_ctzll(param.hugepage_sz);
- create.levels = 1;
-
- if (do_map) {
- /* re-create window and remap the entire memory */
- if (iova + len > create.window_size) {
- /* release all maps before recreating the window */
- if (rte_memseg_walk_thread_unsafe(vfio_spapr_unmap_walk,
- &vfio_container_fd) < 0) {
- RTE_LOG(ERR, EAL, "Could not release DMA maps\n");
- ret = -1;
- goto out;
- }
- /* release all user maps */
- for (i = 0; i < user_mem_maps->n_maps; i++) {
- struct user_mem_map *map =
- &user_mem_maps->maps[i];
- if (vfio_spapr_dma_do_map(vfio_container_fd,
- map->addr, map->iova, map->len,
- 0)) {
- RTE_LOG(ERR, EAL, "Could not release user DMA maps\n");
- ret = -1;
- goto out;
- }
- }
- create.window_size = rte_align64pow2(iova + len);
- if (vfio_spapr_create_new_dma_window(vfio_container_fd,
- &create) < 0) {
- RTE_LOG(ERR, EAL, "Could not create new DMA window\n");
- ret = -1;
- goto out;
- }
- /* we're inside a callback, so use thread-unsafe version
- */
- if (rte_memseg_walk_thread_unsafe(vfio_spapr_map_walk,
- &vfio_container_fd) < 0) {
- RTE_LOG(ERR, EAL, "Could not recreate DMA maps\n");
- ret = -1;
- goto out;
- }
- /* remap all user maps */
- for (i = 0; i < user_mem_maps->n_maps; i++) {
- struct user_mem_map *map =
- &user_mem_maps->maps[i];
- if (vfio_spapr_dma_do_map(vfio_container_fd,
- map->addr, map->iova, map->len,
- 1)) {
- RTE_LOG(ERR, EAL, "Could not recreate user DMA maps\n");
- ret = -1;
- goto out;
- }
- }
- }
- if (vfio_spapr_dma_do_map(vfio_container_fd, vaddr, iova, len, 1)) {
- RTE_LOG(ERR, EAL, "Failed to map DMA\n");
- ret = -1;
- goto out;
- }
- } else {
- /* for unmap, check if iova within DMA window */
- if (iova > create.window_size) {
-			RTE_LOG(ERR, EAL, "iova beyond DMA window for unmap\n");
- ret = -1;
- goto out;
- }
-
- vfio_spapr_dma_do_map(vfio_container_fd, vaddr, iova, len, 0);
- }
-out:
- rte_spinlock_recursive_unlock(&user_mem_maps->lock);
- return ret;
-}
-
-static int
-vfio_spapr_dma_map(int vfio_container_fd)
-{
- struct vfio_iommu_spapr_tce_create create = {
- .argsz = sizeof(create),
- };
- struct spapr_walk_param param;
-
-	memset(&param, 0, sizeof(param));
-
- /* create DMA window from 0 to max(phys_addr + len) */
-	rte_memseg_walk(vfio_spapr_window_size_walk, &param);
-
- /* sPAPR requires window size to be a power of 2 */
- create.window_size = rte_align64pow2(param.window_size);
- create.page_shift = __builtin_ctzll(param.hugepage_sz);
- create.levels = 1;
-
- if (vfio_spapr_create_new_dma_window(vfio_container_fd, &create) < 0) {
- RTE_LOG(ERR, EAL, "Could not create new DMA window\n");
- return -1;
- }
-
- /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
- if (rte_memseg_walk(vfio_spapr_map_walk, &vfio_container_fd) < 0)
- return -1;
-
- return 0;
-}
-
-static int
-vfio_noiommu_dma_map(int __rte_unused vfio_container_fd)
-{
- /* No-IOMMU mode does not need DMA mapping */
- return 0;
-}
-
-static int
-vfio_noiommu_dma_mem_map(int __rte_unused vfio_container_fd,
- uint64_t __rte_unused vaddr,
- uint64_t __rte_unused iova, uint64_t __rte_unused len,
- int __rte_unused do_map)
-{
- /* No-IOMMU mode does not need DMA mapping */
- return 0;
-}
-
-static int
-vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
- uint64_t len, int do_map)
-{
- const struct vfio_iommu_type *t = vfio_cfg->vfio_iommu_type;
-
- if (!t) {
- RTE_LOG(ERR, EAL, " VFIO support not initialized\n");
- rte_errno = ENODEV;
- return -1;
- }
-
- if (!t->dma_user_map_func) {
- RTE_LOG(ERR, EAL,
-			" VFIO custom DMA region mapping not supported by IOMMU %s\n",
- t->name);
- rte_errno = ENOTSUP;
- return -1;
- }
-
- return t->dma_user_map_func(vfio_cfg->vfio_container_fd, vaddr, iova,
- len, do_map);
-}
-
-static int
-container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
- uint64_t len)
-{
- struct user_mem_map *new_map;
- struct user_mem_maps *user_mem_maps;
- int ret = 0;
-
- user_mem_maps = &vfio_cfg->mem_maps;
- rte_spinlock_recursive_lock(&user_mem_maps->lock);
- if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
- RTE_LOG(ERR, EAL, "No more space for user mem maps\n");
- rte_errno = ENOMEM;
- ret = -1;
- goto out;
- }
- /* map the entry */
- if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 1)) {
-		/* technically, this will fail only because there are currently
-		 * no devices plugged in - the same mapping might have succeeded
-		 * once a device was added. however, since we cannot verify that
-		 * the mapping is valid without a device attached, consider it
-		 * unsupported: we can't just store any old mapping and pollute
-		 * the list of active mappings willy-nilly.
-		 */
- RTE_LOG(ERR, EAL, "Couldn't map new region for DMA\n");
- ret = -1;
- goto out;
- }
- /* create new user mem map entry */
- new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
- new_map->addr = vaddr;
- new_map->iova = iova;
- new_map->len = len;
-
- compact_user_maps(user_mem_maps);
-out:
- rte_spinlock_recursive_unlock(&user_mem_maps->lock);
- return ret;
-}
-
-static int
-container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
- uint64_t len)
-{
- struct user_mem_map *map, *new_map = NULL;
- struct user_mem_maps *user_mem_maps;
- int ret = 0;
-
- user_mem_maps = &vfio_cfg->mem_maps;
- rte_spinlock_recursive_lock(&user_mem_maps->lock);
-
- /* find our mapping */
- map = find_user_mem_map(user_mem_maps, vaddr, iova, len);
- if (!map) {
- RTE_LOG(ERR, EAL, "Couldn't find previously mapped region\n");
- rte_errno = EINVAL;
- ret = -1;
- goto out;
- }
- if (map->addr != vaddr || map->iova != iova || map->len != len) {
- /* we're partially unmapping a previously mapped region, so we
- * need to split entry into two.
- */
- if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
- RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n");
- rte_errno = ENOMEM;
- ret = -1;
- goto out;
- }
- new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
- }
-
- /* unmap the entry */
- if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 0)) {
- /* there may not be any devices plugged in, so unmapping will
- * fail with ENODEV/ENOTSUP rte_errno values, but that doesn't
- * stop us from removing the mapping, as the assumption is we
- * won't be needing this memory any more and thus will want to
- * prevent it from being remapped again on hotplug. so, only
- * fail if we indeed failed to unmap (e.g. if the mapping was
- * within our mapped range but had invalid alignment).
- */
- if (rte_errno != ENODEV && rte_errno != ENOTSUP) {
- RTE_LOG(ERR, EAL, "Couldn't unmap region for DMA\n");
- ret = -1;
- goto out;
- } else {
- RTE_LOG(DEBUG, EAL, "DMA unmapping failed, but removing mappings anyway\n");
- }
- }
- /* remove map from the list of active mappings */
- if (new_map != NULL) {
- adjust_map(map, new_map, vaddr, len);
-
- /* if we've created a new map by splitting, sort everything */
- if (!is_null_map(new_map)) {
- compact_user_maps(user_mem_maps);
- } else {
- /* we've created a new mapping, but it was unused */
- user_mem_maps->n_maps--;
- }
- } else {
- memset(map, 0, sizeof(*map));
- compact_user_maps(user_mem_maps);
- user_mem_maps->n_maps--;
- }
-
-out:
- rte_spinlock_recursive_unlock(&user_mem_maps->lock);
- return ret;
-}
-
-int
-rte_vfio_noiommu_is_enabled(void)
-{
- int fd;
- ssize_t cnt;
- char c;
-
- fd = open(VFIO_NOIOMMU_MODE, O_RDONLY);
- if (fd < 0) {
- if (errno != ENOENT) {
- RTE_LOG(ERR, EAL, " cannot open vfio noiommu file %i (%s)\n",
- errno, strerror(errno));
- return -1;
- }
-		/*
-		 * otherwise the file does not exist,
-		 * i.e. noiommu is not enabled
-		 */
- return 0;
- }
-
- cnt = read(fd, &c, 1);
- close(fd);
- if (cnt != 1) {
- RTE_LOG(ERR, EAL, " unable to read from vfio noiommu "
- "file %i (%s)\n", errno, strerror(errno));
- return -1;
- }
-
- return c == 'Y';
-}
-
-int
-rte_vfio_container_create(void)
-{
- int i;
-
- /* Find an empty slot to store new vfio config */
- for (i = 1; i < VFIO_MAX_CONTAINERS; i++) {
- if (vfio_cfgs[i].vfio_container_fd == -1)
- break;
- }
-
- if (i == VFIO_MAX_CONTAINERS) {
-		RTE_LOG(ERR, EAL, "exceeded max VFIO container limit\n");
- return -1;
- }
-
- vfio_cfgs[i].vfio_container_fd = rte_vfio_get_container_fd();
- if (vfio_cfgs[i].vfio_container_fd < 0) {
-		RTE_LOG(NOTICE, EAL, "failed to create a new container\n");
- return -1;
- }
-
- return vfio_cfgs[i].vfio_container_fd;
-}
-
-int
-rte_vfio_container_destroy(int container_fd)
-{
- struct vfio_config *vfio_cfg;
- int i;
-
- vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
- if (vfio_cfg == NULL) {
- RTE_LOG(ERR, EAL, "Invalid container fd\n");
- return -1;
- }
-
- for (i = 0; i < VFIO_MAX_GROUPS; i++)
- if (vfio_cfg->vfio_groups[i].group_num != -1)
- rte_vfio_container_group_unbind(container_fd,
- vfio_cfg->vfio_groups[i].group_num);
-
- close(container_fd);
- vfio_cfg->vfio_container_fd = -1;
- vfio_cfg->vfio_active_groups = 0;
- vfio_cfg->vfio_iommu_type = NULL;
-
- return 0;
-}
-
-int
-rte_vfio_container_group_bind(int container_fd, int iommu_group_num)
-{
- struct vfio_config *vfio_cfg;
-
- vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
- if (vfio_cfg == NULL) {
- RTE_LOG(ERR, EAL, "Invalid container fd\n");
- return -1;
- }
-
- return vfio_get_group_fd(vfio_cfg, iommu_group_num);
-}
-
-int
-rte_vfio_container_group_unbind(int container_fd, int iommu_group_num)
-{
- struct vfio_config *vfio_cfg;
- struct vfio_group *cur_grp = NULL;
- int i;
-
- vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
- if (vfio_cfg == NULL) {
- RTE_LOG(ERR, EAL, "Invalid container fd\n");
- return -1;
- }
-
- for (i = 0; i < VFIO_MAX_GROUPS; i++) {
- if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num) {
- cur_grp = &vfio_cfg->vfio_groups[i];
- break;
- }
- }
-
- /* This should not happen */
- if (i == VFIO_MAX_GROUPS || cur_grp == NULL) {
- RTE_LOG(ERR, EAL, "Specified group number not found\n");
- return -1;
- }
-
- if (cur_grp->fd >= 0 && close(cur_grp->fd) < 0) {
- RTE_LOG(ERR, EAL, "Error when closing vfio_group_fd for"
- " iommu_group_num %d\n", iommu_group_num);
- return -1;
- }
- cur_grp->group_num = -1;
- cur_grp->fd = -1;
- cur_grp->devices = 0;
- vfio_cfg->vfio_active_groups--;
-
- return 0;
-}
-
-int
-rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova,
- uint64_t len)
-{
- struct vfio_config *vfio_cfg;
-
- if (len == 0) {
- rte_errno = EINVAL;
- return -1;
- }
-
- vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
- if (vfio_cfg == NULL) {
- RTE_LOG(ERR, EAL, "Invalid container fd\n");
- return -1;
- }
-
- return container_dma_map(vfio_cfg, vaddr, iova, len);
-}
-
-int
-rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova,
- uint64_t len)
-{
- struct vfio_config *vfio_cfg;
-
- if (len == 0) {
- rte_errno = EINVAL;
- return -1;
- }
-
- vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
- if (vfio_cfg == NULL) {
- RTE_LOG(ERR, EAL, "Invalid container fd\n");
- return -1;
- }
-
- return container_dma_unmap(vfio_cfg, vaddr, iova, len);
-}
-
-#else
-
-int
-rte_vfio_setup_device(__rte_unused const char *sysfs_base,
- __rte_unused const char *dev_addr,
- __rte_unused int *vfio_dev_fd,
- __rte_unused struct vfio_device_info *device_info)
-{
- return -1;
-}
-
-int
-rte_vfio_release_device(__rte_unused const char *sysfs_base,
- __rte_unused const char *dev_addr, __rte_unused int fd)
-{
- return -1;
-}
-
-int
-rte_vfio_enable(__rte_unused const char *modname)
-{
- return -1;
-}
-
-int
-rte_vfio_is_enabled(__rte_unused const char *modname)
-{
- return -1;
-}
-
-int
-rte_vfio_noiommu_is_enabled(void)
-{
- return -1;
-}
-
-int
-rte_vfio_clear_group(__rte_unused int vfio_group_fd)
-{
- return -1;
-}
-
-int
-rte_vfio_get_group_num(__rte_unused const char *sysfs_base,
- __rte_unused const char *dev_addr,
- __rte_unused int *iommu_group_num)
-{
- return -1;
-}
-
-int
-rte_vfio_get_container_fd(void)
-{
- return -1;
-}
-
-int
-rte_vfio_get_group_fd(__rte_unused int iommu_group_num)
-{
- return -1;
-}
-
-int
-rte_vfio_container_create(void)
-{
- return -1;
-}
-
-int
-rte_vfio_container_destroy(__rte_unused int container_fd)
-{
- return -1;
-}
-
-int
-rte_vfio_container_group_bind(__rte_unused int container_fd,
- __rte_unused int iommu_group_num)
-{
- return -1;
-}
-
-int
-rte_vfio_container_group_unbind(__rte_unused int container_fd,
- __rte_unused int iommu_group_num)
-{
- return -1;
-}
-
-int
-rte_vfio_container_dma_map(__rte_unused int container_fd,
- __rte_unused uint64_t vaddr,
- __rte_unused uint64_t iova,
- __rte_unused uint64_t len)
-{
- return -1;
-}
-
-int
-rte_vfio_container_dma_unmap(__rte_unused int container_fd,
- __rte_unused uint64_t vaddr,
- __rte_unused uint64_t iova,
- __rte_unused uint64_t len)
-{
- return -1;
-}
-
-#endif /* VFIO_PRESENT */
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
- */
-
-#ifndef EAL_VFIO_H_
-#define EAL_VFIO_H_
-
-#include <rte_common.h>
-
-/*
- * determine if VFIO is present on the system
- */
-#if !defined(VFIO_PRESENT) && defined(RTE_EAL_VFIO)
-#include <linux/version.h>
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
-#define VFIO_PRESENT
-#else
-#pragma message("VFIO configured but not supported by this kernel, disabling.")
-#endif /* kernel version >= 3.6.0 */
-#endif /* RTE_EAL_VFIO */
-
-#ifdef VFIO_PRESENT
-
-#include <stdint.h>
-#include <linux/vfio.h>
-
-#define RTE_VFIO_TYPE1 VFIO_TYPE1_IOMMU
-
-#ifndef VFIO_SPAPR_TCE_v2_IOMMU
-#define RTE_VFIO_SPAPR 7
-#define VFIO_IOMMU_SPAPR_REGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 17)
-#define VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 18)
-#define VFIO_IOMMU_SPAPR_TCE_CREATE _IO(VFIO_TYPE, VFIO_BASE + 19)
-#define VFIO_IOMMU_SPAPR_TCE_REMOVE _IO(VFIO_TYPE, VFIO_BASE + 20)
-
-struct vfio_iommu_spapr_register_memory {
- uint32_t argsz;
- uint32_t flags;
- uint64_t vaddr;
- uint64_t size;
-};
-
-struct vfio_iommu_spapr_tce_create {
- uint32_t argsz;
- uint32_t flags;
- /* in */
- uint32_t page_shift;
- uint32_t __resv1;
- uint64_t window_size;
- uint32_t levels;
- uint32_t __resv2;
- /* out */
- uint64_t start_addr;
-};
-
-struct vfio_iommu_spapr_tce_remove {
- uint32_t argsz;
- uint32_t flags;
- /* in */
- uint64_t start_addr;
-};
-
-struct vfio_iommu_spapr_tce_ddw_info {
- uint64_t pgsizes;
- uint32_t max_dynamic_windows_supported;
- uint32_t levels;
-};
-
-/* SPAPR_v2 is not present, but SPAPR might be */
-#ifndef VFIO_SPAPR_TCE_IOMMU
-#define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
-
-struct vfio_iommu_spapr_tce_info {
- uint32_t argsz;
- uint32_t flags;
- uint32_t dma32_window_start;
- uint32_t dma32_window_size;
- struct vfio_iommu_spapr_tce_ddw_info ddw;
-};
-#endif /* VFIO_SPAPR_TCE_IOMMU */
-
-#else /* VFIO_SPAPR_TCE_v2_IOMMU */
-#define RTE_VFIO_SPAPR VFIO_SPAPR_TCE_v2_IOMMU
-#endif
-
-#define VFIO_MAX_GROUPS RTE_MAX_VFIO_GROUPS
-#define VFIO_MAX_CONTAINERS RTE_MAX_VFIO_CONTAINERS
-
-/*
- * we don't need to store device fds anywhere since they can be obtained
- * from the group fd via an ioctl() call.
- */
-struct vfio_group {
- int group_num;
- int fd;
- int devices;
-};
-
-/* DMA mapping function prototype.
- * Takes VFIO container fd as a parameter.
- * Returns 0 on success, -1 on error.
- */
-typedef int (*vfio_dma_func_t)(int);
-
-/* Custom memory region DMA mapping function prototype.
- * Takes VFIO container fd, virtual address, physical address, length and
- * operation type (0 to unmap, 1 to map) as parameters.
- * Returns 0 on success, -1 on error.
- */
-typedef int (*vfio_dma_user_func_t)(int fd, uint64_t vaddr, uint64_t iova,
- uint64_t len, int do_map);
-
-struct vfio_iommu_type {
- int type_id;
- const char *name;
- vfio_dma_user_func_t dma_user_map_func;
- vfio_dma_func_t dma_map_func;
-};
-
-/* get the vfio container that devices are bound to by default */
-int vfio_get_default_container_fd(void);
-
-/* pick IOMMU type. returns a pointer to vfio_iommu_type or NULL for error */
-const struct vfio_iommu_type *
-vfio_set_iommu_type(int vfio_container_fd);
-
-int
-vfio_get_iommu_type(void);
-
-/* check if we have any supported extensions */
-int
-vfio_has_supported_extensions(int vfio_container_fd);
-
-int vfio_mp_sync_setup(void);
-
-#define EAL_VFIO_MP "eal_vfio_mp_sync"
-
-#define SOCKET_REQ_CONTAINER 0x100
-#define SOCKET_REQ_GROUP 0x200
-#define SOCKET_REQ_DEFAULT_CONTAINER 0x400
-#define SOCKET_REQ_IOMMU_TYPE 0x800
-#define SOCKET_OK 0x0
-#define SOCKET_NO_FD 0x1
-#define SOCKET_ERR 0xFF
-
-struct vfio_mp_param {
- int req;
- int result;
- RTE_STD_C11
- union {
- int group_num;
- int iommu_type_id;
- };
-};
-
-#endif /* VFIO_PRESENT */
-
-#endif /* EAL_VFIO_H_ */
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2018 Intel Corporation
- */
-
-#include <unistd.h>
-#include <string.h>
-
-#include <rte_compat.h>
-#include <rte_errno.h>
-#include <rte_log.h>
-#include <rte_vfio.h>
-#include <rte_eal.h>
-
-#include "eal_vfio.h"
-
-/**
- * @file
- * VFIO socket for communication between primary and secondary processes.
- *
- * This file is only compiled if CONFIG_RTE_EAL_VFIO is set to "y".
- */
-
-#ifdef VFIO_PRESENT
-
-static int
-vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer)
-{
- int fd = -1;
- int ret;
- struct rte_mp_msg reply;
- struct vfio_mp_param *r = (struct vfio_mp_param *)reply.param;
- const struct vfio_mp_param *m =
- (const struct vfio_mp_param *)msg->param;
-
- if (msg->len_param != sizeof(*m)) {
- RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
- return -1;
- }
-
- memset(&reply, 0, sizeof(reply));
-
- switch (m->req) {
- case SOCKET_REQ_GROUP:
- r->req = SOCKET_REQ_GROUP;
- r->group_num = m->group_num;
- fd = rte_vfio_get_group_fd(m->group_num);
- if (fd < 0)
- r->result = SOCKET_ERR;
- else if (fd == 0)
- /* if VFIO group exists but isn't bound to VFIO driver */
- r->result = SOCKET_NO_FD;
- else {
- /* if group exists and is bound to VFIO driver */
- r->result = SOCKET_OK;
- reply.num_fds = 1;
- reply.fds[0] = fd;
- }
- break;
- case SOCKET_REQ_CONTAINER:
- r->req = SOCKET_REQ_CONTAINER;
- fd = rte_vfio_get_container_fd();
- if (fd < 0)
- r->result = SOCKET_ERR;
- else {
- r->result = SOCKET_OK;
- reply.num_fds = 1;
- reply.fds[0] = fd;
- }
- break;
- case SOCKET_REQ_DEFAULT_CONTAINER:
- r->req = SOCKET_REQ_DEFAULT_CONTAINER;
- fd = vfio_get_default_container_fd();
- if (fd < 0)
- r->result = SOCKET_ERR;
- else {
- r->result = SOCKET_OK;
- reply.num_fds = 1;
- reply.fds[0] = fd;
- }
- break;
- case SOCKET_REQ_IOMMU_TYPE:
- {
- int iommu_type_id;
-
- r->req = SOCKET_REQ_IOMMU_TYPE;
-
- iommu_type_id = vfio_get_iommu_type();
-
- if (iommu_type_id < 0)
- r->result = SOCKET_ERR;
- else {
- r->iommu_type_id = iommu_type_id;
- r->result = SOCKET_OK;
- }
- break;
- }
- default:
- RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
- return -1;
- }
-
- strcpy(reply.name, EAL_VFIO_MP);
- reply.len_param = sizeof(*r);
-
- ret = rte_mp_reply(&reply, peer);
- if (m->req == SOCKET_REQ_CONTAINER && fd >= 0)
- close(fd);
- return ret;
-}
-
-int
-vfio_mp_sync_setup(void)
-{
- if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
- int ret = rte_mp_action_register(EAL_VFIO_MP, vfio_mp_primary);
- if (ret && rte_errno != ENOTSUP)
- return -1;
- }
-
- return 0;
-}
-
-#endif
+++ /dev/null
-/* SPDX-License-Identifier: (BSD-3-Clause OR LGPL-2.1) */
-/*
- * Copyright(c) 2007-2014 Intel Corporation.
- */
-
-#ifndef _RTE_KNI_COMMON_H_
-#define _RTE_KNI_COMMON_H_
-
-#ifdef __KERNEL__
-#include <linux/if.h>
-#include <asm/barrier.h>
-#define RTE_STD_C11
-#else
-#include <rte_common.h>
-#include <rte_config.h>
-#endif
-
-/*
- * KNI name is part of memzone name. Must not exceed IFNAMSIZ.
- */
-#define RTE_KNI_NAMESIZE 16
-
-#define RTE_CACHE_LINE_MIN_SIZE 64
-
-/*
- * Request id.
- */
-enum rte_kni_req_id {
- RTE_KNI_REQ_UNKNOWN = 0,
- RTE_KNI_REQ_CHANGE_MTU,
- RTE_KNI_REQ_CFG_NETWORK_IF,
- RTE_KNI_REQ_CHANGE_MAC_ADDR,
- RTE_KNI_REQ_CHANGE_PROMISC,
- RTE_KNI_REQ_CHANGE_ALLMULTI,
- RTE_KNI_REQ_MAX,
-};
-
-/*
- * Structure for KNI request.
- */
-struct rte_kni_request {
- uint32_t req_id; /**< Request id */
- RTE_STD_C11
- union {
- uint32_t new_mtu; /**< New MTU */
- uint8_t if_up; /**< 1: interface up, 0: interface down */
- uint8_t mac_addr[6]; /**< MAC address for interface */
- uint8_t promiscusity;/**< 1: promisc mode enable, 0: disable */
- uint8_t allmulti; /**< 1: all-multicast mode enable, 0: disable */
- };
- int32_t result; /**< Result for processing request */
-} __attribute__((__packed__));
-
-/*
- * Fifo struct mapped in shared memory. It describes a circular buffer FIFO.
- * Write and read wrap around; the FIFO is empty when write == read, and
- * writing must never overwrite the read position.
- */
-struct rte_kni_fifo {
-#ifdef RTE_USE_C11_MEM_MODEL
-	unsigned write;              /**< Next position to be written */
-	unsigned read;               /**< Next position to be read */
-#else
-	volatile unsigned write;     /**< Next position to be written */
-	volatile unsigned read;      /**< Next position to be read */
-#endif
- unsigned len; /**< Circular buffer length */
- unsigned elem_size; /**< Pointer size - for 32/64 bit OS */
- void *volatile buffer[]; /**< The buffer contains mbuf pointers */
-};
-
-/*
- * The kernel image of the rte_mbuf struct, with only the relevant fields.
- * Padding is necessary to preserve the offsets of these fields.
- */
-struct rte_kni_mbuf {
- void *buf_addr __attribute__((__aligned__(RTE_CACHE_LINE_SIZE)));
- uint64_t buf_physaddr;
- uint16_t data_off; /**< Start address of data in segment buffer. */
- char pad1[2];
- uint16_t nb_segs; /**< Number of segments. */
- char pad4[2];
- uint64_t ol_flags; /**< Offload features. */
- char pad2[4];
- uint32_t pkt_len; /**< Total pkt len: sum of all segment data_len. */
- uint16_t data_len; /**< Amount of data in segment buffer. */
-
- /* fields on second cache line */
- char pad3[8] __attribute__((__aligned__(RTE_CACHE_LINE_MIN_SIZE)));
- void *pool;
- void *next; /**< Physical address of next mbuf in kernel. */
-};
-
-/*
- * Struct used to create a KNI device. Passed to the kernel in an ioctl() call.
- */
-
-struct rte_kni_device_info {
- char name[RTE_KNI_NAMESIZE]; /**< Network device name for KNI */
-
- phys_addr_t tx_phys;
- phys_addr_t rx_phys;
- phys_addr_t alloc_phys;
- phys_addr_t free_phys;
-
- /* Used by Ethtool */
- phys_addr_t req_phys;
- phys_addr_t resp_phys;
- phys_addr_t sync_phys;
-	void *sync_va;
-
- /* mbuf mempool */
-	void *mbuf_va;
- phys_addr_t mbuf_phys;
-
- uint16_t group_id; /**< Group ID */
- uint32_t core_id; /**< core ID to bind for kernel thread */
-
- __extension__
- uint8_t force_bind : 1; /**< Flag for kernel thread binding */
-
- /* mbuf size */
- unsigned mbuf_size;
- unsigned int mtu;
- unsigned int min_mtu;
- unsigned int max_mtu;
- uint8_t mac_addr[6];
- uint8_t iova_mode;
-};
-
-#define KNI_DEVICE "kni"
-
-#define RTE_KNI_IOCTL_TEST _IOWR(0, 1, int)
-#define RTE_KNI_IOCTL_CREATE _IOWR(0, 2, struct rte_kni_device_info)
-#define RTE_KNI_IOCTL_RELEASE _IOWR(0, 3, struct rte_kni_device_info)
-
-#endif /* _RTE_KNI_COMMON_H_ */
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2019 Intel Corporation
- */
-
-#ifndef _RTE_OS_H_
-#define _RTE_OS_H_
-
-/**
- * This header should contain any function/macro definitions that are
- * not supported natively or are named differently in the Linux OS.
- * Functions will be added in future releases.
- */
-
-#include <sched.h>
-
-typedef cpu_set_t rte_cpuset_t;
-#define RTE_CPU_AND(dst, src1, src2) CPU_AND(dst, src1, src2)
-#define RTE_CPU_OR(dst, src1, src2) CPU_OR(dst, src1, src2)
-#define RTE_CPU_FILL(set) do \
-{ \
- unsigned int i; \
- CPU_ZERO(set); \
- for (i = 0; i < CPU_SETSIZE; i++) \
- CPU_SET(i, set); \
-} while (0)
-#define RTE_CPU_NOT(dst, src) do \
-{ \
- cpu_set_t tmp; \
- RTE_CPU_FILL(&tmp); \
- CPU_XOR(dst, &tmp, src); \
-} while (0)
-
-#endif /* _RTE_OS_H_ */
+++ /dev/null
-# SPDX-License-Identifier: BSD-3-Clause
-# Copyright(c) 2017 Intel Corporation
-
-eal_inc += include_directories('include')
-
-env_objs = []
-env_headers = files(
- 'include/rte_kni_common.h',
- 'include/rte_os.h',
-)
-env_sources = files('eal_alarm.c',
- 'eal_cpuflags.c',
- 'eal_debug.c',
- 'eal_hugepage_info.c',
- 'eal_interrupts.c',
- 'eal_memalloc.c',
- 'eal_lcore.c',
- 'eal_log.c',
- 'eal_thread.c',
- 'eal_timer.c',
- 'eal_vfio.c',
- 'eal_vfio_mp_sync.c',
- 'eal.c',
- 'eal_memory.c',
- 'eal_dev.c',
-)
-
-deps += ['kvargs']
-if has_libnuma == 1
- dpdk_conf.set10('RTE_EAL_NUMA_AWARE_HUGEPAGES', true)
-endif
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+#include <stdio.h>
+#include <stdint.h>
+#include <signal.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/queue.h>
+#include <sys/time.h>
+#include <sys/timerfd.h>
+
+#include <rte_memory.h>
+#include <rte_interrupts.h>
+#include <rte_alarm.h>
+#include <rte_common.h>
+#include <rte_per_lcore.h>
+#include <rte_eal.h>
+#include <rte_launch.h>
+#include <rte_lcore.h>
+#include <rte_errno.h>
+#include <rte_spinlock.h>
+#include <eal_private.h>
+
+#ifndef TFD_NONBLOCK
+#include <fcntl.h>
+#define TFD_NONBLOCK O_NONBLOCK
+#endif
+
+#define NS_PER_US 1000
+#define US_PER_MS 1000
+#define MS_PER_S 1000
+#ifndef US_PER_S
+#define US_PER_S (US_PER_MS * MS_PER_S)
+#endif
+
+#ifdef CLOCK_MONOTONIC_RAW /* Defined in glibc bits/time.h */
+#define CLOCK_TYPE_ID CLOCK_MONOTONIC_RAW
+#else
+#define CLOCK_TYPE_ID CLOCK_MONOTONIC
+#endif
+
+struct alarm_entry {
+ LIST_ENTRY(alarm_entry) next;
+ struct timeval time;
+ rte_eal_alarm_callback cb_fn;
+ void *cb_arg;
+ volatile uint8_t executing;
+ volatile pthread_t executing_id;
+};
+
+static LIST_HEAD(alarm_list, alarm_entry) alarm_list = LIST_HEAD_INITIALIZER();
+static rte_spinlock_t alarm_list_lk = RTE_SPINLOCK_INITIALIZER;
+
+static struct rte_intr_handle intr_handle = {.fd = -1 };
+static int handler_registered = 0;
+static void eal_alarm_callback(void *arg);
+
+int
+rte_eal_alarm_init(void)
+{
+ intr_handle.type = RTE_INTR_HANDLE_ALARM;
+ /* create a timerfd file descriptor */
+ intr_handle.fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);
+ if (intr_handle.fd == -1)
+ goto error;
+
+ return 0;
+
+error:
+ rte_errno = errno;
+ return -1;
+}
+
+static void
+eal_alarm_callback(void *arg __rte_unused)
+{
+ struct timespec now;
+ struct alarm_entry *ap;
+
+ rte_spinlock_lock(&alarm_list_lk);
+	while ((ap = LIST_FIRST(&alarm_list)) != NULL &&
+ clock_gettime(CLOCK_TYPE_ID, &now) == 0 &&
+ (ap->time.tv_sec < now.tv_sec || (ap->time.tv_sec == now.tv_sec &&
+ (ap->time.tv_usec * NS_PER_US) <= now.tv_nsec))) {
+ ap->executing = 1;
+ ap->executing_id = pthread_self();
+ rte_spinlock_unlock(&alarm_list_lk);
+
+ ap->cb_fn(ap->cb_arg);
+
+ rte_spinlock_lock(&alarm_list_lk);
+
+ LIST_REMOVE(ap, next);
+ free(ap);
+ }
+
+ if (!LIST_EMPTY(&alarm_list)) {
+ struct itimerspec atime = { .it_interval = { 0, 0 } };
+
+ ap = LIST_FIRST(&alarm_list);
+ atime.it_value.tv_sec = ap->time.tv_sec;
+ atime.it_value.tv_nsec = ap->time.tv_usec * NS_PER_US;
+ /* perform borrow for subtraction if necessary */
+ if (now.tv_nsec > (ap->time.tv_usec * NS_PER_US))
+ atime.it_value.tv_sec--, atime.it_value.tv_nsec += US_PER_S * NS_PER_US;
+
+ atime.it_value.tv_sec -= now.tv_sec;
+ atime.it_value.tv_nsec -= now.tv_nsec;
+ timerfd_settime(intr_handle.fd, 0, &atime, NULL);
+ }
+ rte_spinlock_unlock(&alarm_list_lk);
+}
+
+int
+rte_eal_alarm_set(uint64_t us, rte_eal_alarm_callback cb_fn, void *cb_arg)
+{
+ struct timespec now;
+ int ret = 0;
+ struct alarm_entry *ap, *new_alarm;
+
+ /* Check parameters, including that us won't cause a uint64_t overflow */
+ if (us < 1 || us > (UINT64_MAX - US_PER_S) || cb_fn == NULL)
+ return -EINVAL;
+
+ new_alarm = calloc(1, sizeof(*new_alarm));
+ if (new_alarm == NULL)
+ return -ENOMEM;
+
+ /* use current time to calculate absolute time of alarm */
+ clock_gettime(CLOCK_TYPE_ID, &now);
+
+ new_alarm->cb_fn = cb_fn;
+ new_alarm->cb_arg = cb_arg;
+ new_alarm->time.tv_usec = ((now.tv_nsec / NS_PER_US) + us) % US_PER_S;
+ new_alarm->time.tv_sec = now.tv_sec + (((now.tv_nsec / NS_PER_US) + us) / US_PER_S);
+
+ rte_spinlock_lock(&alarm_list_lk);
+ if (!handler_registered) {
+ /* registration can fail, callback can be registered later */
+ if (rte_intr_callback_register(&intr_handle,
+ eal_alarm_callback, NULL) == 0)
+ handler_registered = 1;
+ }
+
+ if (LIST_EMPTY(&alarm_list))
+ LIST_INSERT_HEAD(&alarm_list, new_alarm, next);
+ else {
+ LIST_FOREACH(ap, &alarm_list, next) {
+ if (ap->time.tv_sec > new_alarm->time.tv_sec ||
+ (ap->time.tv_sec == new_alarm->time.tv_sec &&
+ ap->time.tv_usec > new_alarm->time.tv_usec)) {
+ LIST_INSERT_BEFORE(ap, new_alarm, next);
+ break;
+ }
+ if (LIST_NEXT(ap, next) == NULL) {
+ LIST_INSERT_AFTER(ap, new_alarm, next);
+ break;
+ }
+ }
+ }
+
+ if (LIST_FIRST(&alarm_list) == new_alarm) {
+ struct itimerspec alarm_time = {
+ .it_interval = {0, 0},
+ .it_value = {
+ .tv_sec = us / US_PER_S,
+ .tv_nsec = (us % US_PER_S) * NS_PER_US,
+ },
+ };
+ ret |= timerfd_settime(intr_handle.fd, 0, &alarm_time, NULL);
+ }
+ rte_spinlock_unlock(&alarm_list_lk);
+
+ return ret;
+}
+
+int
+rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn, void *cb_arg)
+{
+ struct alarm_entry *ap, *ap_prev;
+ int count = 0;
+ int err = 0;
+ int executing;
+
+ if (!cb_fn) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ do {
+ executing = 0;
+ rte_spinlock_lock(&alarm_list_lk);
+ /* remove any matches at the start of the list */
+ while ((ap = LIST_FIRST(&alarm_list)) != NULL &&
+ cb_fn == ap->cb_fn &&
+ (cb_arg == (void *)-1 || cb_arg == ap->cb_arg)) {
+
+ if (ap->executing == 0) {
+ LIST_REMOVE(ap, next);
+ free(ap);
+ count++;
+ } else {
+ /* If called from another context, mark the alarm as
+ * executing so the loop can spin until it finishes.
+ * Otherwise we are trying to cancel ourselves - mark
+ * that with EINPROGRESS. */
+ if (pthread_equal(ap->executing_id, pthread_self()) == 0)
+ executing++;
+ else
+ err = EINPROGRESS;
+
+ break;
+ }
+ }
+ ap_prev = ap;
+
+ /* now go through list, removing entries not at start */
+ LIST_FOREACH(ap, &alarm_list, next) {
+ /* this won't be true first time through */
+ if (cb_fn == ap->cb_fn &&
+ (cb_arg == (void *)-1 || cb_arg == ap->cb_arg)) {
+
+ if (ap->executing == 0) {
+ LIST_REMOVE(ap, next);
+ free(ap);
+ count++;
+ ap = ap_prev;
+ } else if (pthread_equal(ap->executing_id, pthread_self()) == 0)
+ executing++;
+ else
+ err = EINPROGRESS;
+ }
+ ap_prev = ap;
+ }
+ rte_spinlock_unlock(&alarm_list_lk);
+ } while (executing != 0);
+
+ if (count == 0 && err == 0)
+ rte_errno = ENOENT;
+ else if (err)
+ rte_errno = err;
+
+ return count;
+}
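+
+/*
+ * Usage sketch (illustrative only; the callback name is hypothetical):
+ * arm a one-shot callback 10ms from now, then cancel all pending
+ * instances of it regardless of argument ((void *)-1 matches any cb_arg):
+ *
+ * static void my_timeout_cb(void *arg);
+ *
+ * rte_eal_alarm_set(10 * US_PER_MS, my_timeout_cb, NULL);
+ * rte_eal_alarm_cancel(my_timeout_cb, (void *)-1);
+ */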
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2018 Red Hat, Inc.
+ */
+
+#include <elf.h>
+#include <errno.h> /* errno is set/read by the getauxval fallback below */
+#include <fcntl.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#if defined(__GLIBC__) && defined(__GLIBC_PREREQ)
+#if __GLIBC_PREREQ(2, 16)
+#include <sys/auxv.h>
+#define HAS_AUXV 1
+#endif
+#endif
+
+#include <rte_cpuflags.h>
+
+#ifndef HAS_AUXV
+static unsigned long
+getauxval(unsigned long type __rte_unused)
+{
+ errno = ENOTSUP;
+ return 0;
+}
+#endif
+
+#ifdef RTE_ARCH_64
+typedef Elf64_auxv_t Internal_Elfx_auxv_t;
+#else
+typedef Elf32_auxv_t Internal_Elfx_auxv_t;
+#endif
+
+/**
+ * Provides a method for retrieving values from the auxiliary vector and
+ * possibly running a string comparison.
+ *
+ * @return Always returns a result. When the result is 0, check errno
+ * to see if an error occurred during processing.
+ */
+static unsigned long
+_rte_cpu_getauxval(unsigned long type, const char *str)
+{
+ unsigned long val;
+
+ errno = 0;
+ val = getauxval(type);
+
+ if (!val && (errno == ENOTSUP || errno == ENOENT)) {
+ int auxv_fd = open("/proc/self/auxv", O_RDONLY);
+ Internal_Elfx_auxv_t auxv;
+
+ if (auxv_fd == -1)
+ return 0;
+
+ errno = ENOENT;
+ while (read(auxv_fd, &auxv, sizeof(auxv)) == sizeof(auxv)) {
+ if (auxv.a_type == type) {
+ errno = 0;
+ val = auxv.a_un.a_val;
+ if (str)
+ val = strcmp((const char *)val, str);
+ break;
+ }
+ }
+ close(auxv_fd);
+ }
+
+ return val;
+}
+
+unsigned long
+rte_cpu_getauxval(unsigned long type)
+{
+ return _rte_cpu_getauxval(type, NULL);
+}
+
+int
+rte_cpu_strcmp_auxval(unsigned long type, const char *str)
+{
+ return _rte_cpu_getauxval(type, str);
+}
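+
+/*
+ * Usage sketch (illustrative only): AT_HWCAP and AT_PLATFORM come from
+ * <elf.h>; the fallback above makes these work even on C libraries
+ * without getauxval() by reading /proc/self/auxv directly.
+ *
+ * unsigned long hwcap = rte_cpu_getauxval(AT_HWCAP);
+ * if (rte_cpu_strcmp_auxval(AT_PLATFORM, "v7l") == 0)
+ *         ; // platform string matched
+ */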
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#ifdef RTE_BACKTRACE
+#include <execinfo.h>
+#endif
+#include <stdarg.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+
+#include <rte_log.h>
+#include <rte_debug.h>
+#include <rte_common.h>
+#include <rte_eal.h>
+
+#define BACKTRACE_SIZE 256
+
+/* dump the stack of the calling core */
+void rte_dump_stack(void)
+{
+#ifdef RTE_BACKTRACE
+ void *func[BACKTRACE_SIZE];
+ char **symb = NULL;
+ int size;
+
+ size = backtrace(func, BACKTRACE_SIZE);
+ symb = backtrace_symbols(func, size);
+
+ if (symb == NULL)
+ return;
+
+ while (size > 0) {
+ rte_log(RTE_LOG_ERR, RTE_LOGTYPE_EAL,
+ "%d: [%s]\n", size, symb[size - 1]);
+ size--;
+ }
+
+ free(symb);
+#endif /* RTE_BACKTRACE */
+}
+
+/* not implemented in this environment */
+void rte_dump_registers(void)
+{
+ return;
+}
+
+/* call abort(), it will generate a coredump if enabled */
+void __rte_panic(const char *funcname, const char *format, ...)
+{
+ va_list ap;
+
+ rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, "PANIC in %s():\n", funcname);
+ va_start(ap, format);
+ rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap);
+ va_end(ap);
+ rte_dump_stack();
+ rte_dump_registers();
+ abort();
+}
+
+/*
+ * Like rte_panic this terminates the application. However, no traceback is
+ * provided and no core-dump is generated.
+ */
+void
+rte_exit(int exit_code, const char *format, ...)
+{
+ va_list ap;
+
+ if (exit_code != 0)
+ RTE_LOG(CRIT, EAL, "Error - exiting with code: %d\n"
+ " Cause: ", exit_code);
+
+ va_start(ap, format);
+ rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap);
+ va_end(ap);
+
+#ifndef RTE_EAL_ALWAYS_PANIC_ON_ERROR
+ if (rte_eal_cleanup() != 0)
+ RTE_LOG(CRIT, EAL,
+ "EAL could not release all resources\n");
+ exit(exit_code);
+#else
+ rte_dump_stack();
+ rte_dump_registers();
+ abort();
+#endif
+}
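+
+/*
+ * Usage sketch (illustrative only): rte_exit() is the usual way for an
+ * application to abort initialization cleanly, while rte_panic() is for
+ * unrecoverable internal errors (backtrace plus abort/core dump):
+ *
+ * if (rte_eal_init(argc, argv) < 0)
+ *         rte_exit(EXIT_FAILURE, "Cannot init EAL: %s\n",
+ *                  rte_strerror(rte_errno));
+ */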
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <errno.h>
+#include <stdbool.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <sys/socket.h>
+#include <linux/netlink.h>
+
+#include <rte_string_fns.h>
+#include <rte_log.h>
+#include <rte_compat.h>
+#include <rte_dev.h>
+#include <rte_malloc.h>
+#include <rte_interrupts.h>
+#include <rte_alarm.h>
+#include <rte_bus.h>
+#include <rte_eal.h>
+#include <rte_spinlock.h>
+#include <rte_errno.h>
+
+#include "eal_private.h"
+
+static struct rte_intr_handle intr_handle = {.fd = -1 };
+static bool monitor_started;
+static bool hotplug_handle;
+
+#define EAL_UEV_MSG_LEN 4096
+#define EAL_UEV_MSG_ELEM_LEN 128
+
+/*
+ * Spinlock for device hot-unplug failure handling. Anything that accesses
+ * the bus or a device while handling a failure (e.g. handling SIGBUS for
+ * a bus, or a memory failure for a device) must take this lock; it
+ * protects the bus and the device from race conditions.
+ */
+static rte_spinlock_t failure_handle_lock = RTE_SPINLOCK_INITIALIZER;
+
+static struct sigaction sigbus_action_old;
+
+static int sigbus_need_recover;
+
+static void dev_uev_handler(__rte_unused void *param);
+
+/* identify the system layer which reports this event. */
+enum eal_dev_event_subsystem {
+ EAL_DEV_EVENT_SUBSYSTEM_PCI, /* PCI bus device event */
+ EAL_DEV_EVENT_SUBSYSTEM_UIO, /* UIO driver device event */
+ EAL_DEV_EVENT_SUBSYSTEM_VFIO, /* VFIO driver device event */
+ EAL_DEV_EVENT_SUBSYSTEM_MAX
+};
+
+static void
+sigbus_action_recover(void)
+{
+ if (sigbus_need_recover) {
+ sigaction(SIGBUS, &sigbus_action_old, NULL);
+ sigbus_need_recover = 0;
+ }
+}
+
+static void sigbus_handler(int signum, siginfo_t *info,
+ void *ctx __rte_unused)
+{
+ int ret;
+
+ RTE_LOG(DEBUG, EAL, "Thread catch SIGBUS, fault address:%p\n",
+ info->si_addr);
+
+ rte_spinlock_lock(&failure_handle_lock);
+ ret = rte_bus_sigbus_handler(info->si_addr);
+ rte_spinlock_unlock(&failure_handle_lock);
+ if (ret == -1) {
+ rte_exit(EXIT_FAILURE,
+ "Failed to handle SIGBUS for hot-unplug, "
+ "(rte_errno: %s)!", strerror(rte_errno));
+ } else if (ret == 1) {
+ /* sa_flags is a bitmask, so test the SA_SIGINFO bit */
+ if ((sigbus_action_old.sa_flags & SA_SIGINFO)
+ && sigbus_action_old.sa_sigaction) {
+ (*(sigbus_action_old.sa_sigaction))(signum,
+ info, ctx);
+ } else if (!(sigbus_action_old.sa_flags & SA_SIGINFO)
+ && sigbus_action_old.sa_handler) {
+ (*(sigbus_action_old.sa_handler))(signum);
+ } else {
+ rte_exit(EXIT_FAILURE,
+ "Failed to handle generic SIGBUS!");
+ }
+ }
+
+ RTE_LOG(DEBUG, EAL, "Success to handle SIGBUS for hot-unplug!\n");
+}
+
+static int cmp_dev_name(const struct rte_device *dev,
+ const void *_name)
+{
+ const char *name = _name;
+
+ return strcmp(dev->name, name);
+}
+
+static int
+dev_uev_socket_fd_create(void)
+{
+ struct sockaddr_nl addr;
+ int ret;
+
+ intr_handle.fd = socket(PF_NETLINK, SOCK_RAW | SOCK_CLOEXEC |
+ SOCK_NONBLOCK,
+ NETLINK_KOBJECT_UEVENT);
+ if (intr_handle.fd < 0) {
+ RTE_LOG(ERR, EAL, "create uevent fd failed.\n");
+ return -1;
+ }
+
+ memset(&addr, 0, sizeof(addr));
+ addr.nl_family = AF_NETLINK;
+ addr.nl_pid = 0;
+ addr.nl_groups = 0xffffffff;
+
+ ret = bind(intr_handle.fd, (struct sockaddr *) &addr, sizeof(addr));
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "Failed to bind uevent socket.\n");
+ goto err;
+ }
+
+ return 0;
+err:
+ close(intr_handle.fd);
+ intr_handle.fd = -1;
+ return ret;
+}
+
+static int
+dev_uev_parse(const char *buf, struct rte_dev_event *event, int length)
+{
+ char action[EAL_UEV_MSG_ELEM_LEN];
+ char subsystem[EAL_UEV_MSG_ELEM_LEN];
+ char pci_slot_name[EAL_UEV_MSG_ELEM_LEN];
+ int i = 0;
+
+ memset(action, 0, EAL_UEV_MSG_ELEM_LEN);
+ memset(subsystem, 0, EAL_UEV_MSG_ELEM_LEN);
+ memset(pci_slot_name, 0, EAL_UEV_MSG_ELEM_LEN);
+
+ while (i < length) {
+ for (; i < length; i++) {
+ if (*buf)
+ break;
+ buf++;
+ }
+ /**
+ * only handle uevents coming from the kernel side; messages
+ * generated by udev/libudev are skipped.
+ */
+ if (!strncmp(buf, "libudev", 7)) {
+ buf += 7;
+ i += 7;
+ return -1;
+ }
+ if (!strncmp(buf, "ACTION=", 7)) {
+ buf += 7;
+ i += 7;
+ strlcpy(action, buf, sizeof(action));
+ } else if (!strncmp(buf, "SUBSYSTEM=", 10)) {
+ buf += 10;
+ i += 10;
+ strlcpy(subsystem, buf, sizeof(subsystem));
+ } else if (!strncmp(buf, "PCI_SLOT_NAME=", 14)) {
+ buf += 14;
+ i += 14;
+ strlcpy(pci_slot_name, buf, sizeof(pci_slot_name));
+ event->devname = strdup(pci_slot_name);
+ }
+ for (; i < length; i++) {
+ if (*buf == '\0')
+ break;
+ buf++;
+ }
+ }
+
+ /* parse the subsystem layer */
+ if (!strncmp(subsystem, "uio", 3))
+ event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_UIO;
+ else if (!strncmp(subsystem, "pci", 3))
+ event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_PCI;
+ else if (!strncmp(subsystem, "vfio", 4))
+ event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_VFIO;
+ else
+ return -1;
+
+ /* parse the action type */
+ if (!strncmp(action, "add", 3))
+ event->type = RTE_DEV_EVENT_ADD;
+ else if (!strncmp(action, "remove", 6))
+ event->type = RTE_DEV_EVENT_REMOVE;
+ else
+ return -1;
+ return 0;
+}
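+
+/*
+ * Example payload (illustrative): a kernel uevent is a buffer of
+ * NUL-separated KEY=VALUE strings. A PCI hot-unplug event handled by the
+ * parser above looks roughly like:
+ *
+ * "remove@/devices/pci0000:80/0000:80:03.0/0000:81:00.0\0"
+ * "ACTION=remove\0SUBSYSTEM=pci\0PCI_SLOT_NAME=0000:81:00.0\0" ...
+ *
+ * yielding type == RTE_DEV_EVENT_REMOVE, subsystem ==
+ * EAL_DEV_EVENT_SUBSYSTEM_PCI and devname == "0000:81:00.0".
+ */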
+
+static void
+dev_delayed_unregister(void *param)
+{
+ rte_intr_callback_unregister(&intr_handle, dev_uev_handler, param);
+ close(intr_handle.fd);
+ intr_handle.fd = -1;
+}
+
+static void
+dev_uev_handler(__rte_unused void *param)
+{
+ struct rte_dev_event uevent;
+ int ret;
+ char buf[EAL_UEV_MSG_LEN];
+ struct rte_bus *bus;
+ struct rte_device *dev;
+ const char *busname = "";
+
+ memset(&uevent, 0, sizeof(struct rte_dev_event));
+ memset(buf, 0, EAL_UEV_MSG_LEN);
+
+ ret = recv(intr_handle.fd, buf, EAL_UEV_MSG_LEN, MSG_DONTWAIT);
+ if (ret < 0 && errno == EAGAIN)
+ return;
+ else if (ret <= 0) {
+ /* connection is closed or broken, it cannot be re-established. */
+ RTE_LOG(ERR, EAL, "uevent socket connection is broken.\n");
+ rte_eal_alarm_set(1, dev_delayed_unregister, NULL);
+ return;
+ }
+
+ ret = dev_uev_parse(buf, &uevent, EAL_UEV_MSG_LEN);
+ if (ret < 0) {
+ RTE_LOG(DEBUG, EAL, "It is not an valid event "
+ "that need to be handle.\n");
+ return;
+ }
+
+ RTE_LOG(DEBUG, EAL, "receive uevent(name:%s, type:%d, subsystem:%d)\n",
+ uevent.devname, uevent.type, uevent.subsystem);
+
+ switch (uevent.subsystem) {
+ case EAL_DEV_EVENT_SUBSYSTEM_PCI:
+ case EAL_DEV_EVENT_SUBSYSTEM_UIO:
+ busname = "pci";
+ break;
+ default:
+ break;
+ }
+
+ if (uevent.devname) {
+ if (uevent.type == RTE_DEV_EVENT_REMOVE && hotplug_handle) {
+ rte_spinlock_lock(&failure_handle_lock);
+ bus = rte_bus_find_by_name(busname);
+ if (bus == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n",
+ busname);
+ goto failure_handle_err;
+ }
+
+ dev = bus->find_device(NULL, cmp_dev_name,
+ uevent.devname);
+ if (dev == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot find device (%s) on "
+ "bus (%s)\n", uevent.devname, busname);
+ goto failure_handle_err;
+ }
+
+ ret = bus->hot_unplug_handler(dev);
+ if (ret) {
+ RTE_LOG(ERR, EAL, "Can not handle hot-unplug "
+ "for device (%s)\n", dev->name);
+ }
+ rte_spinlock_unlock(&failure_handle_lock);
+ }
+ rte_dev_event_callback_process(uevent.devname, uevent.type);
+ }
+
+ return;
+
+failure_handle_err:
+ rte_spinlock_unlock(&failure_handle_lock);
+}
+
+int
+rte_dev_event_monitor_start(void)
+{
+ int ret;
+
+ if (monitor_started)
+ return 0;
+
+ ret = dev_uev_socket_fd_create();
+ if (ret) {
+ RTE_LOG(ERR, EAL, "error create device event fd.\n");
+ return -1;
+ }
+
+ intr_handle.type = RTE_INTR_HANDLE_DEV_EVENT;
+ ret = rte_intr_callback_register(&intr_handle, dev_uev_handler, NULL);
+
+ if (ret) {
+ RTE_LOG(ERR, EAL, "fail to register uevent callback.\n");
+ return -1;
+ }
+
+ monitor_started = true;
+
+ return 0;
+}
+
+int
+rte_dev_event_monitor_stop(void)
+{
+ int ret;
+
+ if (!monitor_started)
+ return 0;
+
+ ret = rte_intr_callback_unregister(&intr_handle, dev_uev_handler,
+ (void *)-1);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "fail to unregister uevent callback.\n");
+ return ret;
+ }
+
+ close(intr_handle.fd);
+ intr_handle.fd = -1;
+ monitor_started = false;
+
+ return 0;
+}
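+
+/*
+ * Usage sketch (illustrative only; the callback name is hypothetical):
+ * an application registers a device event callback (NULL device name
+ * means "all devices") and then starts the monitor:
+ *
+ * static void dev_event_cb(const char *dev_name,
+ *                          enum rte_dev_event_type type, void *arg);
+ *
+ * rte_dev_event_callback_register(NULL, dev_event_cb, NULL);
+ * rte_dev_event_monitor_start();
+ */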
+
+int
+dev_sigbus_handler_register(void)
+{
+ sigset_t mask;
+ struct sigaction action;
+
+ rte_errno = 0;
+
+ if (sigbus_need_recover)
+ return 0;
+
+ sigemptyset(&mask);
+ sigaddset(&mask, SIGBUS);
+ action.sa_flags = SA_SIGINFO;
+ action.sa_mask = mask;
+ action.sa_sigaction = sigbus_handler;
+ sigbus_need_recover = !sigaction(SIGBUS, &action, &sigbus_action_old);
+
+ return rte_errno;
+}
+
+int
+dev_sigbus_handler_unregister(void)
+{
+ rte_errno = 0;
+
+ sigbus_action_recover();
+
+ return rte_errno;
+}
+
+int
+rte_dev_hotplug_handle_enable(void)
+{
+ int ret = 0;
+
+ ret = dev_sigbus_handler_register();
+ if (ret < 0)
+ RTE_LOG(ERR, EAL,
+ "fail to register sigbus handler for devices.\n");
+
+ hotplug_handle = true;
+
+ return ret;
+}
+
+int
+rte_dev_hotplug_handle_disable(void)
+{
+ int ret = 0;
+
+ ret = dev_sigbus_handler_unregister();
+ if (ret < 0)
+ RTE_LOG(ERR, EAL,
+ "fail to unregister sigbus handler for devices.\n");
+
+ hotplug_handle = false;
+
+ return ret;
+}
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#include <string.h>
+#include <sys/types.h>
+#include <sys/file.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <fnmatch.h>
+#include <inttypes.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/mman.h>
+#include <sys/queue.h>
+#include <sys/stat.h>
+
+#include <linux/mman.h> /* for hugetlb-related flags */
+
+#include <rte_memory.h>
+#include <rte_eal.h>
+#include <rte_launch.h>
+#include <rte_per_lcore.h>
+#include <rte_lcore.h>
+#include <rte_debug.h>
+#include <rte_log.h>
+#include <rte_common.h>
+#include "rte_string_fns.h"
+#include "eal_internal_cfg.h"
+#include "eal_hugepages.h"
+#include "eal_filesystem.h"
+
+static const char sys_dir_path[] = "/sys/kernel/mm/hugepages";
+static const char sys_pages_numa_dir_path[] = "/sys/devices/system/node";
+
+/*
+ * Uses mmap to create a shared memory area for storage of data.
+ * Used in this file to store the hugepage file map on disk.
+ */
+static void *
+map_shared_memory(const char *filename, const size_t mem_size, int flags)
+{
+ void *retval;
+ int fd = open(filename, flags, 0600);
+ if (fd < 0)
+ return NULL;
+ if (ftruncate(fd, mem_size) < 0) {
+ close(fd);
+ return NULL;
+ }
+ retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fd, 0);
+ close(fd);
+ /* callers test for NULL, so map mmap's failure value to NULL */
+ return retval == MAP_FAILED ? NULL : retval;
+}
+
+static void *
+open_shared_memory(const char *filename, const size_t mem_size)
+{
+ return map_shared_memory(filename, mem_size, O_RDWR);
+}
+
+static void *
+create_shared_memory(const char *filename, const size_t mem_size)
+{
+ return map_shared_memory(filename, mem_size, O_RDWR | O_CREAT);
+}
+
+static int get_hp_sysfs_value(const char *subdir, const char *file, unsigned long *val)
+{
+ char path[PATH_MAX];
+
+ snprintf(path, sizeof(path), "%s/%s/%s",
+ sys_dir_path, subdir, file);
+ return eal_parse_sysfs_value(path, val);
+}
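+
+/*
+ * For example (illustrative), get_hp_sysfs_value("hugepages-2048kB",
+ * "free_hugepages", &val) parses
+ * /sys/kernel/mm/hugepages/hugepages-2048kB/free_hugepages.
+ */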
+
+/* this function is only called from eal_hugepage_info_init which itself
+ * is only called from a primary process */
+static uint32_t
+get_num_hugepages(const char *subdir)
+{
+ unsigned long resv_pages, num_pages, over_pages, surplus_pages;
+ const char *nr_hp_file = "free_hugepages";
+ const char *nr_rsvd_file = "resv_hugepages";
+ const char *nr_over_file = "nr_overcommit_hugepages";
+ const char *nr_splus_file = "surplus_hugepages";
+
+ /* first, check how many reserved pages kernel reports */
+ if (get_hp_sysfs_value(subdir, nr_rsvd_file, &resv_pages) < 0)
+ return 0;
+
+ if (get_hp_sysfs_value(subdir, nr_hp_file, &num_pages) < 0)
+ return 0;
+
+ if (get_hp_sysfs_value(subdir, nr_over_file, &over_pages) < 0)
+ over_pages = 0;
+
+ if (get_hp_sysfs_value(subdir, nr_splus_file, &surplus_pages) < 0)
+ surplus_pages = 0;
+
+ /* adjust num_pages */
+ if (num_pages >= resv_pages)
+ num_pages -= resv_pages;
+ else if (resv_pages)
+ num_pages = 0;
+
+ if (over_pages >= surplus_pages)
+ over_pages -= surplus_pages;
+ else
+ over_pages = 0;
+
+ if (num_pages == 0 && over_pages == 0)
+ RTE_LOG(WARNING, EAL, "No available hugepages reported in %s\n",
+ subdir);
+
+ num_pages += over_pages;
+ if (num_pages < over_pages) /* overflow */
+ num_pages = UINT32_MAX;
+
+ /* we want to return a uint32_t and more than this looks suspicious
+ * anyway ... */
+ if (num_pages > UINT32_MAX)
+ num_pages = UINT32_MAX;
+
+ return num_pages;
+}
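+
+/*
+ * Worked example (illustrative): with free_hugepages=64, resv_hugepages=4,
+ * nr_overcommit_hugepages=8 and surplus_hugepages=2, the function above
+ * reports (64 - 4) + (8 - 2) = 66 available pages.
+ */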
+
+static uint32_t
+get_num_hugepages_on_node(const char *subdir, unsigned int socket)
+{
+ char path[PATH_MAX], socketpath[PATH_MAX];
+ DIR *socketdir;
+ unsigned long num_pages = 0;
+ const char *nr_hp_file = "free_hugepages";
+
+ snprintf(socketpath, sizeof(socketpath), "%s/node%u/hugepages",
+ sys_pages_numa_dir_path, socket);
+
+ socketdir = opendir(socketpath);
+ if (socketdir) {
+ /* Keep calm and carry on */
+ closedir(socketdir);
+ } else {
+ /* Can't find socket dir, so ignore it */
+ return 0;
+ }
+
+ snprintf(path, sizeof(path), "%s/%s/%s",
+ socketpath, subdir, nr_hp_file);
+ if (eal_parse_sysfs_value(path, &num_pages) < 0)
+ return 0;
+
+ if (num_pages == 0)
+ RTE_LOG(WARNING, EAL, "No free hugepages reported in %s\n",
+ subdir);
+
+ /*
+ * we want to return a uint32_t and more than this looks suspicious
+ * anyway ...
+ */
+ if (num_pages > UINT32_MAX)
+ num_pages = UINT32_MAX;
+
+ return num_pages;
+}
+
+static uint64_t
+get_default_hp_size(void)
+{
+ const char proc_meminfo[] = "/proc/meminfo";
+ const char str_hugepagesz[] = "Hugepagesize:";
+ unsigned hugepagesz_len = sizeof(str_hugepagesz) - 1;
+ char buffer[256];
+ unsigned long long size = 0;
+
+ FILE *fd = fopen(proc_meminfo, "r");
+ if (fd == NULL)
+ rte_panic("Cannot open %s\n", proc_meminfo);
+ while (fgets(buffer, sizeof(buffer), fd)) {
+ if (strncmp(buffer, str_hugepagesz, hugepagesz_len) == 0) {
+ size = rte_str_to_size(&buffer[hugepagesz_len]);
+ break;
+ }
+ }
+ fclose(fd);
+ if (size == 0)
+ rte_panic("Cannot get default hugepage size from %s\n", proc_meminfo);
+ return size;
+}
+
+static int
+get_hugepage_dir(uint64_t hugepage_sz, char *hugedir, int len)
+{
+ enum proc_mount_fieldnames {
+ DEVICE = 0,
+ MOUNTPT,
+ FSTYPE,
+ OPTIONS,
+ _FIELDNAME_MAX
+ };
+ static uint64_t default_size = 0;
+ const char proc_mounts[] = "/proc/mounts";
+ const char hugetlbfs_str[] = "hugetlbfs";
+ const size_t htlbfs_str_len = sizeof(hugetlbfs_str) - 1;
+ const char pagesize_opt[] = "pagesize=";
+ const size_t pagesize_opt_len = sizeof(pagesize_opt) - 1;
+ const char split_tok = ' ';
+ char *splitstr[_FIELDNAME_MAX];
+ char buf[BUFSIZ];
+ int retval = -1;
+
+ FILE *fd = fopen(proc_mounts, "r");
+ if (fd == NULL)
+ rte_panic("Cannot open %s\n", proc_mounts);
+
+ if (default_size == 0)
+ default_size = get_default_hp_size();
+
+ while (fgets(buf, sizeof(buf), fd)) {
+ if (rte_strsplit(buf, sizeof(buf), splitstr, _FIELDNAME_MAX,
+ split_tok) != _FIELDNAME_MAX) {
+ RTE_LOG(ERR, EAL, "Error parsing %s\n", proc_mounts);
+ break; /* return -1 */
+ }
+
+ /* we have a specified --huge-dir option, only examine that dir */
+ if (internal_config.hugepage_dir != NULL &&
+ strcmp(splitstr[MOUNTPT], internal_config.hugepage_dir) != 0)
+ continue;
+
+ if (strncmp(splitstr[FSTYPE], hugetlbfs_str, htlbfs_str_len) == 0) {
+ const char *pagesz_str = strstr(splitstr[OPTIONS], pagesize_opt);
+
+ /* if no explicit page size, the default page size is compared */
+ if (pagesz_str == NULL) {
+ if (hugepage_sz == default_size) {
+ strlcpy(hugedir, splitstr[MOUNTPT], len);
+ retval = 0;
+ break;
+ }
+ }
+ /* there is an explicit page size, so check it */
+ else {
+ uint64_t pagesz = rte_str_to_size(&pagesz_str[pagesize_opt_len]);
+ if (pagesz == hugepage_sz) {
+ strlcpy(hugedir, splitstr[MOUNTPT], len);
+ retval = 0;
+ break;
+ }
+ }
+ } /* end if strncmp hugetlbfs */
+ } /* end while fgets */
+
+ fclose(fd);
+ return retval;
+}
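+
+/*
+ * Example /proc/mounts line matched above (illustrative):
+ *
+ * nodev /mnt/huge hugetlbfs rw,relatime,pagesize=2M 0 0
+ *
+ * rte_strsplit() yields DEVICE="nodev", MOUNTPT="/mnt/huge",
+ * FSTYPE="hugetlbfs" and OPTIONS holding the remainder; the pagesize=
+ * option, if present, is compared against the requested hugepage size.
+ */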
+
+/*
+ * Clear the hugepage directory of whatever hugepage files
+ * there are. Checks if the file is locked (i.e.
+ * if it's in use by another DPDK process).
+ */
+static int
+clear_hugedir(const char * hugedir)
+{
+ DIR *dir;
+ struct dirent *dirent;
+ int dir_fd, fd, lck_result;
+ const char filter[] = "*map_*"; /* matches hugepage files */
+
+ /* open directory */
+ dir = opendir(hugedir);
+ if (!dir) {
+ RTE_LOG(ERR, EAL, "Unable to open hugepage directory %s\n",
+ hugedir);
+ goto error;
+ }
+ dir_fd = dirfd(dir);
+
+ dirent = readdir(dir);
+ if (!dirent) {
+ RTE_LOG(ERR, EAL, "Unable to read hugepage directory %s\n",
+ hugedir);
+ goto error;
+ }
+
+ while (dirent != NULL) {
+ /* skip files that don't match the hugepage pattern */
+ if (fnmatch(filter, dirent->d_name, 0) > 0) {
+ dirent = readdir(dir);
+ continue;
+ }
+
+ /* try and lock the file */
+ fd = openat(dir_fd, dirent->d_name, O_RDONLY);
+
+ /* skip to next file */
+ if (fd == -1) {
+ dirent = readdir(dir);
+ continue;
+ }
+
+ /* non-blocking lock */
+ lck_result = flock(fd, LOCK_EX | LOCK_NB);
+
+ /* if lock succeeds, remove the file */
+ if (lck_result != -1)
+ unlinkat(dir_fd, dirent->d_name, 0);
+ close(fd);
+ dirent = readdir(dir);
+ }
+
+ closedir(dir);
+ return 0;
+
+error:
+ if (dir)
+ closedir(dir);
+
+ RTE_LOG(ERR, EAL, "Error while clearing hugepage dir: %s\n",
+ strerror(errno));
+
+ return -1;
+}
+
+static int
+compare_hpi(const void *a, const void *b)
+{
+ const struct hugepage_info *hpi_a = a;
+ const struct hugepage_info *hpi_b = b;
+
+ /* compare explicitly rather than subtracting: the uint64_t
+ * difference can truncate when converted to int */
+ if (hpi_b->hugepage_sz > hpi_a->hugepage_sz)
+ return 1;
+ if (hpi_b->hugepage_sz < hpi_a->hugepage_sz)
+ return -1;
+ return 0;
+}
+
+static void
+calc_num_pages(struct hugepage_info *hpi, struct dirent *dirent)
+{
+ uint64_t total_pages = 0;
+ unsigned int i;
+
+ /*
+ * first, try to put all hugepages into relevant sockets, but
+ * if the first attempt fails, fall back to collecting all pages
+ * in one socket and sorting them later
+ */
+ total_pages = 0;
+ /* we also don't want to do this for legacy init */
+ if (!internal_config.legacy_mem)
+ for (i = 0; i < rte_socket_count(); i++) {
+ int socket = rte_socket_id_by_idx(i);
+ unsigned int num_pages =
+ get_num_hugepages_on_node(
+ dirent->d_name, socket);
+ hpi->num_pages[socket] = num_pages;
+ total_pages += num_pages;
+ }
+ /*
+ * we failed to sort memory from the get go, so fall
+ * back to old way
+ */
+ if (total_pages == 0) {
+ hpi->num_pages[0] = get_num_hugepages(dirent->d_name);
+
+#ifndef RTE_ARCH_64
+ /* for 32-bit systems, limit number of hugepages to
+ * 1GB per page size */
+ hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0],
+ RTE_PGSIZE_1G / hpi->hugepage_sz);
+#endif
+ }
+}
+
+static int
+hugepage_info_init(void)
+{ const char dirent_start_text[] = "hugepages-";
+ const size_t dirent_start_len = sizeof(dirent_start_text) - 1;
+ unsigned int i, num_sizes = 0;
+ DIR *dir;
+ struct dirent *dirent;
+
+ dir = opendir(sys_dir_path);
+ if (dir == NULL) {
+ RTE_LOG(ERR, EAL,
+ "Cannot open directory %s to read system hugepage info\n",
+ sys_dir_path);
+ return -1;
+ }
+
+ for (dirent = readdir(dir); dirent != NULL; dirent = readdir(dir)) {
+ struct hugepage_info *hpi;
+
+ if (strncmp(dirent->d_name, dirent_start_text,
+ dirent_start_len) != 0)
+ continue;
+
+ if (num_sizes >= MAX_HUGEPAGE_SIZES)
+ break;
+
+ hpi = &internal_config.hugepage_info[num_sizes];
+ hpi->hugepage_sz =
+ rte_str_to_size(&dirent->d_name[dirent_start_len]);
+
+ /* first, check if we have a mountpoint */
+ if (get_hugepage_dir(hpi->hugepage_sz,
+ hpi->hugedir, sizeof(hpi->hugedir)) < 0) {
+ uint32_t num_pages;
+
+ num_pages = get_num_hugepages(dirent->d_name);
+ if (num_pages > 0)
+ RTE_LOG(NOTICE, EAL,
+ "%" PRIu32 " hugepages of size "
+ "%" PRIu64 " reserved, but no mounted "
+ "hugetlbfs found for that size\n",
+ num_pages, hpi->hugepage_sz);
+ /* if we have kernel support for reserving hugepages
+ * through mmap, and we're in in-memory mode, treat this
+ * page size as valid. we cannot be in legacy mode at
+ * this point because we've checked this earlier in the
+ * init process.
+ */
+#ifdef MAP_HUGE_SHIFT
+ if (internal_config.in_memory) {
+ RTE_LOG(DEBUG, EAL, "In-memory mode enabled, "
+ "hugepages of size %" PRIu64 " bytes "
+ "will be allocated anonymously\n",
+ hpi->hugepage_sz);
+ calc_num_pages(hpi, dirent);
+ num_sizes++;
+ }
+#endif
+ continue;
+ }
+
+ /* take an exclusive lock on the hugepage directory */
+ hpi->lock_descriptor = open(hpi->hugedir, O_RDONLY);
+
+ /* if blocking lock failed */
+ if (flock(hpi->lock_descriptor, LOCK_EX) == -1) {
+ RTE_LOG(CRIT, EAL,
+ "Failed to lock hugepage directory!\n");
+ break;
+ }
+ /* clear out the hugepages dir from unused pages */
+ if (clear_hugedir(hpi->hugedir) == -1)
+ break;
+
+ calc_num_pages(hpi, dirent);
+
+ num_sizes++;
+ }
+ closedir(dir);
+
+ /* something went wrong, and we broke from the for loop above */
+ if (dirent != NULL)
+ return -1;
+
+ internal_config.num_hugepage_sizes = num_sizes;
+
+ /* sort the page directory entries by size, largest to smallest */
+ qsort(&internal_config.hugepage_info[0], num_sizes,
+ sizeof(internal_config.hugepage_info[0]), compare_hpi);
+
+ /* now we have all info, check we have at least one valid size */
+ for (i = 0; i < num_sizes; i++) {
+ /* pages may no longer all be on socket 0, so check all */
+ unsigned int j, num_pages = 0;
+ struct hugepage_info *hpi = &internal_config.hugepage_info[i];
+
+ for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
+ num_pages += hpi->num_pages[j];
+ if (num_pages > 0)
+ return 0;
+ }
+
+ /* no valid hugepage mounts available, return error */
+ return -1;
+}
+
+/*
+ * when we initialize the hugepage info, everything goes
+ * to socket 0 by default. it will later get sorted by memory
+ * initialization procedure.
+ */
+int
+eal_hugepage_info_init(void)
+{
+ struct hugepage_info *hpi, *tmp_hpi;
+ unsigned int i;
+
+ if (hugepage_info_init() < 0)
+ return -1;
+
+ /* for no shared files mode, we're done */
+ if (internal_config.no_shconf)
+ return 0;
+
+ hpi = &internal_config.hugepage_info[0];
+
+ tmp_hpi = create_shared_memory(eal_hugepage_info_path(),
+ sizeof(internal_config.hugepage_info));
+ if (tmp_hpi == NULL) {
+ RTE_LOG(ERR, EAL, "Failed to create shared memory!\n");
+ return -1;
+ }
+
+ memcpy(tmp_hpi, hpi, sizeof(internal_config.hugepage_info));
+
+ /* we've copied file descriptors along with everything else, but they
+ * will be invalid in secondary process, so overwrite them
+ */
+ for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) {
+ struct hugepage_info *tmp = &tmp_hpi[i];
+ tmp->lock_descriptor = -1;
+ }
+
+ if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) {
+ RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n");
+ return -1;
+ }
+ return 0;
+}
+
+int eal_hugepage_info_read(void)
+{
+ struct hugepage_info *hpi = &internal_config.hugepage_info[0];
+ struct hugepage_info *tmp_hpi;
+
+ tmp_hpi = open_shared_memory(eal_hugepage_info_path(),
+ sizeof(internal_config.hugepage_info));
+ if (tmp_hpi == NULL) {
+ RTE_LOG(ERR, EAL, "Failed to open shared memory!\n");
+ return -1;
+ }
+
+ memcpy(hpi, tmp_hpi, sizeof(internal_config.hugepage_info));
+
+ if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) {
+ RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n");
+ return -1;
+ }
+ return 0;
+}
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <pthread.h>
+#include <sys/queue.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <sys/epoll.h>
+#include <sys/signalfd.h>
+#include <sys/ioctl.h>
+#include <sys/eventfd.h>
+#include <assert.h>
+#include <stdbool.h>
+
+#include <rte_common.h>
+#include <rte_interrupts.h>
+#include <rte_memory.h>
+#include <rte_launch.h>
+#include <rte_eal.h>
+#include <rte_per_lcore.h>
+#include <rte_lcore.h>
+#include <rte_atomic.h>
+#include <rte_branch_prediction.h>
+#include <rte_debug.h>
+#include <rte_log.h>
+#include <rte_errno.h>
+#include <rte_spinlock.h>
+#include <rte_pause.h>
+#include <rte_vfio.h>
+
+#include "eal_private.h"
+#include "eal_vfio.h"
+#include "eal_thread.h"
+
+#define EAL_INTR_EPOLL_WAIT_FOREVER (-1)
+#define NB_OTHER_INTR 1
+
+static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */
+
+/**
+ * union for pipe fds.
+ */
+union intr_pipefds {
+ struct {
+ int pipefd[2];
+ };
+ struct {
+ int readfd;
+ int writefd;
+ };
+};
+
+/**
+ * union buffer for reading on different devices
+ */
+union rte_intr_read_buffer {
+ int uio_intr_count; /* for uio device */
+#ifdef VFIO_PRESENT
+ uint64_t vfio_intr_count; /* for vfio device */
+#endif
+ uint64_t timerfd_num; /* for timerfd */
+ char charbuf[16]; /* for others */
+};
+
+TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback);
+TAILQ_HEAD(rte_intr_source_list, rte_intr_source);
+
+struct rte_intr_callback {
+ TAILQ_ENTRY(rte_intr_callback) next;
+ rte_intr_callback_fn cb_fn; /**< callback address */
+ void *cb_arg; /**< parameter for callback */
+ uint8_t pending_delete; /**< delete after callback is called */
+ rte_intr_unregister_callback_fn ucb_fn; /**< fn to call before cb is deleted */
+};
+
+struct rte_intr_source {
+ TAILQ_ENTRY(rte_intr_source) next;
+ struct rte_intr_handle intr_handle; /**< interrupt handle */
+ struct rte_intr_cb_list callbacks; /**< user callbacks */
+ uint32_t active;
+};
+
+/* global spinlock for interrupt data operation */
+static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER;
+
+/* union buffer for pipe read/write */
+static union intr_pipefds intr_pipe;
+
+/* interrupt sources list */
+static struct rte_intr_source_list intr_sources;
+
+/* interrupt handling thread */
+static pthread_t intr_thread;
+
+/* VFIO interrupts */
+#ifdef VFIO_PRESENT
+
+#define IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + sizeof(int))
+/* irq set buffer length for queue interrupts and LSC interrupt */
+#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
+ sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1))
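+
+/*
+ * struct vfio_irq_set ends in a flexible array member, so the ioctl
+ * argument is built in a stack buffer: the fixed header followed by a
+ * single eventfd (INTx/MSI/req), or by up to RTE_MAX_RXTX_INTR_VEC_ID + 1
+ * eventfds for MSI-X, as sized by the two macros above.
+ */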
+
+/* enable legacy (INTx) interrupts */
+static int
+vfio_enable_intx(const struct rte_intr_handle *intr_handle) {
+ struct vfio_irq_set *irq_set;
+ char irq_set_buf[IRQ_SET_BUF_LEN];
+ int len, ret;
+ int *fd_ptr;
+
+ len = sizeof(irq_set_buf);
+
+ /* enable INTx */
+ irq_set = (struct vfio_irq_set *) irq_set_buf;
+ irq_set->argsz = len;
+ irq_set->count = 1;
+ irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
+ irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
+ irq_set->start = 0;
+ fd_ptr = (int *) &irq_set->data;
+ *fd_ptr = intr_handle->fd;
+
+ ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+ if (ret) {
+ RTE_LOG(ERR, EAL, "Error enabling INTx interrupts for fd %d\n",
+ intr_handle->fd);
+ return -1;
+ }
+
+ /* unmask INTx after enabling */
+ memset(irq_set, 0, len);
+ len = sizeof(struct vfio_irq_set);
+ irq_set->argsz = len;
+ irq_set->count = 1;
+ irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
+ irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
+ irq_set->start = 0;
+
+ ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+ if (ret) {
+ RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
+ intr_handle->fd);
+ return -1;
+ }
+ return 0;
+}
+
+/* disable legacy (INTx) interrupts */
+static int
+vfio_disable_intx(const struct rte_intr_handle *intr_handle) {
+ struct vfio_irq_set *irq_set;
+ char irq_set_buf[IRQ_SET_BUF_LEN];
+ int len, ret;
+
+ len = sizeof(struct vfio_irq_set);
+
+ /* mask interrupts before disabling */
+ irq_set = (struct vfio_irq_set *) irq_set_buf;
+ irq_set->argsz = len;
+ irq_set->count = 1;
+ irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
+ irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
+ irq_set->start = 0;
+
+ ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+ if (ret) {
+ RTE_LOG(ERR, EAL, "Error masking INTx interrupts for fd %d\n",
+ intr_handle->fd);
+ return -1;
+ }
+
+ /* disable INTx*/
+ memset(irq_set, 0, len);
+ irq_set->argsz = len;
+ irq_set->count = 0;
+ irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
+ irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
+ irq_set->start = 0;
+
+ ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+ if (ret) {
+ RTE_LOG(ERR, EAL,
+ "Error disabling INTx interrupts for fd %d\n", intr_handle->fd);
+ return -1;
+ }
+ return 0;
+}
+
+/* unmask/ack legacy (INTx) interrupts */
+static int
+vfio_ack_intx(const struct rte_intr_handle *intr_handle)
+{
+ struct vfio_irq_set irq_set;
+
+ /* unmask INTx */
+ memset(&irq_set, 0, sizeof(irq_set));
+ irq_set.argsz = sizeof(irq_set);
+ irq_set.count = 1;
+ irq_set.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
+ irq_set.index = VFIO_PCI_INTX_IRQ_INDEX;
+ irq_set.start = 0;
+
+ if (ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, &irq_set)) {
+ RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
+ intr_handle->fd);
+ return -1;
+ }
+ return 0;
+}
+
+/* enable MSI interrupts */
+static int
+vfio_enable_msi(const struct rte_intr_handle *intr_handle) {
+ int len, ret;
+ char irq_set_buf[IRQ_SET_BUF_LEN];
+ struct vfio_irq_set *irq_set;
+ int *fd_ptr;
+
+ len = sizeof(irq_set_buf);
+
+ irq_set = (struct vfio_irq_set *) irq_set_buf;
+ irq_set->argsz = len;
+ irq_set->count = 1;
+ irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
+ irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
+ irq_set->start = 0;
+ fd_ptr = (int *) &irq_set->data;
+ *fd_ptr = intr_handle->fd;
+
+ ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+ if (ret) {
+ RTE_LOG(ERR, EAL, "Error enabling MSI interrupts for fd %d\n",
+ intr_handle->fd);
+ return -1;
+ }
+ return 0;
+}
+
+/* disable MSI interrupts */
+static int
+vfio_disable_msi(const struct rte_intr_handle *intr_handle) {
+ struct vfio_irq_set *irq_set;
+ char irq_set_buf[IRQ_SET_BUF_LEN];
+ int len, ret;
+
+ len = sizeof(struct vfio_irq_set);
+
+ irq_set = (struct vfio_irq_set *) irq_set_buf;
+ irq_set->argsz = len;
+ irq_set->count = 0;
+ irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
+ irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
+ irq_set->start = 0;
+
+ ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+ if (ret)
+ RTE_LOG(ERR, EAL,
+ "Error disabling MSI interrupts for fd %d\n", intr_handle->fd);
+
+ return ret;
+}
+
+/* enable MSI-X interrupts */
+static int
+vfio_enable_msix(const struct rte_intr_handle *intr_handle) {
+ int len, ret;
+ char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
+ struct vfio_irq_set *irq_set;
+ int *fd_ptr;
+
+ len = sizeof(irq_set_buf);
+
+ irq_set = (struct vfio_irq_set *) irq_set_buf;
+ irq_set->argsz = len;
+ /* 0 < irq_set->count < RTE_MAX_RXTX_INTR_VEC_ID + 1 */
+ irq_set->count = intr_handle->max_intr ?
+ (intr_handle->max_intr > RTE_MAX_RXTX_INTR_VEC_ID + 1 ?
+ RTE_MAX_RXTX_INTR_VEC_ID + 1 : intr_handle->max_intr) : 1;
+ irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
+ irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
+ irq_set->start = 0;
+ fd_ptr = (int *) &irq_set->data;
+ /* INTR vector offset 0 is reserved for the non-efd mapping */
+ fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = intr_handle->fd;
+ memcpy(&fd_ptr[RTE_INTR_VEC_RXTX_OFFSET], intr_handle->efds,
+ sizeof(*intr_handle->efds) * intr_handle->nb_efd);
+
+ ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+ if (ret) {
+ RTE_LOG(ERR, EAL, "Error enabling MSI-X interrupts for fd %d\n",
+ intr_handle->fd);
+ return -1;
+ }
+
+ return 0;
+}
+
+/* disable MSI-X interrupts */
+static int
+vfio_disable_msix(const struct rte_intr_handle *intr_handle) {
+ struct vfio_irq_set *irq_set;
+ char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
+ int len, ret;
+
+ len = sizeof(struct vfio_irq_set);
+
+ irq_set = (struct vfio_irq_set *) irq_set_buf;
+ irq_set->argsz = len;
+ irq_set->count = 0;
+ irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
+ irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
+ irq_set->start = 0;
+
+ ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+ if (ret)
+ RTE_LOG(ERR, EAL,
+ "Error disabling MSI-X interrupts for fd %d\n", intr_handle->fd);
+
+ return ret;
+}
+
+#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
+/* enable req notifier */
+static int
+vfio_enable_req(const struct rte_intr_handle *intr_handle)
+{
+ int len, ret;
+ char irq_set_buf[IRQ_SET_BUF_LEN];
+ struct vfio_irq_set *irq_set;
+ int *fd_ptr;
+
+ len = sizeof(irq_set_buf);
+
+ irq_set = (struct vfio_irq_set *) irq_set_buf;
+ irq_set->argsz = len;
+ irq_set->count = 1;
+ irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
+ VFIO_IRQ_SET_ACTION_TRIGGER;
+ irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
+ irq_set->start = 0;
+ fd_ptr = (int *) &irq_set->data;
+ *fd_ptr = intr_handle->fd;
+
+ ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+ if (ret) {
+ RTE_LOG(ERR, EAL, "Error enabling req interrupts for fd %d\n",
+ intr_handle->fd);
+ return -1;
+ }
+
+ return 0;
+}
+
+/* disable req notifier */
+static int
+vfio_disable_req(const struct rte_intr_handle *intr_handle)
+{
+ struct vfio_irq_set *irq_set;
+ char irq_set_buf[IRQ_SET_BUF_LEN];
+ int len, ret;
+
+ len = sizeof(struct vfio_irq_set);
+
+ irq_set = (struct vfio_irq_set *) irq_set_buf;
+ irq_set->argsz = len;
+ irq_set->count = 0;
+ irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
+ irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
+ irq_set->start = 0;
+
+ ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+ if (ret)
+ RTE_LOG(ERR, EAL, "Error disabling req interrupts for fd %d\n",
+ intr_handle->fd);
+
+ return ret;
+}
+#endif
+#endif
+
+static int
+uio_intx_intr_disable(const struct rte_intr_handle *intr_handle)
+{
+ unsigned char command_high;
+
+ /* use UIO config file descriptor for uio_pci_generic */
+ if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
+ RTE_LOG(ERR, EAL,
+ "Error reading interrupts status for fd %d\n",
+ intr_handle->uio_cfg_fd);
+ return -1;
+ }
+ /* disable interrupts */
+ command_high |= 0x4;
+ if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
+ RTE_LOG(ERR, EAL,
+ "Error disabling interrupts for fd %d\n",
+ intr_handle->uio_cfg_fd);
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+uio_intx_intr_enable(const struct rte_intr_handle *intr_handle)
+{
+ unsigned char command_high;
+
+ /* use UIO config file descriptor for uio_pci_generic */
+ if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
+ RTE_LOG(ERR, EAL,
+ "Error reading interrupts status for fd %d\n",
+ intr_handle->uio_cfg_fd);
+ return -1;
+ }
+ /* enable interrupts */
+ command_high &= ~0x4;
+ if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
+ RTE_LOG(ERR, EAL,
+ "Error enabling interrupts for fd %d\n",
+ intr_handle->uio_cfg_fd);
+ return -1;
+ }
+
+ return 0;
+}
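+
+/*
+ * Note on the constants above: offset 5 is the high byte of the 16-bit
+ * PCI command register (config space offset 4), and bit 2 of that byte
+ * (mask 0x4) is the Interrupt Disable bit (command register bit 10):
+ * setting it masks legacy INTx, clearing it re-enables delivery.
+ */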
+
+static int
+uio_intr_disable(const struct rte_intr_handle *intr_handle)
+{
+ const int value = 0;
+
+ if (write(intr_handle->fd, &value, sizeof(value)) < 0) {
+ RTE_LOG(ERR, EAL,
+ "Error disabling interrupts for fd %d (%s)\n",
+ intr_handle->fd, strerror(errno));
+ return -1;
+ }
+ return 0;
+}
+
+static int
+uio_intr_enable(const struct rte_intr_handle *intr_handle)
+{
+ const int value = 1;
+
+ if (write(intr_handle->fd, &value, sizeof(value)) < 0) {
+ RTE_LOG(ERR, EAL,
+ "Error enabling interrupts for fd %d (%s)\n",
+ intr_handle->fd, strerror(errno));
+ return -1;
+ }
+ return 0;
+}
+
+int
+rte_intr_callback_register(const struct rte_intr_handle *intr_handle,
+ rte_intr_callback_fn cb, void *cb_arg)
+{
+ int ret, wake_thread;
+ struct rte_intr_source *src;
+ struct rte_intr_callback *callback;
+
+ wake_thread = 0;
+
+ /* first do parameter checking */
+ if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) {
+ RTE_LOG(ERR, EAL,
+ "Registering with invalid input parameter\n");
+ return -EINVAL;
+ }
+
+ /* allocate a new interrupt callback entity */
+ callback = calloc(1, sizeof(*callback));
+ if (callback == NULL) {
+ RTE_LOG(ERR, EAL, "Can not allocate memory\n");
+ return -ENOMEM;
+ }
+ callback->cb_fn = cb;
+ callback->cb_arg = cb_arg;
+ callback->pending_delete = 0;
+ callback->ucb_fn = NULL;
+
+ rte_spinlock_lock(&intr_lock);
+
+ /* check if there is at least one callback registered for the fd */
+ TAILQ_FOREACH(src, &intr_sources, next) {
+ if (src->intr_handle.fd == intr_handle->fd) {
+ /* the source previously had no callbacks, so its fd
+ * is not yet in the interrupt thread's wait list */
+ if (TAILQ_EMPTY(&src->callbacks))
+ wake_thread = 1;
+
+ TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
+ ret = 0;
+ break;
+ }
+ }
+
+ /* no existing callbacks for this - add new source */
+ if (src == NULL) {
+ src = calloc(1, sizeof(*src));
+ if (src == NULL) {
+ RTE_LOG(ERR, EAL, "Can not allocate memory\n");
+ free(callback);
+ ret = -ENOMEM;
+ } else {
+ src->intr_handle = *intr_handle;
+ TAILQ_INIT(&src->callbacks);
+ TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
+ TAILQ_INSERT_TAIL(&intr_sources, src, next);
+ wake_thread = 1;
+ ret = 0;
+ }
+ }
+
+ rte_spinlock_unlock(&intr_lock);
+
+ /**
+ * check whether we need to notify the pipe fd waited on by
+ * epoll_wait so that it rebuilds its wait list.
+ */
+ if (wake_thread)
+ if (write(intr_pipe.writefd, "1", 1) < 0)
+ return -EPIPE;
+
+ return ret;
+}
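+
+/*
+ * Usage sketch (illustrative only; callback and context names are
+ * hypothetical): any pollable fd can be wrapped in an interrupt handle,
+ * e.g. an eventfd serviced by the EAL interrupt thread:
+ *
+ * struct rte_intr_handle ih = {
+ *         .type = RTE_INTR_HANDLE_EXT,
+ *         .fd = eventfd(0, EFD_NONBLOCK),
+ * };
+ * rte_intr_callback_register(&ih, my_irq_cb, my_ctx);
+ */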
+
+int
+rte_intr_callback_unregister_pending(const struct rte_intr_handle *intr_handle,
+ rte_intr_callback_fn cb_fn, void *cb_arg,
+ rte_intr_unregister_callback_fn ucb_fn)
+{
+ int ret;
+ struct rte_intr_source *src;
+ struct rte_intr_callback *cb, *next;
+
+ /* do parameter checking first */
+ if (intr_handle == NULL || intr_handle->fd < 0) {
+ RTE_LOG(ERR, EAL,
+ "Unregistering with invalid input parameter\n");
+ return -EINVAL;
+ }
+
+ rte_spinlock_lock(&intr_lock);
+
+ /* check if an interrupt source exists for the fd */
+ TAILQ_FOREACH(src, &intr_sources, next)
+ if (src->intr_handle.fd == intr_handle->fd)
+ break;
+
+ /* No interrupt source registered for the fd */
+ if (src == NULL) {
+ ret = -ENOENT;
+
+ /* only usable if the source is active */
+ } else if (src->active == 0) {
+ ret = -EAGAIN;
+
+ } else {
+ ret = 0;
+
+ /* walk through the callbacks and mark all that match. */
+ for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
+ next = TAILQ_NEXT(cb, next);
+ if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
+ cb->cb_arg == cb_arg)) {
+ cb->pending_delete = 1;
+ cb->ucb_fn = ucb_fn;
+ ret++;
+ }
+ }
+ }
+
+ rte_spinlock_unlock(&intr_lock);
+
+ return ret;
+}
+
+int
+rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle,
+ rte_intr_callback_fn cb_fn, void *cb_arg)
+{
+ int ret;
+ struct rte_intr_source *src;
+ struct rte_intr_callback *cb, *next;
+
+ /* do parameter checking first */
+ if (intr_handle == NULL || intr_handle->fd < 0) {
+ RTE_LOG(ERR, EAL,
+ "Unregistering with invalid input parameter\n");
+ return -EINVAL;
+ }
+
+ rte_spinlock_lock(&intr_lock);
+
+ /* check if an interrupt source exists for the fd */
+ TAILQ_FOREACH(src, &intr_sources, next)
+ if (src->intr_handle.fd == intr_handle->fd)
+ break;
+
+ /* No interrupt source registered for the fd */
+ if (src == NULL) {
+ ret = -ENOENT;
+
+ /* interrupt source has some active callbacks right now. */
+ } else if (src->active != 0) {
+ ret = -EAGAIN;
+
+ /* ok to remove. */
+ } else {
+ ret = 0;
+
+ /* walk through the callbacks and remove all that match. */
+ for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
+
+ next = TAILQ_NEXT(cb, next);
+
+ if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
+ cb->cb_arg == cb_arg)) {
+ TAILQ_REMOVE(&src->callbacks, cb, next);
+ free(cb);
+ ret++;
+ }
+ }
+
+ /* all callbacks for that source are removed. */
+ if (TAILQ_EMPTY(&src->callbacks)) {
+ TAILQ_REMOVE(&intr_sources, src, next);
+ free(src);
+ }
+ }
+
+ rte_spinlock_unlock(&intr_lock);
+
+ /* notify the pipe fd waited on by epoll_wait to rebuild the wait list */
+ if (ret >= 0 && write(intr_pipe.writefd, "1", 1) < 0) {
+ ret = -EPIPE;
+ }
+
+ return ret;
+}
+
+int
+rte_intr_enable(const struct rte_intr_handle *intr_handle)
+{
+ if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV)
+ return 0;
+
+ if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
+ return -1;
+
+ switch (intr_handle->type){
+ /* write to the uio fd to enable the interrupt */
+ case RTE_INTR_HANDLE_UIO:
+ if (uio_intr_enable(intr_handle))
+ return -1;
+ break;
+ case RTE_INTR_HANDLE_UIO_INTX:
+ if (uio_intx_intr_enable(intr_handle))
+ return -1;
+ break;
+ /* not used at this moment */
+ case RTE_INTR_HANDLE_ALARM:
+ return -1;
+#ifdef VFIO_PRESENT
+ case RTE_INTR_HANDLE_VFIO_MSIX:
+ if (vfio_enable_msix(intr_handle))
+ return -1;
+ break;
+ case RTE_INTR_HANDLE_VFIO_MSI:
+ if (vfio_enable_msi(intr_handle))
+ return -1;
+ break;
+ case RTE_INTR_HANDLE_VFIO_LEGACY:
+ if (vfio_enable_intx(intr_handle))
+ return -1;
+ break;
+#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
+ case RTE_INTR_HANDLE_VFIO_REQ:
+ if (vfio_enable_req(intr_handle))
+ return -1;
+ break;
+#endif
+#endif
+ /* not used at this moment */
+ case RTE_INTR_HANDLE_DEV_EVENT:
+ return -1;
+ /* unknown handle type */
+ default:
+ RTE_LOG(ERR, EAL,
+ "Unknown handle type of fd %d\n",
+ intr_handle->fd);
+ return -1;
+ }
+
+ return 0;
+}
+
+/**
+ * PMD generally calls this function at the end of its IRQ callback.
+ * Internally, it unmasks the interrupt if possible.
+ *
+ * For INTx, unmasking is required as the interrupt is auto-masked prior to
+ * invoking callback.
+ *
+ * For MSI/MSI-X, unmasking is typically not needed as the interrupt is not
+ * auto-masked. In fact, for interrupt handle types VFIO_MSIX and VFIO_MSI,
+ * this function is a no-op.
+ */
+int
+rte_intr_ack(const struct rte_intr_handle *intr_handle)
+{
+ if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV)
+ return 0;
+
+ if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
+ return -1;
+
+ switch (intr_handle->type) {
+ /* Both acking and enabling are same for UIO */
+ case RTE_INTR_HANDLE_UIO:
+ if (uio_intr_enable(intr_handle))
+ return -1;
+ break;
+ case RTE_INTR_HANDLE_UIO_INTX:
+ if (uio_intx_intr_enable(intr_handle))
+ return -1;
+ break;
+ /* not used at this moment */
+ case RTE_INTR_HANDLE_ALARM:
+ return -1;
+#ifdef VFIO_PRESENT
+ /* VFIO MSI* is implicitly acked unlike INTx, nothing to do */
+ case RTE_INTR_HANDLE_VFIO_MSIX:
+ case RTE_INTR_HANDLE_VFIO_MSI:
+ return 0;
+ case RTE_INTR_HANDLE_VFIO_LEGACY:
+ if (vfio_ack_intx(intr_handle))
+ return -1;
+ break;
+#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
+ case RTE_INTR_HANDLE_VFIO_REQ:
+ return -1;
+#endif
+#endif
+ /* not used at this moment */
+ case RTE_INTR_HANDLE_DEV_EVENT:
+ return -1;
+ /* unknown handle type */
+ default:
+ RTE_LOG(ERR, EAL, "Unknown handle type of fd %d\n",
+ intr_handle->fd);
+ return -1;
+ }
+
+ return 0;
+}
+
+int
+rte_intr_disable(const struct rte_intr_handle *intr_handle)
+{
+ if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV)
+ return 0;
+
+ if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
+ return -1;
+
+ switch (intr_handle->type){
+ /* write to the uio fd to disable the interrupt */
+ case RTE_INTR_HANDLE_UIO:
+ if (uio_intr_disable(intr_handle))
+ return -1;
+ break;
+ case RTE_INTR_HANDLE_UIO_INTX:
+ if (uio_intx_intr_disable(intr_handle))
+ return -1;
+ break;
+ /* not used at this moment */
+ case RTE_INTR_HANDLE_ALARM:
+ return -1;
+#ifdef VFIO_PRESENT
+ case RTE_INTR_HANDLE_VFIO_MSIX:
+ if (vfio_disable_msix(intr_handle))
+ return -1;
+ break;
+ case RTE_INTR_HANDLE_VFIO_MSI:
+ if (vfio_disable_msi(intr_handle))
+ return -1;
+ break;
+ case RTE_INTR_HANDLE_VFIO_LEGACY:
+ if (vfio_disable_intx(intr_handle))
+ return -1;
+ break;
+#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
+ case RTE_INTR_HANDLE_VFIO_REQ:
+ if (vfio_disable_req(intr_handle))
+ return -1;
+ break;
+#endif
+#endif
+ /* not used at this moment */
+ case RTE_INTR_HANDLE_DEV_EVENT:
+ return -1;
+ /* unknown handle type */
+ default:
+ RTE_LOG(ERR, EAL,
+ "Unknown handle type of fd %d\n",
+ intr_handle->fd);
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+eal_intr_process_interrupts(struct epoll_event *events, int nfds)
+{
+ bool call = false;
+ int n, bytes_read, rv;
+ struct rte_intr_source *src;
+ struct rte_intr_callback *cb, *next;
+ union rte_intr_read_buffer buf;
+ struct rte_intr_callback active_cb;
+
+ for (n = 0; n < nfds; n++) {
+
+ /**
+ * if the pipe fd is ready to read, return so that the
+ * wait list is rebuilt.
+ */
+ if (events[n].data.fd == intr_pipe.readfd){
+ int r = read(intr_pipe.readfd, buf.charbuf,
+ sizeof(buf.charbuf));
+ RTE_SET_USED(r);
+ return -1;
+ }
+ rte_spinlock_lock(&intr_lock);
+ TAILQ_FOREACH(src, &intr_sources, next)
+ if (src->intr_handle.fd ==
+ events[n].data.fd)
+ break;
+ if (src == NULL){
+ rte_spinlock_unlock(&intr_lock);
+ continue;
+ }
+
+ /* mark this interrupt source as active and release the lock. */
+ src->active = 1;
+ rte_spinlock_unlock(&intr_lock);
+
+ /* set the length to be read for the different handle types */
+ switch (src->intr_handle.type) {
+ case RTE_INTR_HANDLE_UIO:
+ case RTE_INTR_HANDLE_UIO_INTX:
+ bytes_read = sizeof(buf.uio_intr_count);
+ break;
+ case RTE_INTR_HANDLE_ALARM:
+ bytes_read = sizeof(buf.timerfd_num);
+ break;
+#ifdef VFIO_PRESENT
+ case RTE_INTR_HANDLE_VFIO_MSIX:
+ case RTE_INTR_HANDLE_VFIO_MSI:
+ case RTE_INTR_HANDLE_VFIO_LEGACY:
+ bytes_read = sizeof(buf.vfio_intr_count);
+ break;
+#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
+ case RTE_INTR_HANDLE_VFIO_REQ:
+ bytes_read = 0;
+ call = true;
+ break;
+#endif
+#endif
+ case RTE_INTR_HANDLE_VDEV:
+ case RTE_INTR_HANDLE_EXT:
+ bytes_read = 0;
+ call = true;
+ break;
+ case RTE_INTR_HANDLE_DEV_EVENT:
+ bytes_read = 0;
+ call = true;
+ break;
+ default:
+ bytes_read = 1;
+ break;
+ }
+
+ if (bytes_read > 0) {
+ /**
+ * read out to clear the ready-to-be-read flag
+ * for epoll_wait.
+ */
+ bytes_read = read(events[n].data.fd, &buf, bytes_read);
+ if (bytes_read < 0) {
+ if (errno == EINTR || errno == EWOULDBLOCK)
+ continue;
+
+ RTE_LOG(ERR, EAL, "Error reading from file "
+ "descriptor %d: %s\n",
+ events[n].data.fd,
+ strerror(errno));
+ /*
+ * The device is unplugged or buggy, remove
+ * it as an interrupt source and return to
+ * force the wait list to be rebuilt.
+ */
+ rte_spinlock_lock(&intr_lock);
+ TAILQ_REMOVE(&intr_sources, src, next);
+ rte_spinlock_unlock(&intr_lock);
+
+ for (cb = TAILQ_FIRST(&src->callbacks); cb;
+ cb = next) {
+ next = TAILQ_NEXT(cb, next);
+ TAILQ_REMOVE(&src->callbacks, cb, next);
+ free(cb);
+ }
+ free(src);
+ return -1;
+ } else if (bytes_read == 0)
+ RTE_LOG(ERR, EAL, "Read nothing from file "
+ "descriptor %d\n", events[n].data.fd);
+ else
+ call = true;
+ }
+
+ /* grab a lock, again to call callbacks and update status. */
+ rte_spinlock_lock(&intr_lock);
+
+ if (call) {
+
+ /* Finally, call all callbacks. */
+ TAILQ_FOREACH(cb, &src->callbacks, next) {
+
+ /* make a copy and unlock. */
+ active_cb = *cb;
+ rte_spinlock_unlock(&intr_lock);
+
+ /* call the actual callback */
+ active_cb.cb_fn(active_cb.cb_arg);
+
+ /* get the lock back. */
+ rte_spinlock_lock(&intr_lock);
+ }
+ }
+ /* we are done with this interrupt source, release it. */
+ src->active = 0;
+
+ rv = 0;
+
+ /* check if any callback are supposed to be removed */
+ for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
+ next = TAILQ_NEXT(cb, next);
+ if (cb->pending_delete) {
+ TAILQ_REMOVE(&src->callbacks, cb, next);
+ if (cb->ucb_fn)
+ cb->ucb_fn(&src->intr_handle, cb->cb_arg);
+ free(cb);
+ rv++;
+ }
+ }
+
+ /* all callbacks for that source are removed. */
+ if (TAILQ_EMPTY(&src->callbacks)) {
+ TAILQ_REMOVE(&intr_sources, src, next);
+ free(src);
+ }
+
+ /* if any callbacks were removed, notify the pipe fd waited
+ * on by epoll_wait so it rebuilds the wait list */
+ if (rv > 0 && write(intr_pipe.writefd, "1", 1) < 0) {
+ rte_spinlock_unlock(&intr_lock);
+ return -EPIPE;
+ }
+
+ rte_spinlock_unlock(&intr_lock);
+ }
+
+ return 0;
+}
+
+/**
+ * It handles all the interrupts.
+ *
+ * @param pfd
+ * epoll file descriptor.
+ * @param totalfds
+ * The number of file descriptors added in epoll.
+ *
+ * @return
+ * void
+ */
+static void
+eal_intr_handle_interrupts(int pfd, unsigned totalfds)
+{
+ struct epoll_event events[totalfds];
+ int nfds = 0;
+
+ for(;;) {
+ nfds = epoll_wait(pfd, events, totalfds,
+ EAL_INTR_EPOLL_WAIT_FOREVER);
+ /* epoll_wait fail */
+ if (nfds < 0) {
+ if (errno == EINTR)
+ continue;
+ RTE_LOG(ERR, EAL,
+ "epoll_wait returns with fail\n");
+ return;
+ }
+ /* epoll_wait timed out; cannot happen with an infinite timeout */
+ else if (nfds == 0)
+ continue;
+ /* epoll_wait has at least one fd ready to read */
+ if (eal_intr_process_interrupts(events, nfds) < 0)
+ return;
+ }
+}
+
+/**
+ * It builds/rebuilds up the epoll file descriptor with all the
+ * file descriptors being waited on. Then handles the interrupts.
+ *
+ * @param arg
+ * pointer. (unused)
+ *
+ * @return
+ * never return;
+ */
+static __attribute__((noreturn)) void *
+eal_intr_thread_main(__rte_unused void *arg)
+{
+ /* host thread, never break out */
+ for (;;) {
+ /* build up the epoll fd with all descriptors we are to
+ * wait on then pass it to the handle_interrupts function
+ */
+ static struct epoll_event pipe_event = {
+ .events = EPOLLIN | EPOLLPRI,
+ };
+ struct rte_intr_source *src;
+ unsigned numfds = 0;
+
+ /* create epoll fd */
+ int pfd = epoll_create(1);
+ if (pfd < 0)
+ rte_panic("Cannot create epoll instance\n");
+
+ pipe_event.data.fd = intr_pipe.readfd;
+ /**
+ * add pipe fd into wait list, this pipe is used to
+ * rebuild the wait list.
+ */
+ if (epoll_ctl(pfd, EPOLL_CTL_ADD, intr_pipe.readfd,
+ &pipe_event) < 0) {
+ rte_panic("Error adding fd to %d epoll_ctl, %s\n",
+ intr_pipe.readfd, strerror(errno));
+ }
+ numfds++;
+
+ rte_spinlock_lock(&intr_lock);
+
+ TAILQ_FOREACH(src, &intr_sources, next) {
+ struct epoll_event ev;
+
+ if (src->callbacks.tqh_first == NULL)
+ continue; /* skip those with no callbacks */
+ memset(&ev, 0, sizeof(ev));
+ ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
+ ev.data.fd = src->intr_handle.fd;
+
+ /**
+ * add all the uio device file descriptor
+ * into wait list.
+ */
+ if (epoll_ctl(pfd, EPOLL_CTL_ADD,
+ src->intr_handle.fd, &ev) < 0){
+ rte_panic("Error adding fd %d epoll_ctl, %s\n",
+ src->intr_handle.fd, strerror(errno));
+ }
+ else
+ numfds++;
+ }
+ rte_spinlock_unlock(&intr_lock);
+ /* serve the interrupt */
+ eal_intr_handle_interrupts(pfd, numfds);
+
+ /**
+ * when we return, we need to rebuild the
+ * list of fds to monitor.
+ */
+ close(pfd);
+ }
+}
+
+int
+rte_eal_intr_init(void)
+{
+ int ret = 0;
+
+ /* init the global interrupt source head */
+ TAILQ_INIT(&intr_sources);
+
+ /**
+ * create a pipe that epoll waits on; writing to it notifies
+ * the interrupt thread to rebuild its epoll wait list.
+ */
+ if (pipe(intr_pipe.pipefd) < 0) {
+ rte_errno = errno;
+ return -1;
+ }
+
+ /* create the host thread to wait/handle the interrupt */
+ ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL,
+ eal_intr_thread_main, NULL);
+ if (ret != 0) {
+ rte_errno = -ret;
+ RTE_LOG(ERR, EAL,
+ "Failed to create thread for interrupt handling\n");
+ }
+
+ return ret;
+}
+
+static void
+eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle)
+{
+ union rte_intr_read_buffer buf;
+ int bytes_read = 0;
+ int nbytes;
+
+ switch (intr_handle->type) {
+ case RTE_INTR_HANDLE_UIO:
+ case RTE_INTR_HANDLE_UIO_INTX:
+ bytes_read = sizeof(buf.uio_intr_count);
+ break;
+#ifdef VFIO_PRESENT
+ case RTE_INTR_HANDLE_VFIO_MSIX:
+ case RTE_INTR_HANDLE_VFIO_MSI:
+ case RTE_INTR_HANDLE_VFIO_LEGACY:
+ bytes_read = sizeof(buf.vfio_intr_count);
+ break;
+#endif
+ case RTE_INTR_HANDLE_VDEV:
+ bytes_read = intr_handle->efd_counter_size;
+ /* For vdev, number of bytes to read is set by driver */
+ break;
+ case RTE_INTR_HANDLE_EXT:
+ return;
+ default:
+ bytes_read = 1;
+ RTE_LOG(INFO, EAL, "unexpected intr type\n");
+ break;
+ }
+
+ /**
+ * read out to clear the ready-to-be-read flag
+ * for epoll_wait.
+ */
+ if (bytes_read == 0)
+ return;
+ do {
+ nbytes = read(fd, &buf, bytes_read);
+ if (nbytes < 0) {
+ if (errno == EINTR || errno == EWOULDBLOCK ||
+ errno == EAGAIN)
+ continue;
+ RTE_LOG(ERR, EAL,
+ "Error reading from fd %d: %s\n",
+ fd, strerror(errno));
+ } else if (nbytes == 0)
+ RTE_LOG(ERR, EAL, "Read nothing from fd %d\n", fd);
+ return;
+ } while (1);
+}
+
+static int
+eal_epoll_process_event(struct epoll_event *evs, unsigned int n,
+ struct rte_epoll_event *events)
+{
+ unsigned int i, count = 0;
+ struct rte_epoll_event *rev;
+
+ for (i = 0; i < n; i++) {
+ rev = evs[i].data.ptr;
+ if (!rev || !rte_atomic32_cmpset(&rev->status, RTE_EPOLL_VALID,
+ RTE_EPOLL_EXEC))
+ continue;
+
+ events[count].status = RTE_EPOLL_VALID;
+ events[count].fd = rev->fd;
+ events[count].epfd = rev->epfd;
+ events[count].epdata.event = rev->epdata.event;
+ events[count].epdata.data = rev->epdata.data;
+ if (rev->epdata.cb_fun)
+ rev->epdata.cb_fun(rev->fd,
+ rev->epdata.cb_arg);
+
+ rte_compiler_barrier();
+ rev->status = RTE_EPOLL_VALID;
+ count++;
+ }
+ return count;
+}
+
+static inline int
+eal_init_tls_epfd(void)
+{
+ int pfd = epoll_create(255);
+
+ if (pfd < 0) {
+ RTE_LOG(ERR, EAL,
+ "Cannot create epoll instance\n");
+ return -1;
+ }
+ return pfd;
+}
+
+int
+rte_intr_tls_epfd(void)
+{
+ if (RTE_PER_LCORE(_epfd) == -1)
+ RTE_PER_LCORE(_epfd) = eal_init_tls_epfd();
+
+ return RTE_PER_LCORE(_epfd);
+}
+
+int
+rte_epoll_wait(int epfd, struct rte_epoll_event *events,
+ int maxevents, int timeout)
+{
+ struct epoll_event evs[maxevents];
+ int rc;
+
+ if (!events) {
+ RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
+ return -1;
+ }
+
+ /* using per thread epoll fd */
+ if (epfd == RTE_EPOLL_PER_THREAD)
+ epfd = rte_intr_tls_epfd();
+
+ while (1) {
+ rc = epoll_wait(epfd, evs, maxevents, timeout);
+ if (likely(rc > 0)) {
+ /* epoll_wait has at least one fd ready to read */
+ rc = eal_epoll_process_event(evs, rc, events);
+ break;
+ } else if (rc < 0) {
+ if (errno == EINTR)
+ continue;
+			/* epoll_wait failed */
+			RTE_LOG(ERR, EAL, "epoll_wait failed: %s\n",
+ strerror(errno));
+ rc = -1;
+ break;
+ } else {
+ /* rc == 0, epoll_wait timed out */
+ break;
+ }
+ }
+
+ return rc;
+}
+
+static inline void
+eal_epoll_data_safe_free(struct rte_epoll_event *ev)
+{
+ while (!rte_atomic32_cmpset(&ev->status, RTE_EPOLL_VALID,
+ RTE_EPOLL_INVALID))
+ while (ev->status != RTE_EPOLL_VALID)
+ rte_pause();
+ memset(&ev->epdata, 0, sizeof(ev->epdata));
+ ev->fd = -1;
+ ev->epfd = -1;
+}
+
+int
+rte_epoll_ctl(int epfd, int op, int fd,
+ struct rte_epoll_event *event)
+{
+ struct epoll_event ev;
+
+ if (!event) {
+ RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
+ return -1;
+ }
+
+ /* using per thread epoll fd */
+ if (epfd == RTE_EPOLL_PER_THREAD)
+ epfd = rte_intr_tls_epfd();
+
+ if (op == EPOLL_CTL_ADD) {
+ event->status = RTE_EPOLL_VALID;
+ event->fd = fd; /* ignore fd in event */
+ event->epfd = epfd;
+ ev.data.ptr = (void *)event;
+ }
+
+ ev.events = event->epdata.event;
+ if (epoll_ctl(epfd, op, fd, &ev) < 0) {
+		RTE_LOG(ERR, EAL, "epoll_ctl() op %d fd %d failed: %s\n",
+ op, fd, strerror(errno));
+ if (op == EPOLL_CTL_ADD)
+			/* roll back status when CTL_ADD fails */
+ event->status = RTE_EPOLL_INVALID;
+ return -1;
+ }
+
+ if (op == EPOLL_CTL_DEL && event->status != RTE_EPOLL_INVALID)
+ eal_epoll_data_safe_free(event);
+
+ return 0;
+}
+
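+/*
+ * Usage sketch for the two functions above (illustrative only; 'efd' is
+ * assumed to be some file descriptor owned by the caller, e.g. an eventfd):
+ *
+ *	struct rte_epoll_event ev = { .epdata = { .event = EPOLLIN } };
+ *	struct rte_epoll_event out[8];
+ *	int n;
+ *
+ *	rte_epoll_ctl(RTE_EPOLL_PER_THREAD, EPOLL_CTL_ADD, efd, &ev);
+ *	n = rte_epoll_wait(RTE_EPOLL_PER_THREAD, out, 8, -1);
+ *
+ * rte_epoll_wait() copies each ready event into 'out' and invokes the
+ * optional epdata.cb_fun callback under the VALID -> EXEC status guard.
+ */
+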
+int
+rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
+ int op, unsigned int vec, void *data)
+{
+ struct rte_epoll_event *rev;
+ struct rte_epoll_data *epdata;
+ int epfd_op;
+ unsigned int efd_idx;
+ int rc = 0;
+
+ efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ?
+ (vec - RTE_INTR_VEC_RXTX_OFFSET) : vec;
+
+ if (!intr_handle || intr_handle->nb_efd == 0 ||
+ efd_idx >= intr_handle->nb_efd) {
+ RTE_LOG(ERR, EAL, "Wrong intr vector number.\n");
+ return -EPERM;
+ }
+
+ switch (op) {
+ case RTE_INTR_EVENT_ADD:
+ epfd_op = EPOLL_CTL_ADD;
+ rev = &intr_handle->elist[efd_idx];
+ if (rev->status != RTE_EPOLL_INVALID) {
+			RTE_LOG(INFO, EAL, "Event has already been added.\n");
+ return -EEXIST;
+ }
+
+ /* attach to intr vector fd */
+ epdata = &rev->epdata;
+ epdata->event = EPOLLIN | EPOLLPRI | EPOLLET;
+ epdata->data = data;
+ epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr;
+ epdata->cb_arg = (void *)intr_handle;
+ rc = rte_epoll_ctl(epfd, epfd_op,
+ intr_handle->efds[efd_idx], rev);
+ if (!rc)
+ RTE_LOG(DEBUG, EAL,
+ "efd %d associated with vec %d added on epfd %d"
+ "\n", rev->fd, vec, epfd);
+ else
+ rc = -EPERM;
+ break;
+ case RTE_INTR_EVENT_DEL:
+ epfd_op = EPOLL_CTL_DEL;
+ rev = &intr_handle->elist[efd_idx];
+ if (rev->status == RTE_EPOLL_INVALID) {
+ RTE_LOG(INFO, EAL, "Event does not exist.\n");
+ return -EPERM;
+ }
+
+ rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev);
+ if (rc)
+ rc = -EPERM;
+ break;
+ default:
+ RTE_LOG(ERR, EAL, "event op type mismatch\n");
+ rc = -EPERM;
+ }
+
+ return rc;
+}
+
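+/*
+ * Illustrative use of rte_intr_rx_ctl() (a sketch, not part of this file):
+ * an application thread polling Rx queue 'q' of a port would typically
+ * register the queue's event fd with its per-thread epoll instance via:
+ *
+ *	rte_intr_rx_ctl(intr_handle, RTE_EPOLL_PER_THREAD,
+ *			RTE_INTR_EVENT_ADD,
+ *			RTE_INTR_VEC_RXTX_OFFSET + q, data);
+ *
+ * and later wake up on interrupt arrival through rte_epoll_wait().
+ */
+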
+void
+rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle)
+{
+ uint32_t i;
+ struct rte_epoll_event *rev;
+
+ for (i = 0; i < intr_handle->nb_efd; i++) {
+ rev = &intr_handle->elist[i];
+ if (rev->status == RTE_EPOLL_INVALID)
+ continue;
+ if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) {
+		/* force free if the entry is still valid */
+ eal_epoll_data_safe_free(rev);
+ rev->status = RTE_EPOLL_INVALID;
+ }
+ }
+}
+
+int
+rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd)
+{
+ uint32_t i;
+ int fd;
+ uint32_t n = RTE_MIN(nb_efd, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
+
+ assert(nb_efd != 0);
+
+ if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX) {
+ for (i = 0; i < n; i++) {
+ fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+ if (fd < 0) {
+				RTE_LOG(ERR, EAL,
+					"cannot set up eventfd, error %i (%s)\n",
+ errno, strerror(errno));
+ return -errno;
+ }
+ intr_handle->efds[i] = fd;
+ }
+ intr_handle->nb_efd = n;
+ intr_handle->max_intr = NB_OTHER_INTR + n;
+ } else if (intr_handle->type == RTE_INTR_HANDLE_VDEV) {
+		/* only check here; initialization is done in the vdev driver. */
+ if (intr_handle->efd_counter_size >
+ sizeof(union rte_intr_read_buffer)) {
+			RTE_LOG(ERR, EAL, "the efd_counter_size is oversized\n");
+ return -EINVAL;
+ }
+ } else {
+ intr_handle->efds[0] = intr_handle->fd;
+ intr_handle->nb_efd = RTE_MIN(nb_efd, 1U);
+ intr_handle->max_intr = NB_OTHER_INTR;
+ }
+
+ return 0;
+}
+
+void
+rte_intr_efd_disable(struct rte_intr_handle *intr_handle)
+{
+ uint32_t i;
+
+ rte_intr_free_epoll_fd(intr_handle);
+ if (intr_handle->max_intr > intr_handle->nb_efd) {
+ for (i = 0; i < intr_handle->nb_efd; i++)
+ close(intr_handle->efds[i]);
+ }
+ intr_handle->nb_efd = 0;
+ intr_handle->max_intr = 0;
+}
+
+int
+rte_intr_dp_is_en(struct rte_intr_handle *intr_handle)
+{
+	return !!intr_handle->nb_efd;
+}
+
+int
+rte_intr_allow_others(struct rte_intr_handle *intr_handle)
+{
+ if (!rte_intr_dp_is_en(intr_handle))
+ return 1;
+ else
+ return !!(intr_handle->max_intr - intr_handle->nb_efd);
+}
+
+int
+rte_intr_cap_multiple(struct rte_intr_handle *intr_handle)
+{
+ if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX)
+ return 1;
+
+ if (intr_handle->type == RTE_INTR_HANDLE_VDEV)
+ return 1;
+
+ return 0;
+}
+
+int rte_thread_is_intr(void)
+{
+ return pthread_equal(intr_thread, pthread_self());
+}
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#include <unistd.h>
+#include <limits.h>
+#include <string.h>
+#include <dirent.h>
+
+#include <rte_log.h>
+#include <rte_eal.h>
+#include <rte_lcore.h>
+#include <rte_common.h>
+#include <rte_string_fns.h>
+#include <rte_debug.h>
+
+#include "eal_private.h"
+#include "eal_filesystem.h"
+#include "eal_thread.h"
+
+#define SYS_CPU_DIR "/sys/devices/system/cpu/cpu%u"
+#define CORE_ID_FILE "topology/core_id"
+#define NUMA_NODE_PATH "/sys/devices/system/node"
+
+/* Check if a CPU is present, as indicated by its sysfs topology information */
+int
+eal_cpu_detected(unsigned lcore_id)
+{
+ char path[PATH_MAX];
+ int len = snprintf(path, sizeof(path), SYS_CPU_DIR
+ "/"CORE_ID_FILE, lcore_id);
+ if (len <= 0 || (unsigned)len >= sizeof(path))
+ return 0;
+ if (access(path, F_OK) != 0)
+ return 0;
+
+ return 1;
+}
+
+/*
+ * Get CPU socket id (NUMA node) for a logical core.
+ *
+ * This searches each nodeX directory in /sys for a symlink for the given
+ * lcore_id and returns the NUMA node where the lcore is found. If the lcore
+ * is not found on any NUMA node, zero is returned.
+ */
+unsigned
+eal_cpu_socket_id(unsigned lcore_id)
+{
+ unsigned socket;
+
+ for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) {
+ char path[PATH_MAX];
+
+ snprintf(path, sizeof(path), "%s/node%u/cpu%u", NUMA_NODE_PATH,
+ socket, lcore_id);
+ if (access(path, F_OK) == 0)
+ return socket;
+ }
+ return 0;
+}
+
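+/*
+ * For example (hypothetical sysfs layout): for lcore 3 sitting on NUMA node
+ * 1, the loop above probes /sys/devices/system/node/node0/cpu3 (absent),
+ * then /sys/devices/system/node/node1/cpu3 (present), and returns 1.
+ */
+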
+/* Get the cpu core id value from the /sys/.../cpuX core_id value */
+unsigned
+eal_cpu_core_id(unsigned lcore_id)
+{
+ char path[PATH_MAX];
+ unsigned long id;
+
+	int len = snprintf(path, sizeof(path), SYS_CPU_DIR
+			"/%s", lcore_id, CORE_ID_FILE);
+ if (len <= 0 || (unsigned)len >= sizeof(path))
+ goto err;
+ if (eal_parse_sysfs_value(path, &id) != 0)
+ goto err;
+ return (unsigned)id;
+
+err:
+ RTE_LOG(ERR, EAL, "Error reading core id value from %s "
+ "for lcore %u - assuming core 0\n", SYS_CPU_DIR, lcore_id);
+ return 0;
+}
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#include <string.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include <syslog.h>
+#include <sys/queue.h>
+
+#include <rte_memory.h>
+#include <rte_eal.h>
+#include <rte_launch.h>
+#include <rte_per_lcore.h>
+#include <rte_lcore.h>
+#include <rte_spinlock.h>
+#include <rte_log.h>
+
+#include "eal_private.h"
+
+/*
+ * default log function
+ */
+static ssize_t
+console_log_write(__attribute__((unused)) void *c, const char *buf, size_t size)
+{
+ ssize_t ret;
+
+	/* write to stdout */
+ ret = fwrite(buf, 1, size, stdout);
+ fflush(stdout);
+
+	/* syslog levels run from 0 to 7, RTE log levels from 1 to 8,
+	 * so subtract 1 to convert
+	 */
+ syslog(rte_log_cur_msg_loglevel() - 1, "%.*s", (int)size, buf);
+
+ return ret;
+}
+
+static cookie_io_functions_t console_log_func = {
+ .write = console_log_write,
+};
+
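+/*
+ * With this cookie in place, the stream returned by fopencookie() below
+ * routes every write through console_log_write() above, so each message
+ * sent to the default log stream is printed to stdout and mirrored to
+ * syslog at the mapped priority.
+ */
+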
+/*
+ * set the log output to the default function; called during the EAL init
+ * process, once memzones are available.
+ */
+int
+rte_eal_log_init(const char *id, int facility)
+{
+ FILE *log_stream;
+
+ log_stream = fopencookie(NULL, "w+", console_log_func);
+ if (log_stream == NULL)
+ return -1;
+
+ openlog(id, LOG_NDELAY | LOG_PID, facility);
+
+ eal_log_set_default(log_stream);
+
+ return 0;
+}
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation
+ */
+
+#include <errno.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/queue.h>
+#include <sys/file.h>
+#include <unistd.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <signal.h>
+#include <setjmp.h>
+#ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */
+#include <linux/memfd.h>
+#define MEMFD_SUPPORTED
+#endif
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+#include <numa.h>
+#include <numaif.h>
+#endif
+#include <linux/falloc.h>
+#include <linux/mman.h> /* for hugetlb-related mmap flags */
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_eal.h>
+#include <rte_errno.h>
+#include <rte_memory.h>
+#include <rte_spinlock.h>
+
+#include "eal_filesystem.h"
+#include "eal_internal_cfg.h"
+#include "eal_memalloc.h"
+#include "eal_memcfg.h"
+#include "eal_private.h"
+
+const int anonymous_hugepages_supported =
+#ifdef MAP_HUGE_SHIFT
+ 1;
+#define RTE_MAP_HUGE_SHIFT MAP_HUGE_SHIFT
+#else
+ 0;
+#define RTE_MAP_HUGE_SHIFT 26
+#endif
+
+/*
+ * we've already checked memfd support at compile-time, but we also need to
+ * check if we can create hugepage files with memfd.
+ *
+ * also, this is not a constant, because while we may be *compiled* with memfd
+ * hugetlbfs support, we might not be *running* on a system that supports memfd
+ * and/or memfd with hugetlbfs, so we need to be able to adjust this flag at
+ * runtime, and fall back to anonymous memory.
+ */
+static int memfd_create_supported =
+#ifdef MFD_HUGETLB
+ 1;
+#define RTE_MFD_HUGETLB MFD_HUGETLB
+#else
+ 0;
+#define RTE_MFD_HUGETLB 4U
+#endif
+
+/*
+ * not all kernel versions support fallocate on hugetlbfs, so fall back to
+ * ftruncate and disallow deallocation if fallocate is not supported.
+ */
+static int fallocate_supported = -1; /* unknown */
+
+/*
+ * we have two modes - single file segments, and file-per-page mode.
+ *
+ * for single-file segments, we use memseg_list_fd to store the segment fd,
+ * while the fds[] will not be allocated, and len will be set to 0.
+ *
+ * for file-per-page mode, each page will have its own fd, so 'memseg_list_fd'
+ * will be invalid (set to -1), and we'll use 'fds' to keep track of page fd's.
+ *
+ * we cannot know how many pages a system will have in advance, but we do know
+ * that they come in lists, and we know lengths of these lists. so, simply store
+ * a malloc'd array of fd's indexed by list and segment index.
+ *
+ * they will be initialized at startup, and filled as we allocate/deallocate
+ * segments.
+ */
+static struct {
+ int *fds; /**< dynamically allocated array of segment lock fd's */
+ int memseg_list_fd; /**< memseg list fd */
+ int len; /**< total length of the array */
+	int count; /**< number of entries used in the array */
+} fd_list[RTE_MAX_MEMSEG_LISTS];
+
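+/*
+ * Lookup sketch for the structure above: in file-per-page mode, the fd for
+ * segment 'seg' of list 'list' is fd_list[list].fds[seg]; in single-file
+ * segments mode it is fd_list[list].memseg_list_fd, with the page located
+ * at offset seg * page_sz within that one file.
+ */
+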
+/** local copy of a memory map, used to synchronize memory hotplug in MP */
+static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS];
+
+static sigjmp_buf huge_jmpenv;
+
+static void __rte_unused huge_sigbus_handler(int signo __rte_unused)
+{
+ siglongjmp(huge_jmpenv, 1);
+}
+
+/* Put sigsetjmp into a wrapper function to avoid compiler warnings about
+ * clobbered variables: any non-volatile, non-static local variable in the
+ * stack frame calling sigsetjmp might be clobbered by a call to longjmp.
+ */
+static int __rte_unused huge_wrap_sigsetjmp(void)
+{
+ return sigsetjmp(huge_jmpenv, 1);
+}
+
+static struct sigaction huge_action_old;
+static int huge_need_recover;
+
+static void __rte_unused
+huge_register_sigbus(void)
+{
+ sigset_t mask;
+ struct sigaction action;
+
+ sigemptyset(&mask);
+ sigaddset(&mask, SIGBUS);
+ action.sa_flags = 0;
+ action.sa_mask = mask;
+ action.sa_handler = huge_sigbus_handler;
+
+ huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
+}
+
+static void __rte_unused
+huge_recover_sigbus(void)
+{
+ if (huge_need_recover) {
+ sigaction(SIGBUS, &huge_action_old, NULL);
+ huge_need_recover = 0;
+ }
+}
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+static bool
+check_numa(void)
+{
+ bool ret = true;
+ /* Check if kernel supports NUMA. */
+ if (numa_available() != 0) {
+ RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n");
+ ret = false;
+ }
+ return ret;
+}
+
+static void
+prepare_numa(int *oldpolicy, struct bitmask *oldmask, int socket_id)
+{
+ RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
+ if (get_mempolicy(oldpolicy, oldmask->maskp,
+ oldmask->size + 1, 0, 0) < 0) {
+ RTE_LOG(ERR, EAL,
+ "Failed to get current mempolicy: %s. "
+ "Assuming MPOL_DEFAULT.\n", strerror(errno));
+ *oldpolicy = MPOL_DEFAULT;
+ }
+ RTE_LOG(DEBUG, EAL,
+ "Setting policy MPOL_PREFERRED for socket %d\n",
+ socket_id);
+ numa_set_preferred(socket_id);
+}
+
+static void
+restore_numa(int *oldpolicy, struct bitmask *oldmask)
+{
+ RTE_LOG(DEBUG, EAL,
+ "Restoring previous memory policy: %d\n", *oldpolicy);
+ if (*oldpolicy == MPOL_DEFAULT) {
+ numa_set_localalloc();
+ } else if (set_mempolicy(*oldpolicy, oldmask->maskp,
+ oldmask->size + 1) < 0) {
+ RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n",
+ strerror(errno));
+ numa_set_localalloc();
+ }
+ numa_free_cpumask(oldmask);
+}
+#endif
+
+/*
+ * uses fstat to report the size of a file on disk
+ */
+static off_t
+get_file_size(int fd)
+{
+ struct stat st;
+ if (fstat(fd, &st) < 0)
+ return 0;
+ return st.st_size;
+}
+
+static int
+pagesz_flags(uint64_t page_sz)
+{
+	/* as per the mmap() manpage, hugepage size flags are encoded as the
+	 * log2 of the page size, shifted left by MAP_HUGE_SHIFT
+ */
+ int log2 = rte_log2_u64(page_sz);
+ return log2 << RTE_MAP_HUGE_SHIFT;
+}
+
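+/*
+ * For example: for a 2 MiB hugepage, rte_log2_u64(2 << 20) is 21, so
+ * pagesz_flags() yields (21 << MAP_HUGE_SHIFT), which is exactly the
+ * kernel's MAP_HUGE_2MB; for a 1 GiB page it yields (30 << MAP_HUGE_SHIFT),
+ * i.e. MAP_HUGE_1GB.
+ */
+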
+/* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */
+static int lock(int fd, int type)
+{
+ int ret;
+
+ /* flock may be interrupted */
+ do {
+ ret = flock(fd, type | LOCK_NB);
+ } while (ret && errno == EINTR);
+
+ if (ret && errno == EWOULDBLOCK) {
+ /* couldn't lock */
+ return 0;
+ } else if (ret) {
+ RTE_LOG(ERR, EAL, "%s(): error calling flock(): %s\n",
+ __func__, strerror(errno));
+ return -1;
+ }
+ /* lock was successful */
+ return 1;
+}
+
+static int
+get_seg_memfd(struct hugepage_info *hi __rte_unused,
+ unsigned int list_idx __rte_unused,
+ unsigned int seg_idx __rte_unused)
+{
+#ifdef MEMFD_SUPPORTED
+ int fd;
+ char segname[250]; /* as per manpage, limit is 249 bytes plus null */
+
+ int flags = RTE_MFD_HUGETLB | pagesz_flags(hi->hugepage_sz);
+
+ if (internal_config.single_file_segments) {
+ fd = fd_list[list_idx].memseg_list_fd;
+
+ if (fd < 0) {
+ snprintf(segname, sizeof(segname), "seg_%i", list_idx);
+ fd = memfd_create(segname, flags);
+ if (fd < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n",
+ __func__, strerror(errno));
+ return -1;
+ }
+ fd_list[list_idx].memseg_list_fd = fd;
+ }
+ } else {
+ fd = fd_list[list_idx].fds[seg_idx];
+
+ if (fd < 0) {
+ snprintf(segname, sizeof(segname), "seg_%i-%i",
+ list_idx, seg_idx);
+ fd = memfd_create(segname, flags);
+ if (fd < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n",
+ __func__, strerror(errno));
+ return -1;
+ }
+ fd_list[list_idx].fds[seg_idx] = fd;
+ }
+ }
+ return fd;
+#endif
+ return -1;
+}
+
+static int
+get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
+ unsigned int list_idx, unsigned int seg_idx)
+{
+ int fd;
+
+ /* for in-memory mode, we only make it here when we're sure we support
+ * memfd, and this is a special case.
+ */
+ if (internal_config.in_memory)
+ return get_seg_memfd(hi, list_idx, seg_idx);
+
+ if (internal_config.single_file_segments) {
+ /* create a hugepage file path */
+ eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx);
+
+ fd = fd_list[list_idx].memseg_list_fd;
+
+ if (fd < 0) {
+ fd = open(path, O_CREAT | O_RDWR, 0600);
+ if (fd < 0) {
+ RTE_LOG(ERR, EAL, "%s(): open failed: %s\n",
+ __func__, strerror(errno));
+ return -1;
+ }
+ /* take out a read lock and keep it indefinitely */
+ if (lock(fd, LOCK_SH) < 0) {
+ RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n",
+ __func__, strerror(errno));
+ close(fd);
+ return -1;
+ }
+ fd_list[list_idx].memseg_list_fd = fd;
+ }
+ } else {
+ /* create a hugepage file path */
+ eal_get_hugefile_path(path, buflen, hi->hugedir,
+ list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
+
+ fd = fd_list[list_idx].fds[seg_idx];
+
+ if (fd < 0) {
+ fd = open(path, O_CREAT | O_RDWR, 0600);
+ if (fd < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n",
+ __func__, strerror(errno));
+ return -1;
+ }
+ /* take out a read lock */
+ if (lock(fd, LOCK_SH) < 0) {
+ RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n",
+ __func__, strerror(errno));
+ close(fd);
+ return -1;
+ }
+ fd_list[list_idx].fds[seg_idx] = fd;
+ }
+ }
+ return fd;
+}
+
+static int
+resize_hugefile_in_memory(int fd, uint64_t fa_offset,
+ uint64_t page_sz, bool grow)
+{
+ int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
+ FALLOC_FL_KEEP_SIZE;
+ int ret;
+
+ /* grow or shrink the file */
+ ret = fallocate(fd, flags, fa_offset, page_sz);
+
+ if (ret < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
+ __func__,
+ strerror(errno));
+ return -1;
+ }
+ return 0;
+}
+
+static int
+resize_hugefile_in_filesystem(int fd, uint64_t fa_offset, uint64_t page_sz,
+ bool grow)
+{
+ bool again = false;
+
+ do {
+ if (fallocate_supported == 0) {
+ /* we cannot deallocate memory if fallocate() is not
+ * supported, and hugepage file is already locked at
+ * creation, so no further synchronization needed.
+ */
+
+ if (!grow) {
+ RTE_LOG(DEBUG, EAL, "%s(): fallocate not supported, not freeing page back to the system\n",
+ __func__);
+ return -1;
+ }
+ uint64_t new_size = fa_offset + page_sz;
+ uint64_t cur_size = get_file_size(fd);
+
+ /* fallocate isn't supported, fall back to ftruncate */
+ if (new_size > cur_size &&
+ ftruncate(fd, new_size) < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
+ __func__, strerror(errno));
+ return -1;
+ }
+ } else {
+ int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
+ FALLOC_FL_KEEP_SIZE;
+ int ret;
+
+ /*
+ * technically, it is perfectly safe for both primary
+ * and secondary to grow and shrink the page files:
+ * growing the file repeatedly has no effect because
+ * a page can only be allocated once, while mmap ensures
+ * that secondaries hold on to the page even after the
+ * page itself is removed from the filesystem.
+ *
+ * however, leaving growing/shrinking to the primary
+ * tends to expose bugs in fdlist page count handling,
+ * so leave this here just in case.
+ */
+ if (rte_eal_process_type() != RTE_PROC_PRIMARY)
+ return 0;
+
+ /* grow or shrink the file */
+ ret = fallocate(fd, flags, fa_offset, page_sz);
+
+ if (ret < 0) {
+ if (fallocate_supported == -1 &&
+ errno == ENOTSUP) {
+ RTE_LOG(ERR, EAL, "%s(): fallocate() not supported, hugepage deallocation will be disabled\n",
+ __func__);
+ again = true;
+ fallocate_supported = 0;
+ } else {
+ RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
+ __func__,
+ strerror(errno));
+ return -1;
+ }
+ } else
+ fallocate_supported = 1;
+ }
+ } while (again);
+
+ return 0;
+}
+
+static void
+close_hugefile(int fd, char *path, int list_idx)
+{
+ /*
+ * primary process must unlink the file, but only when not in in-memory
+ * mode (as in that case there is no file to unlink).
+ */
+ if (!internal_config.in_memory &&
+ rte_eal_process_type() == RTE_PROC_PRIMARY &&
+ unlink(path))
+ RTE_LOG(ERR, EAL, "%s(): unlinking '%s' failed: %s\n",
+ __func__, path, strerror(errno));
+
+ close(fd);
+ fd_list[list_idx].memseg_list_fd = -1;
+}
+
+static int
+resize_hugefile(int fd, uint64_t fa_offset, uint64_t page_sz, bool grow)
+{
+ /* in-memory mode is a special case, because we can be sure that
+ * fallocate() is supported.
+ */
+ if (internal_config.in_memory)
+ return resize_hugefile_in_memory(fd, fa_offset,
+ page_sz, grow);
+
+ return resize_hugefile_in_filesystem(fd, fa_offset, page_sz,
+ grow);
+}
+
+static int
+alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
+ struct hugepage_info *hi, unsigned int list_idx,
+ unsigned int seg_idx)
+{
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+ int cur_socket_id = 0;
+#endif
+ uint64_t map_offset;
+ rte_iova_t iova;
+ void *va;
+ char path[PATH_MAX];
+ int ret = 0;
+ int fd;
+ size_t alloc_sz;
+ int flags;
+ void *new_addr;
+
+ alloc_sz = hi->hugepage_sz;
+
+ /* these are checked at init, but code analyzers don't know that */
+ if (internal_config.in_memory && !anonymous_hugepages_supported) {
+ RTE_LOG(ERR, EAL, "Anonymous hugepages not supported, in-memory mode cannot allocate memory\n");
+ return -1;
+ }
+ if (internal_config.in_memory && !memfd_create_supported &&
+ internal_config.single_file_segments) {
+ RTE_LOG(ERR, EAL, "Single-file segments are not supported without memfd support\n");
+ return -1;
+ }
+
+ /* in-memory without memfd is a special case */
+ int mmap_flags;
+
+ if (internal_config.in_memory && !memfd_create_supported) {
+ const int in_memory_flags = MAP_HUGETLB | MAP_FIXED |
+ MAP_PRIVATE | MAP_ANONYMOUS;
+ int pagesz_flag;
+
+ pagesz_flag = pagesz_flags(alloc_sz);
+ fd = -1;
+ mmap_flags = in_memory_flags | pagesz_flag;
+
+ /* single-file segments codepath will never be active
+ * here because in-memory mode is incompatible with the
+ * fallback path, and it's stopped at EAL initialization
+ * stage.
+ */
+ map_offset = 0;
+ } else {
+ /* takes out a read lock on segment or segment list */
+ fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
+ if (fd < 0) {
+ RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n");
+ return -1;
+ }
+
+ if (internal_config.single_file_segments) {
+ map_offset = seg_idx * alloc_sz;
+ ret = resize_hugefile(fd, map_offset, alloc_sz, true);
+ if (ret < 0)
+ goto resized;
+
+ fd_list[list_idx].count++;
+ } else {
+ map_offset = 0;
+ if (ftruncate(fd, alloc_sz) < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
+ __func__, strerror(errno));
+ goto resized;
+ }
+ if (internal_config.hugepage_unlink &&
+ !internal_config.in_memory) {
+ if (unlink(path)) {
+ RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n",
+ __func__, strerror(errno));
+ goto resized;
+ }
+ }
+ }
+ mmap_flags = MAP_SHARED | MAP_POPULATE | MAP_FIXED;
+ }
+
+ /*
+	 * map the segment and populate the page tables; the kernel fills
+ * this segment with zeros if it's a new page.
+ */
+ va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, mmap_flags, fd,
+ map_offset);
+
+ if (va == MAP_FAILED) {
+ RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__,
+ strerror(errno));
+ /* mmap failed, but the previous region might have been
+ * unmapped anyway. try to remap it
+ */
+ goto unmapped;
+ }
+ if (va != addr) {
+ RTE_LOG(DEBUG, EAL, "%s(): wrong mmap() address\n", __func__);
+ munmap(va, alloc_sz);
+ goto resized;
+ }
+
+	/* On Linux, hugetlb limitations (e.g. those set via
+	 * cgroups) are enforced at fault time instead of at
+	 * mmap(), even with MAP_POPULATE; the kernel sends a
+	 * SIGBUS signal on failure. To avoid being killed, save
+	 * the stack environment here; if SIGBUS is raised, we
+	 * can jump back here.
+	 */
+ if (huge_wrap_sigsetjmp()) {
+ RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more hugepages of size %uMB\n",
+ (unsigned int)(alloc_sz >> 20));
+ goto mapped;
+ }
+
+	/* we need to trigger a write to the page to force a page fault and
+	 * ensure that the page is accessible to us, but we can't overwrite a
+	 * value that is already there, so read the old value and write it back.
+	 * the kernel populates the page with zeroes initially.
+	 */
+ *(volatile int *)addr = *(volatile int *)addr;
+
+ iova = rte_mem_virt2iova(addr);
+ if (iova == RTE_BAD_PHYS_ADDR) {
+ RTE_LOG(DEBUG, EAL, "%s(): can't get IOVA addr\n",
+ __func__);
+ goto mapped;
+ }
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+ ret = get_mempolicy(&cur_socket_id, NULL, 0, addr,
+ MPOL_F_NODE | MPOL_F_ADDR);
+ if (ret < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): get_mempolicy: %s\n",
+ __func__, strerror(errno));
+ goto mapped;
+ } else if (cur_socket_id != socket_id) {
+ RTE_LOG(DEBUG, EAL,
+ "%s(): allocation happened on wrong socket (wanted %d, got %d)\n",
+ __func__, socket_id, cur_socket_id);
+ goto mapped;
+ }
+#else
+ if (rte_socket_count() > 1)
+ RTE_LOG(DEBUG, EAL, "%s(): not checking hugepage NUMA node.\n",
+ __func__);
+#endif
+
+ ms->addr = addr;
+ ms->hugepage_sz = alloc_sz;
+ ms->len = alloc_sz;
+ ms->nchannel = rte_memory_get_nchannel();
+ ms->nrank = rte_memory_get_nrank();
+ ms->iova = iova;
+ ms->socket_id = socket_id;
+
+ return 0;
+
+mapped:
+ munmap(addr, alloc_sz);
+unmapped:
+ flags = MAP_FIXED;
+ new_addr = eal_get_virtual_area(addr, &alloc_sz, alloc_sz, 0, flags);
+ if (new_addr != addr) {
+ if (new_addr != NULL)
+ munmap(new_addr, alloc_sz);
+ /* we're leaving a hole in our virtual address space. if
+ * somebody else maps this hole now, we could accidentally
+		 * overwrite it in the future.
+ */
+ RTE_LOG(CRIT, EAL, "Can't mmap holes in our virtual address space\n");
+ }
+ /* roll back the ref count */
+ if (internal_config.single_file_segments)
+ fd_list[list_idx].count--;
+resized:
+ /* some codepaths will return negative fd, so exit early */
+ if (fd < 0)
+ return -1;
+
+ if (internal_config.single_file_segments) {
+ resize_hugefile(fd, map_offset, alloc_sz, false);
+ /* ignore failure, can't make it any worse */
+
+ /* if refcount is at zero, close the file */
+ if (fd_list[list_idx].count == 0)
+ close_hugefile(fd, path, list_idx);
+ } else {
+ /* only remove file if we can take out a write lock */
+ if (internal_config.hugepage_unlink == 0 &&
+ internal_config.in_memory == 0 &&
+ lock(fd, LOCK_EX) == 1)
+ unlink(path);
+ close(fd);
+ fd_list[list_idx].fds[seg_idx] = -1;
+ }
+ return -1;
+}
+
+static int
+free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
+ unsigned int list_idx, unsigned int seg_idx)
+{
+ uint64_t map_offset;
+ char path[PATH_MAX];
+ int fd, ret = 0;
+ bool exit_early;
+
+ /* erase page data */
+ memset(ms->addr, 0, ms->len);
+
+ if (mmap(ms->addr, ms->len, PROT_READ,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) ==
+ MAP_FAILED) {
+ RTE_LOG(DEBUG, EAL, "couldn't unmap page\n");
+ return -1;
+ }
+
+ exit_early = false;
+
+ /* if we're using anonymous hugepages, nothing to be done */
+ if (internal_config.in_memory && !memfd_create_supported)
+ exit_early = true;
+
+ /* if we've already unlinked the page, nothing needs to be done */
+ if (!internal_config.in_memory && internal_config.hugepage_unlink)
+ exit_early = true;
+
+ if (exit_early) {
+ memset(ms, 0, sizeof(*ms));
+ return 0;
+ }
+
+ /* if we are not in single file segments mode, we're going to unmap the
+ * segment and thus drop the lock on original fd, but hugepage dir is
+ * now locked so we can take out another one without races.
+ */
+ fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
+ if (fd < 0)
+ return -1;
+
+ if (internal_config.single_file_segments) {
+ map_offset = seg_idx * ms->len;
+ if (resize_hugefile(fd, map_offset, ms->len, false))
+ return -1;
+
+ if (--(fd_list[list_idx].count) == 0)
+ close_hugefile(fd, path, list_idx);
+
+ ret = 0;
+ } else {
+ /* if we're able to take out a write lock, we're the last one
+ * holding onto this page.
+ */
+ if (!internal_config.in_memory) {
+ ret = lock(fd, LOCK_EX);
+ if (ret >= 0) {
+ /* no one else is using this page */
+ if (ret == 1)
+ unlink(path);
+ }
+ }
+ /* closing fd will drop the lock */
+ close(fd);
+ fd_list[list_idx].fds[seg_idx] = -1;
+ }
+
+ memset(ms, 0, sizeof(*ms));
+
+ return ret < 0 ? -1 : 0;
+}
+
+struct alloc_walk_param {
+ struct hugepage_info *hi;
+ struct rte_memseg **ms;
+ size_t page_sz;
+ unsigned int segs_allocated;
+ unsigned int n_segs;
+ int socket;
+ bool exact;
+};
+static int
+alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct alloc_walk_param *wa = arg;
+ struct rte_memseg_list *cur_msl;
+ size_t page_sz;
+ int cur_idx, start_idx, j, dir_fd = -1;
+ unsigned int msl_idx, need, i;
+
+ if (msl->page_sz != wa->page_sz)
+ return 0;
+ if (msl->socket_id != wa->socket)
+ return 0;
+
+ page_sz = (size_t)msl->page_sz;
+
+ msl_idx = msl - mcfg->memsegs;
+ cur_msl = &mcfg->memsegs[msl_idx];
+
+ need = wa->n_segs;
+
+ /* try finding space in memseg list */
+ if (wa->exact) {
+ /* if we require exact number of pages in a list, find them */
+ cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0,
+ need);
+ if (cur_idx < 0)
+ return 0;
+ start_idx = cur_idx;
+ } else {
+ int cur_len;
+
+ /* we don't require exact number of pages, so we're going to go
+ * for best-effort allocation. that means finding the biggest
+ * unused block, and going with that.
+ */
+ cur_idx = rte_fbarray_find_biggest_free(&cur_msl->memseg_arr,
+ 0);
+ if (cur_idx < 0)
+ return 0;
+ start_idx = cur_idx;
+ /* adjust the size to possibly be smaller than original
+ * request, but do not allow it to be bigger.
+ */
+ cur_len = rte_fbarray_find_contig_free(&cur_msl->memseg_arr,
+ cur_idx);
+ need = RTE_MIN(need, (unsigned int)cur_len);
+ }
+
+ /* do not allow any page allocations during the time we're allocating,
+ * because file creation and locking operations are not atomic,
+ * and we might be the first or the last ones to use a particular page,
+ * so we need to ensure atomicity of every operation.
+ *
+ * during init, we already hold a write lock, so don't try to take out
+ * another one.
+ */
+ if (wa->hi->lock_descriptor == -1 && !internal_config.in_memory) {
+ dir_fd = open(wa->hi->hugedir, O_RDONLY);
+ if (dir_fd < 0) {
+ RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
+ __func__, wa->hi->hugedir, strerror(errno));
+ return -1;
+ }
+ /* blocking writelock */
+ if (flock(dir_fd, LOCK_EX)) {
+ RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n",
+ __func__, wa->hi->hugedir, strerror(errno));
+ close(dir_fd);
+ return -1;
+ }
+ }
+
+ for (i = 0; i < need; i++, cur_idx++) {
+ struct rte_memseg *cur;
+ void *map_addr;
+
+ cur = rte_fbarray_get(&cur_msl->memseg_arr, cur_idx);
+ map_addr = RTE_PTR_ADD(cur_msl->base_va,
+ cur_idx * page_sz);
+
+ if (alloc_seg(cur, map_addr, wa->socket, wa->hi,
+ msl_idx, cur_idx)) {
+ RTE_LOG(DEBUG, EAL, "attempted to allocate %i segments, but only %i were allocated\n",
+ need, i);
+
+ /* if exact number wasn't requested, stop */
+ if (!wa->exact)
+ goto out;
+
+ /* clean up */
+ for (j = start_idx; j < cur_idx; j++) {
+ struct rte_memseg *tmp;
+ struct rte_fbarray *arr =
+ &cur_msl->memseg_arr;
+
+ tmp = rte_fbarray_get(arr, j);
+ rte_fbarray_set_free(arr, j);
+
+ /* free_seg may attempt to create a file, which
+ * may fail.
+ */
+ if (free_seg(tmp, wa->hi, msl_idx, j))
+ RTE_LOG(DEBUG, EAL, "Cannot free page\n");
+ }
+ /* clear the list */
+ if (wa->ms)
+ memset(wa->ms, 0, sizeof(*wa->ms) * wa->n_segs);
+
+ if (dir_fd >= 0)
+ close(dir_fd);
+ return -1;
+ }
+ if (wa->ms)
+ wa->ms[i] = cur;
+
+ rte_fbarray_set_used(&cur_msl->memseg_arr, cur_idx);
+ }
+out:
+ wa->segs_allocated = i;
+ if (i > 0)
+ cur_msl->version++;
+ if (dir_fd >= 0)
+ close(dir_fd);
+ /* if we didn't allocate any segments, move on to the next list */
+ return i > 0;
+}
+
+struct free_walk_param {
+ struct hugepage_info *hi;
+ struct rte_memseg *ms;
+};
+static int
+free_seg_walk(const struct rte_memseg_list *msl, void *arg)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *found_msl;
+ struct free_walk_param *wa = arg;
+ uintptr_t start_addr, end_addr;
+ int msl_idx, seg_idx, ret, dir_fd = -1;
+
+ start_addr = (uintptr_t) msl->base_va;
+ end_addr = start_addr + msl->len;
+
+ if ((uintptr_t)wa->ms->addr < start_addr ||
+ (uintptr_t)wa->ms->addr >= end_addr)
+ return 0;
+
+ msl_idx = msl - mcfg->memsegs;
+ seg_idx = RTE_PTR_DIFF(wa->ms->addr, start_addr) / msl->page_sz;
+
+ /* msl is const */
+ found_msl = &mcfg->memsegs[msl_idx];
+
+ /* do not allow any page allocations during the time we're freeing,
+ * because file creation and locking operations are not atomic,
+ * and we might be the first or the last ones to use a particular page,
+ * so we need to ensure atomicity of every operation.
+ *
+ * during init, we already hold a write lock, so don't try to take out
+ * another one.
+ */
+ if (wa->hi->lock_descriptor == -1 && !internal_config.in_memory) {
+ dir_fd = open(wa->hi->hugedir, O_RDONLY);
+ if (dir_fd < 0) {
+ RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
+ __func__, wa->hi->hugedir, strerror(errno));
+ return -1;
+ }
+ /* blocking writelock */
+ if (flock(dir_fd, LOCK_EX)) {
+ RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n",
+ __func__, wa->hi->hugedir, strerror(errno));
+ close(dir_fd);
+ return -1;
+ }
+ }
+
+ found_msl->version++;
+
+ rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx);
+
+ ret = free_seg(wa->ms, wa->hi, msl_idx, seg_idx);
+
+ if (dir_fd >= 0)
+ close(dir_fd);
+
+ if (ret < 0)
+ return -1;
+
+ return 1;
+}
+
+int
+eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz,
+ int socket, bool exact)
+{
+ int i, ret = -1;
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+ bool have_numa = false;
+ int oldpolicy;
+ struct bitmask *oldmask;
+#endif
+ struct alloc_walk_param wa;
+ struct hugepage_info *hi = NULL;
+
+ memset(&wa, 0, sizeof(wa));
+
+ /* dynamic allocation not supported in legacy mode */
+ if (internal_config.legacy_mem)
+ return -1;
+
+ for (i = 0; i < (int) RTE_DIM(internal_config.hugepage_info); i++) {
+ if (page_sz ==
+ internal_config.hugepage_info[i].hugepage_sz) {
+ hi = &internal_config.hugepage_info[i];
+ break;
+ }
+ }
+ if (!hi) {
+ RTE_LOG(ERR, EAL, "%s(): can't find relevant hugepage_info entry\n",
+ __func__);
+ return -1;
+ }
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+ if (check_numa()) {
+ oldmask = numa_allocate_nodemask();
+ prepare_numa(&oldpolicy, oldmask, socket);
+ have_numa = true;
+ }
+#endif
+
+ wa.exact = exact;
+ wa.hi = hi;
+ wa.ms = ms;
+ wa.n_segs = n_segs;
+ wa.page_sz = page_sz;
+ wa.socket = socket;
+ wa.segs_allocated = 0;
+
+ /* memalloc is locked, so it's safe to use thread-unsafe version */
+ ret = rte_memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa);
+ if (ret == 0) {
+ RTE_LOG(ERR, EAL, "%s(): couldn't find suitable memseg_list\n",
+ __func__);
+ ret = -1;
+ } else if (ret > 0) {
+ ret = (int)wa.segs_allocated;
+ }
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+ if (have_numa)
+ restore_numa(&oldpolicy, oldmask);
+#endif
+ return ret;
+}
+
+struct rte_memseg *
+eal_memalloc_alloc_seg(size_t page_sz, int socket)
+{
+ struct rte_memseg *ms;
+ if (eal_memalloc_alloc_seg_bulk(&ms, 1, page_sz, socket, true) < 0)
+ return NULL;
+ /* return pointer to newly allocated memseg */
+ return ms;
+}
+
+int
+eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs)
+{
+ int seg, ret = 0;
+
+ /* dynamic free not supported in legacy mode */
+ if (internal_config.legacy_mem)
+ return -1;
+
+ for (seg = 0; seg < n_segs; seg++) {
+ struct rte_memseg *cur = ms[seg];
+ struct hugepage_info *hi = NULL;
+ struct free_walk_param wa;
+ int i, walk_res;
+
+ /* if this page is marked as unfreeable, fail */
+ if (cur->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) {
+ RTE_LOG(DEBUG, EAL, "Page is not allowed to be freed\n");
+ ret = -1;
+ continue;
+ }
+
+ memset(&wa, 0, sizeof(wa));
+
+ for (i = 0; i < (int)RTE_DIM(internal_config.hugepage_info);
+ i++) {
+ hi = &internal_config.hugepage_info[i];
+ if (cur->hugepage_sz == hi->hugepage_sz)
+ break;
+ }
+ if (i == (int)RTE_DIM(internal_config.hugepage_info)) {
+ RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
+ ret = -1;
+ continue;
+ }
+
+ wa.ms = cur;
+ wa.hi = hi;
+
+ /* memalloc is locked, so it's safe to use thread-unsafe version
+ */
+ walk_res = rte_memseg_list_walk_thread_unsafe(free_seg_walk,
+ &wa);
+ if (walk_res == 1)
+ continue;
+ if (walk_res == 0)
+ RTE_LOG(ERR, EAL, "Couldn't find memseg list\n");
+ ret = -1;
+ }
+ return ret;
+}
+
+int
+eal_memalloc_free_seg(struct rte_memseg *ms)
+{
+ /* dynamic free not supported in legacy mode */
+ if (internal_config.legacy_mem)
+ return -1;
+
+ return eal_memalloc_free_seg_bulk(&ms, 1);
+}
+
+static int
+sync_chunk(struct rte_memseg_list *primary_msl,
+ struct rte_memseg_list *local_msl, struct hugepage_info *hi,
+ unsigned int msl_idx, bool used, int start, int end)
+{
+ struct rte_fbarray *l_arr, *p_arr;
+ int i, ret, chunk_len, diff_len;
+
+ l_arr = &local_msl->memseg_arr;
+ p_arr = &primary_msl->memseg_arr;
+
+ /* we need to aggregate allocations/deallocations into bigger chunks,
+ * as we don't want to spam the user with per-page callbacks.
+ *
+ * to avoid any potential issues, we also want to trigger
+ * deallocation callbacks *before* we actually deallocate
+ * memory, so that the user application could wrap up its use
+ * before it goes away.
+ */
+
+ chunk_len = end - start;
+
+ /* find how many contiguous pages we can map/unmap for this chunk */
+ diff_len = used ?
+ rte_fbarray_find_contig_free(l_arr, start) :
+ rte_fbarray_find_contig_used(l_arr, start);
+
+ /* has to be at least one page */
+ if (diff_len < 1)
+ return -1;
+
+ diff_len = RTE_MIN(chunk_len, diff_len);
+
+ /* if we are freeing memory, notify the application */
+ if (!used) {
+ struct rte_memseg *ms;
+ void *start_va;
+ size_t len, page_sz;
+
+ ms = rte_fbarray_get(l_arr, start);
+ start_va = ms->addr;
+ page_sz = (size_t)primary_msl->page_sz;
+ len = page_sz * diff_len;
+
+ eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE,
+ start_va, len);
+ }
+
+ for (i = 0; i < diff_len; i++) {
+ struct rte_memseg *p_ms, *l_ms;
+ int seg_idx = start + i;
+
+ l_ms = rte_fbarray_get(l_arr, seg_idx);
+ p_ms = rte_fbarray_get(p_arr, seg_idx);
+
+ if (l_ms == NULL || p_ms == NULL)
+ return -1;
+
+ if (used) {
+ ret = alloc_seg(l_ms, p_ms->addr,
+ p_ms->socket_id, hi,
+ msl_idx, seg_idx);
+ if (ret < 0)
+ return -1;
+ rte_fbarray_set_used(l_arr, seg_idx);
+ } else {
+ ret = free_seg(l_ms, hi, msl_idx, seg_idx);
+ rte_fbarray_set_free(l_arr, seg_idx);
+ if (ret < 0)
+ return -1;
+ }
+ }
+
+ /* if we just allocated memory, notify the application */
+ if (used) {
+ struct rte_memseg *ms;
+ void *start_va;
+ size_t len, page_sz;
+
+ ms = rte_fbarray_get(l_arr, start);
+ start_va = ms->addr;
+ page_sz = (size_t)primary_msl->page_sz;
+ len = page_sz * diff_len;
+
+ eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC,
+ start_va, len);
+ }
+
+ /* calculate how much we can advance until next chunk */
+ diff_len = used ?
+ rte_fbarray_find_contig_used(l_arr, start) :
+ rte_fbarray_find_contig_free(l_arr, start);
+ ret = RTE_MIN(chunk_len, diff_len);
+
+ return ret;
+}
+
+static int
+sync_status(struct rte_memseg_list *primary_msl,
+ struct rte_memseg_list *local_msl, struct hugepage_info *hi,
+ unsigned int msl_idx, bool used)
+{
+ struct rte_fbarray *l_arr, *p_arr;
+ int p_idx, l_chunk_len, p_chunk_len, ret;
+ int start, end;
+
+ /* this is a little bit tricky, but the basic idea is - walk both lists
+ * and spot any places where there are discrepancies. walking both lists
+ * and noting discrepancies in a single go is a hard problem, so we do
+ * it in two passes - first we spot any places where allocated segments
+ * mismatch (i.e. ensure that everything that's allocated in the primary
+ * is also allocated in the secondary), and then we do it by looking at
+ * free segments instead.
+ *
+ * we also need to aggregate changes into chunks, as we have to call
+ * callbacks per allocation, not per page.
+ */
+ l_arr = &local_msl->memseg_arr;
+ p_arr = &primary_msl->memseg_arr;
+
+ if (used)
+ p_idx = rte_fbarray_find_next_used(p_arr, 0);
+ else
+ p_idx = rte_fbarray_find_next_free(p_arr, 0);
+
+ while (p_idx >= 0) {
+ int next_chunk_search_idx;
+
+ if (used) {
+ p_chunk_len = rte_fbarray_find_contig_used(p_arr,
+ p_idx);
+ l_chunk_len = rte_fbarray_find_contig_used(l_arr,
+ p_idx);
+ } else {
+ p_chunk_len = rte_fbarray_find_contig_free(p_arr,
+ p_idx);
+ l_chunk_len = rte_fbarray_find_contig_free(l_arr,
+ p_idx);
+ }
+		/* best case scenario - no differences (or the local chunk is
+		 * bigger, which will be fixed during the next iteration), so
+		 * look for the next chunk
+		 */
+ if (l_chunk_len >= p_chunk_len) {
+ next_chunk_search_idx = p_idx + p_chunk_len;
+ goto next_chunk;
+ }
+
+ /* if both chunks start at the same point, skip parts we know
+ * are identical, and sync the rest. each call to sync_chunk
+ * will only sync contiguous segments, so we need to call this
+ * until we are sure there are no more differences in this
+ * chunk.
+ */
+ start = p_idx + l_chunk_len;
+ end = p_idx + p_chunk_len;
+ do {
+ ret = sync_chunk(primary_msl, local_msl, hi, msl_idx,
+ used, start, end);
+ start += ret;
+ } while (start < end && ret >= 0);
+ /* if ret is negative, something went wrong */
+ if (ret < 0)
+ return -1;
+
+ next_chunk_search_idx = p_idx + p_chunk_len;
+next_chunk:
+ /* skip to end of this chunk */
+ if (used) {
+ p_idx = rte_fbarray_find_next_used(p_arr,
+ next_chunk_search_idx);
+ } else {
+ p_idx = rte_fbarray_find_next_free(p_arr,
+ next_chunk_search_idx);
+ }
+ }
+ return 0;
+}
+
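+/*
+ * A small worked example of the two-pass sync above: if the primary's
+ * used-map for a list is [XXXXXXXX] but the local (secondary) map is
+ * [XXXX....], the first pass finds an 8-page used chunk in the primary but
+ * only a 4-page one locally, so sync_chunk() is called over segments 4..7
+ * and allocates the missing pages; the second (free) pass then verifies
+ * that everything free in the primary is also free locally.
+ */
+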
+static int
+sync_existing(struct rte_memseg_list *primary_msl,
+ struct rte_memseg_list *local_msl, struct hugepage_info *hi,
+ unsigned int msl_idx)
+{
+ int ret, dir_fd;
+
+ /* do not allow any page allocations during the time we're allocating,
+ * because file creation and locking operations are not atomic,
+ * and we might be the first or the last ones to use a particular page,
+ * so we need to ensure atomicity of every operation.
+ */
+ dir_fd = open(hi->hugedir, O_RDONLY);
+ if (dir_fd < 0) {
+ RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", __func__,
+ hi->hugedir, strerror(errno));
+ return -1;
+ }
+ /* blocking writelock */
+ if (flock(dir_fd, LOCK_EX)) {
+ RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", __func__,
+ hi->hugedir, strerror(errno));
+ close(dir_fd);
+ return -1;
+ }
+
+ /* ensure all allocated space is the same in both lists */
+ ret = sync_status(primary_msl, local_msl, hi, msl_idx, true);
+ if (ret < 0)
+ goto fail;
+
+ /* ensure all unallocated space is the same in both lists */
+ ret = sync_status(primary_msl, local_msl, hi, msl_idx, false);
+ if (ret < 0)
+ goto fail;
+
+ /* update version number */
+ local_msl->version = primary_msl->version;
+
+ close(dir_fd);
+
+ return 0;
+fail:
+ close(dir_fd);
+ return -1;
+}
+
+static int
+sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *primary_msl, *local_msl;
+ struct hugepage_info *hi = NULL;
+ unsigned int i;
+ int msl_idx;
+
+ if (msl->external)
+ return 0;
+
+ msl_idx = msl - mcfg->memsegs;
+ primary_msl = &mcfg->memsegs[msl_idx];
+ local_msl = &local_memsegs[msl_idx];
+
+ for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) {
+ uint64_t cur_sz =
+ internal_config.hugepage_info[i].hugepage_sz;
+ uint64_t msl_sz = primary_msl->page_sz;
+ if (msl_sz == cur_sz) {
+ hi = &internal_config.hugepage_info[i];
+ break;
+ }
+ }
+ if (!hi) {
+ RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
+ return -1;
+ }
+
+ /* if versions don't match, synchronize everything */
+ if (local_msl->version != primary_msl->version &&
+ sync_existing(primary_msl, local_msl, hi, msl_idx))
+ return -1;
+ return 0;
+}
+
+int
+eal_memalloc_sync_with_primary(void)
+{
+ /* nothing to be done in primary */
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+ return 0;
+
+ /* memalloc is locked, so it's safe to call thread-unsafe version */
+ if (rte_memseg_list_walk_thread_unsafe(sync_walk, NULL))
+ return -1;
+ return 0;
+}
+
+static int
+secondary_msl_create_walk(const struct rte_memseg_list *msl,
+ void *arg __rte_unused)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *primary_msl, *local_msl;
+ char name[PATH_MAX];
+ int msl_idx, ret;
+
+ if (msl->external)
+ return 0;
+
+ msl_idx = msl - mcfg->memsegs;
+ primary_msl = &mcfg->memsegs[msl_idx];
+ local_msl = &local_memsegs[msl_idx];
+
+ /* create distinct fbarrays for each secondary */
+ snprintf(name, RTE_FBARRAY_NAME_LEN, "%s_%i",
+ primary_msl->memseg_arr.name, getpid());
+
+ ret = rte_fbarray_init(&local_msl->memseg_arr, name,
+ primary_msl->memseg_arr.len,
+ primary_msl->memseg_arr.elt_sz);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "Cannot initialize local memory map\n");
+ return -1;
+ }
+ local_msl->base_va = primary_msl->base_va;
+ local_msl->len = primary_msl->len;
+
+ return 0;
+}
+
+static int
+alloc_list(int list_idx, int len)
+{
+ int *data;
+ int i;
+
+ /* single-file segments mode does not need fd list */
+ if (!internal_config.single_file_segments) {
+		/* ensure we have space to store an fd for each possible segment */
+ data = malloc(sizeof(int) * len);
+ if (data == NULL) {
+ RTE_LOG(ERR, EAL, "Unable to allocate space for file descriptors\n");
+ return -1;
+ }
+ /* set all fd's as invalid */
+ for (i = 0; i < len; i++)
+ data[i] = -1;
+ fd_list[list_idx].fds = data;
+ fd_list[list_idx].len = len;
+ } else {
+ fd_list[list_idx].fds = NULL;
+ fd_list[list_idx].len = 0;
+ }
+
+ fd_list[list_idx].count = 0;
+ fd_list[list_idx].memseg_list_fd = -1;
+
+ return 0;
+}
+
+static int
+fd_list_create_walk(const struct rte_memseg_list *msl,
+ void *arg __rte_unused)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ unsigned int len;
+ int msl_idx;
+
+ if (msl->external)
+ return 0;
+
+ msl_idx = msl - mcfg->memsegs;
+ len = msl->memseg_arr.len;
+
+ return alloc_list(msl_idx, len);
+}
+
+int
+eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+
+ /* single file segments mode doesn't support individual segment fd's */
+ if (internal_config.single_file_segments)
+ return -ENOTSUP;
+
+ /* if list is not allocated, allocate it */
+ if (fd_list[list_idx].len == 0) {
+ int len = mcfg->memsegs[list_idx].memseg_arr.len;
+
+ if (alloc_list(list_idx, len) < 0)
+ return -ENOMEM;
+ }
+ fd_list[list_idx].fds[seg_idx] = fd;
+
+ return 0;
+}
+
+int
+eal_memalloc_set_seg_list_fd(int list_idx, int fd)
+{
+ /* non-single file segment mode doesn't support segment list fd's */
+ if (!internal_config.single_file_segments)
+ return -ENOTSUP;
+
+ fd_list[list_idx].memseg_list_fd = fd;
+
+ return 0;
+}
+
+int
+eal_memalloc_get_seg_fd(int list_idx, int seg_idx)
+{
+ int fd;
+
+ if (internal_config.in_memory || internal_config.no_hugetlbfs) {
+#ifndef MEMFD_SUPPORTED
+ /* in in-memory or no-huge mode, we rely on memfd support */
+ return -ENOTSUP;
+#endif
+ /* memfd supported, but hugetlbfs memfd may not be */
+ if (!internal_config.no_hugetlbfs && !memfd_create_supported)
+ return -ENOTSUP;
+ }
+
+ if (internal_config.single_file_segments) {
+ fd = fd_list[list_idx].memseg_list_fd;
+ } else if (fd_list[list_idx].len == 0) {
+ /* list not initialized */
+ fd = -1;
+ } else {
+ fd = fd_list[list_idx].fds[seg_idx];
+ }
+ if (fd < 0)
+ return -ENODEV;
+ return fd;
+}
+
+static int
+test_memfd_create(void)
+{
+#ifdef MEMFD_SUPPORTED
+ unsigned int i;
+ for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
+ uint64_t pagesz = internal_config.hugepage_info[i].hugepage_sz;
+ int pagesz_flag = pagesz_flags(pagesz);
+ int flags;
+
+ flags = pagesz_flag | RTE_MFD_HUGETLB;
+ int fd = memfd_create("test", flags);
+ if (fd < 0) {
+ /* we failed - let memalloc know this isn't working */
+ if (errno == EINVAL) {
+ memfd_create_supported = 0;
+ return 0; /* not supported */
+ }
+
+ /* we got other error - something's wrong */
+ return -1; /* error */
+ }
+ close(fd);
+ return 1; /* supported */
+ }
+#endif
+ return 0; /* not supported */
+}
+
+int
+eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+
+ if (internal_config.in_memory || internal_config.no_hugetlbfs) {
+#ifndef MEMFD_SUPPORTED
+ /* in in-memory or no-huge mode, we rely on memfd support */
+ return -ENOTSUP;
+#endif
+ /* memfd supported, but hugetlbfs memfd may not be */
+ if (!internal_config.no_hugetlbfs && !memfd_create_supported)
+ return -ENOTSUP;
+ }
+
+ if (internal_config.single_file_segments) {
+ size_t pgsz = mcfg->memsegs[list_idx].page_sz;
+
+ /* segment not active? */
+ if (fd_list[list_idx].memseg_list_fd < 0)
+ return -ENOENT;
+ *offset = pgsz * seg_idx;
+ } else {
+ /* fd_list not initialized? */
+ if (fd_list[list_idx].len == 0)
+ return -ENODEV;
+
+ /* segment not active? */
+ if (fd_list[list_idx].fds[seg_idx] < 0)
+ return -ENOENT;
+ *offset = 0;
+ }
+ return 0;
+}
+
+int
+eal_memalloc_init(void)
+{
+ if (rte_eal_process_type() == RTE_PROC_SECONDARY)
+ if (rte_memseg_list_walk(secondary_msl_create_walk, NULL) < 0)
+ return -1;
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
+ internal_config.in_memory) {
+ int mfd_res = test_memfd_create();
+
+ if (mfd_res < 0) {
+ RTE_LOG(ERR, EAL, "Unable to check if memfd is supported\n");
+ return -1;
+ }
+ if (mfd_res == 1)
+ RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n");
+ else
+ RTE_LOG(INFO, EAL, "Using memfd is not supported, falling back to anonymous hugepages\n");
+
+ /* we only support single-file segments mode with in-memory mode
+ * if we support hugetlbfs with memfd_create. this code will
+ * test if we do.
+ */
+ if (internal_config.single_file_segments &&
+ mfd_res != 1) {
+ RTE_LOG(ERR, EAL, "Single-file segments mode cannot be used without memfd support\n");
+ return -1;
+ }
+ /* this cannot ever happen but better safe than sorry */
+ if (!anonymous_hugepages_supported) {
+ RTE_LOG(ERR, EAL, "Using anonymous memory is not supported\n");
+ return -1;
+ }
+ }
+
+ /* initialize all of the fd lists */
+ if (rte_memseg_list_walk(fd_list_create_walk, NULL))
+ return -1;
+ return 0;
+}
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation.
+ * Copyright(c) 2013 6WIND S.A.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/queue.h>
+#include <sys/file.h>
+#include <sys/resource.h>
+#include <unistd.h>
+#include <limits.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <signal.h>
+#include <setjmp.h>
+#ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */
+#include <linux/memfd.h>
+#define MEMFD_SUPPORTED
+#endif
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+#include <numa.h>
+#include <numaif.h>
+#endif
+
+#include <rte_errno.h>
+#include <rte_log.h>
+#include <rte_memory.h>
+#include <rte_launch.h>
+#include <rte_eal.h>
+#include <rte_per_lcore.h>
+#include <rte_lcore.h>
+#include <rte_common.h>
+#include <rte_string_fns.h>
+
+#include "eal_private.h"
+#include "eal_memalloc.h"
+#include "eal_memcfg.h"
+#include "eal_internal_cfg.h"
+#include "eal_filesystem.h"
+#include "eal_hugepages.h"
+#include "eal_options.h"
+
+#define PFN_MASK_SIZE 8
+
+/**
+ * @file
+ * Huge page mapping under Linux
+ *
+ * To reserve a large contiguous amount of memory, we use the hugepage
+ * feature of Linux. For that, we need to have hugetlbfs mounted. This
+ * code will create many files in this directory (one per page) and
+ * map them into virtual memory. For each page, we will retrieve its
+ * physical address and remap it in order to have a contiguous virtual
+ * zone as well as a contiguous physical zone.
+ */
+
+static int phys_addrs_available = -1;
+
+#define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space"
+
+uint64_t eal_get_baseaddr(void)
+{
+ /*
+	 * The Linux kernel uses a really high address as the starting address
+	 * for serving mmap calls. If there are addressing limitations and the
+	 * IOVA mode is VA, this starting address is likely too high for those
+	 * devices. However, it is possible to use a lower address in the
+	 * process virtual address space, as with 64 bits there is a lot of
+	 * available space.
+	 *
+	 * Currently known limitations are 39 or 40 bits. Setting the starting
+	 * address at 4GB implies there are 508GB or 1020GB for mapping the
+	 * available hugepages. This is likely enough for most systems, although
+	 * a device with addressing limitations should call
+	 * rte_mem_check_dma_mask to ensure all memory is within the supported
+	 * range.
+ */
+ return 0x100000000ULL;
+}
+
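+/*
+ * The arithmetic behind the comment above: a 39-bit limit covers 2^39 bytes
+ * = 512 GB; subtracting the 4 GB base leaves 508 GB. Likewise, a 40-bit
+ * limit covers 1024 GB, leaving 1020 GB above the 4 GB starting address.
+ */
+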
+/*
+ * Get physical address of any mapped virtual address in the current process.
+ */
+phys_addr_t
+rte_mem_virt2phy(const void *virtaddr)
+{
+ int fd, retval;
+ uint64_t page, physaddr;
+ unsigned long virt_pfn;
+ int page_size;
+ off_t offset;
+
+ if (phys_addrs_available == 0)
+ return RTE_BAD_IOVA;
+
+ /* standard page size */
+ page_size = getpagesize();
+
+ fd = open("/proc/self/pagemap", O_RDONLY);
+ if (fd < 0) {
+ RTE_LOG(INFO, EAL, "%s(): cannot open /proc/self/pagemap: %s\n",
+ __func__, strerror(errno));
+ return RTE_BAD_IOVA;
+ }
+
+ virt_pfn = (unsigned long)virtaddr / page_size;
+ offset = sizeof(uint64_t) * virt_pfn;
+ if (lseek(fd, offset, SEEK_SET) == (off_t) -1) {
+ RTE_LOG(INFO, EAL, "%s(): seek error in /proc/self/pagemap: %s\n",
+ __func__, strerror(errno));
+ close(fd);
+ return RTE_BAD_IOVA;
+ }
+
+ retval = read(fd, &page, PFN_MASK_SIZE);
+ close(fd);
+ if (retval < 0) {
+ RTE_LOG(INFO, EAL, "%s(): cannot read /proc/self/pagemap: %s\n",
+ __func__, strerror(errno));
+ return RTE_BAD_IOVA;
+ } else if (retval != PFN_MASK_SIZE) {
+ RTE_LOG(INFO, EAL, "%s(): read %d bytes from /proc/self/pagemap "
+			"but expected %d\n",
+ __func__, retval, PFN_MASK_SIZE);
+ return RTE_BAD_IOVA;
+ }
+
+ /*
+	 * the pfn (page frame number) occupies bits 0-54 (see
+	 * pagemap.txt in the Linux kernel Documentation)
+ */
+ if ((page & 0x7fffffffffffffULL) == 0)
+ return RTE_BAD_IOVA;
+
+ physaddr = ((page & 0x7fffffffffffffULL) * page_size)
+ + ((unsigned long)virtaddr % page_size);
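+
+	/* worked example: with 4 KiB pages, pfn 0x12345 and an in-page
+	 * offset of 0x678 give 0x12345 * 0x1000 + 0x678 = 0x12345678.
+	 */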
+
+ return physaddr;
+}
+
+rte_iova_t
+rte_mem_virt2iova(const void *virtaddr)
+{
+ if (rte_eal_iova_mode() == RTE_IOVA_VA)
+ return (uintptr_t)virtaddr;
+ return rte_mem_virt2phy(virtaddr);
+}
+
+/*
+ * For each hugepage in hugepg_tbl, fill the physaddr value. We find
+ * it by browsing the /proc/self/pagemap special file.
+ */
+static int
+find_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
+{
+ unsigned int i;
+ phys_addr_t addr;
+
+ for (i = 0; i < hpi->num_pages[0]; i++) {
+ addr = rte_mem_virt2phy(hugepg_tbl[i].orig_va);
+ if (addr == RTE_BAD_PHYS_ADDR)
+ return -1;
+ hugepg_tbl[i].physaddr = addr;
+ }
+ return 0;
+}
+
+/*
+ * For each hugepage in hugepg_tbl, fill the physaddr value sequentially.
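+ * Used when real physical addresses are not available (e.g. in IOVA-VA
+ * mode); the static counter keeps the fake addresses unique across calls
+ * for different page sizes.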
+ */
+static int
+set_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
+{
+ unsigned int i;
+ static phys_addr_t addr;
+
+ for (i = 0; i < hpi->num_pages[0]; i++) {
+ hugepg_tbl[i].physaddr = addr;
+ addr += hugepg_tbl[i].size;
+ }
+ return 0;
+}
+
+/*
+ * Check whether address-space layout randomization is enabled in
+ * the kernel. This is important for multi-process as it can prevent
+ * two processes from mapping data to the same virtual address.
+ * Returns:
+ * 0 - address space randomization disabled
+ * 1/2 - address space randomization enabled
+ * negative error code on error
+ */
+static int
+aslr_enabled(void)
+{
+ char c;
+ int retval, fd = open(RANDOMIZE_VA_SPACE_FILE, O_RDONLY);
+ if (fd < 0)
+ return -errno;
+ retval = read(fd, &c, 1);
+ close(fd);
+ if (retval < 0)
+ return -errno;
+ if (retval == 0)
+ return -EIO;
+ switch (c) {
+ case '0' : return 0;
+ case '1' : return 1;
+ case '2' : return 2;
+ default: return -EINVAL;
+ }
+}
+
+static sigjmp_buf huge_jmpenv;
+
+static void huge_sigbus_handler(int signo __rte_unused)
+{
+ siglongjmp(huge_jmpenv, 1);
+}
+
+/* Wrap sigsetjmp in its own function to avoid compiler warnings: any non-volatile,
+ * non-static local variable in the stack frame calling sigsetjmp might be
+ * clobbered by a call to longjmp.
+ */
+static int huge_wrap_sigsetjmp(void)
+{
+ return sigsetjmp(huge_jmpenv, 1);
+}
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+/* Callback for numa library. */
+void numa_error(char *where)
+{
+ RTE_LOG(ERR, EAL, "%s failed: %s\n", where, strerror(errno));
+}
+#endif
+
+/*
+ * Mmap all hugepages in the hugepage table: first open a file in
+ * hugetlbfs, then mmap() hugepage_sz bytes of it. If orig is set, the
+ * virtual address is stored in hugepg_tbl[i].orig_va, else it is stored
+ * in hugepg_tbl[i].final_va. The second mapping (when orig is 0) tries to
+ * map contiguous physical blocks in contiguous virtual blocks.
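+ *
+ * Files are created via eal_get_hugefile_path(); with the default file
+ * prefix this yields names such as "/mnt/huge/rtemap_0" (illustrative).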
+ */
+static unsigned
+map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
+ uint64_t *essential_memory __rte_unused)
+{
+ int fd;
+ unsigned i;
+ void *virtaddr;
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+ int node_id = -1;
+ int essential_prev = 0;
+ int oldpolicy;
+ struct bitmask *oldmask = NULL;
+ bool have_numa = true;
+ unsigned long maxnode = 0;
+
+ /* Check if kernel supports NUMA. */
+ if (numa_available() != 0) {
+ RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n");
+ have_numa = false;
+ }
+
+ if (have_numa) {
+ RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
+ oldmask = numa_allocate_nodemask();
+ if (get_mempolicy(&oldpolicy, oldmask->maskp,
+ oldmask->size + 1, 0, 0) < 0) {
+ RTE_LOG(ERR, EAL,
+ "Failed to get current mempolicy: %s. "
+ "Assuming MPOL_DEFAULT.\n", strerror(errno));
+ oldpolicy = MPOL_DEFAULT;
+ }
+ for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
+ if (internal_config.socket_mem[i])
+ maxnode = i + 1;
+ }
+#endif
+
+ for (i = 0; i < hpi->num_pages[0]; i++) {
+ struct hugepage_file *hf = &hugepg_tbl[i];
+ uint64_t hugepage_sz = hpi->hugepage_sz;
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+ if (maxnode) {
+ unsigned int j;
+
+ for (j = 0; j < maxnode; j++)
+ if (essential_memory[j])
+ break;
+
+ if (j == maxnode) {
+ node_id = (node_id + 1) % maxnode;
+ while (!internal_config.socket_mem[node_id]) {
+ node_id++;
+ node_id %= maxnode;
+ }
+ essential_prev = 0;
+ } else {
+ node_id = j;
+ essential_prev = essential_memory[j];
+
+ if (essential_memory[j] < hugepage_sz)
+ essential_memory[j] = 0;
+ else
+ essential_memory[j] -= hugepage_sz;
+ }
+
+ RTE_LOG(DEBUG, EAL,
+ "Setting policy MPOL_PREFERRED for socket %d\n",
+ node_id);
+ numa_set_preferred(node_id);
+ }
+#endif
+
+ hf->file_id = i;
+ hf->size = hugepage_sz;
+ eal_get_hugefile_path(hf->filepath, sizeof(hf->filepath),
+ hpi->hugedir, hf->file_id);
+ hf->filepath[sizeof(hf->filepath) - 1] = '\0';
+
+ /* try to create hugepage file */
+ fd = open(hf->filepath, O_CREAT | O_RDWR, 0600);
+ if (fd < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
+ strerror(errno));
+ goto out;
+ }
+
+ /* map the segment, and populate page tables,
+ * the kernel fills this segment with zeros. we don't care where
+ * this gets mapped - we already have contiguous memory areas
+ * ready for us to map into.
+ */
+ virtaddr = mmap(NULL, hugepage_sz, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE, fd, 0);
+ if (virtaddr == MAP_FAILED) {
+ RTE_LOG(DEBUG, EAL, "%s(): mmap failed: %s\n", __func__,
+ strerror(errno));
+ close(fd);
+ goto out;
+ }
+
+ hf->orig_va = virtaddr;
+
+		/* In Linux, hugetlb limitations (e.g. cgroup) are
+		 * enforced at fault time rather than at mmap(), even
+		 * with MAP_POPULATE. The kernel then sends a SIGBUS
+		 * signal. To avoid being killed, save the stack
+		 * environment here: if SIGBUS happens, we can jump
+		 * back to it.
+		 */
+ if (huge_wrap_sigsetjmp()) {
+ RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more "
+ "hugepages of size %u MB\n",
+ (unsigned int)(hugepage_sz / 0x100000));
+ munmap(virtaddr, hugepage_sz);
+ close(fd);
+ unlink(hugepg_tbl[i].filepath);
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+ if (maxnode)
+ essential_memory[node_id] =
+ essential_prev;
+#endif
+ goto out;
+ }
+ *(int *)virtaddr = 0;
+
+ /* set shared lock on the file. */
+ if (flock(fd, LOCK_SH) < 0) {
+			RTE_LOG(DEBUG, EAL, "%s(): Locking file failed: %s\n",
+ __func__, strerror(errno));
+ close(fd);
+ goto out;
+ }
+
+ close(fd);
+ }
+
+out:
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+ if (maxnode) {
+ RTE_LOG(DEBUG, EAL,
+ "Restoring previous memory policy: %d\n", oldpolicy);
+ if (oldpolicy == MPOL_DEFAULT) {
+ numa_set_localalloc();
+ } else if (set_mempolicy(oldpolicy, oldmask->maskp,
+ oldmask->size + 1) < 0) {
+ RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n",
+ strerror(errno));
+ numa_set_localalloc();
+ }
+ }
+ if (oldmask != NULL)
+ numa_free_cpumask(oldmask);
+#endif
+ return i;
+}
+
+/*
+ * Parse /proc/self/numa_maps to get the NUMA socket ID for each huge
+ * page.
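+ *
+ * A typical entry for a mapped hugepage looks roughly like this
+ * (illustrative; exact fields vary by kernel version):
+ *   7f519a600000 default file=/mnt/huge/rtemap_0 huge dirty=1 N1=1
+ * We take the start address from the front and the node ID from "N<n>=".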
+ */
+static int
+find_numasocket(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
+{
+ int socket_id;
+ char *end, *nodestr;
+ unsigned i, hp_count = 0;
+ uint64_t virt_addr;
+ char buf[BUFSIZ];
+ char hugedir_str[PATH_MAX];
+ FILE *f;
+
+ f = fopen("/proc/self/numa_maps", "r");
+ if (f == NULL) {
+		RTE_LOG(NOTICE, EAL, "NUMA support not available,"
+			" assuming all memory is on socket_id 0\n");
+ return 0;
+ }
+
+ snprintf(hugedir_str, sizeof(hugedir_str),
+ "%s/%s", hpi->hugedir, eal_get_hugefile_prefix());
+
+ /* parse numa map */
+ while (fgets(buf, sizeof(buf), f) != NULL) {
+
+		/* ignore non-hugepage mappings */
+ if (strstr(buf, " huge ") == NULL &&
+ strstr(buf, hugedir_str) == NULL)
+ continue;
+
+ /* get zone addr */
+ virt_addr = strtoull(buf, &end, 16);
+ if (virt_addr == 0 || end == buf) {
+ RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
+ goto error;
+ }
+
+ /* get node id (socket id) */
+ nodestr = strstr(buf, " N");
+ if (nodestr == NULL) {
+ RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
+ goto error;
+ }
+ nodestr += 2;
+ end = strstr(nodestr, "=");
+ if (end == NULL) {
+ RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
+ goto error;
+ }
+ end[0] = '\0';
+ end = NULL;
+
+ socket_id = strtoul(nodestr, &end, 0);
+ if ((nodestr[0] == '\0') || (end == NULL) || (*end != '\0')) {
+ RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
+ goto error;
+ }
+
+ /* if we find this page in our mappings, set socket_id */
+ for (i = 0; i < hpi->num_pages[0]; i++) {
+ void *va = (void *)(unsigned long)virt_addr;
+ if (hugepg_tbl[i].orig_va == va) {
+ hugepg_tbl[i].socket_id = socket_id;
+ hp_count++;
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+ RTE_LOG(DEBUG, EAL,
+ "Hugepage %s is on socket %d\n",
+ hugepg_tbl[i].filepath, socket_id);
+#endif
+ }
+ }
+ }
+
+ if (hp_count < hpi->num_pages[0])
+ goto error;
+
+ fclose(f);
+ return 0;
+
+error:
+ fclose(f);
+ return -1;
+}
+
+static int
+cmp_physaddr(const void *a, const void *b)
+{
+#ifndef RTE_ARCH_PPC_64
+ const struct hugepage_file *p1 = a;
+ const struct hugepage_file *p2 = b;
+#else
+ /* PowerPC needs memory sorted in reverse order from x86 */
+ const struct hugepage_file *p1 = b;
+ const struct hugepage_file *p2 = a;
+#endif
+ if (p1->physaddr < p2->physaddr)
+ return -1;
+ else if (p1->physaddr > p2->physaddr)
+ return 1;
+ else
+ return 0;
+}
+
+/*
+ * Uses mmap to create a shared memory area for storage of data
+ * Used in this file to store the hugepage file map on disk
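+ * (the file behind eal_hugepage_data_path(), typically
+ * /var/run/dpdk/<prefix>/hugepage_data)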
+ */
+static void *
+create_shared_memory(const char *filename, const size_t mem_size)
+{
+ void *retval;
+ int fd;
+
+	/* in no-shared-files mode, create anonymous memory instead */
+ if (internal_config.no_shconf) {
+ retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ if (retval == MAP_FAILED)
+ return NULL;
+ return retval;
+ }
+
+ fd = open(filename, O_CREAT | O_RDWR, 0600);
+ if (fd < 0)
+ return NULL;
+ if (ftruncate(fd, mem_size) < 0) {
+ close(fd);
+ return NULL;
+ }
+ retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ close(fd);
+ if (retval == MAP_FAILED)
+ return NULL;
+ return retval;
+}
+
+/*
+ * this copies *active* hugepages from one hugepage table to another.
+ * destination is typically the shared memory.
+ */
+static int
+copy_hugepages_to_shared_mem(struct hugepage_file *dst, int dest_size,
+	const struct hugepage_file *src, int src_size)
+{
+ int src_pos, dst_pos = 0;
+
+ for (src_pos = 0; src_pos < src_size; src_pos++) {
+ if (src[src_pos].orig_va != NULL) {
+ /* error on overflow attempt */
+ if (dst_pos == dest_size)
+ return -1;
+ memcpy(&dst[dst_pos], &src[src_pos], sizeof(struct hugepage_file));
+ dst_pos++;
+ }
+ }
+ return 0;
+}
+
+static int
+unlink_hugepage_files(struct hugepage_file *hugepg_tbl,
+ unsigned num_hp_info)
+{
+ unsigned socket, size;
+ int page, nrpages = 0;
+
+ /* get total number of hugepages */
+ for (size = 0; size < num_hp_info; size++)
+ for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++)
+ nrpages +=
+ internal_config.hugepage_info[size].num_pages[socket];
+
+ for (page = 0; page < nrpages; page++) {
+ struct hugepage_file *hp = &hugepg_tbl[page];
+
+ if (hp->orig_va != NULL && unlink(hp->filepath)) {
+ RTE_LOG(WARNING, EAL, "%s(): Removing %s failed: %s\n",
+ __func__, hp->filepath, strerror(errno));
+ }
+ }
+ return 0;
+}
+
+/*
+ * unmaps hugepages that are not going to be used. since we originally allocate
+ * ALL hugepages (not just those we need), additional unmapping needs to be done.
+ */
+static int
+unmap_unneeded_hugepages(struct hugepage_file *hugepg_tbl,
+ struct hugepage_info *hpi,
+ unsigned num_hp_info)
+{
+ unsigned socket, size;
+ int page, nrpages = 0;
+
+ /* get total number of hugepages */
+ for (size = 0; size < num_hp_info; size++)
+ for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++)
+ nrpages += internal_config.hugepage_info[size].num_pages[socket];
+
+ for (size = 0; size < num_hp_info; size++) {
+ for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) {
+ unsigned pages_found = 0;
+
+ /* traverse until we have unmapped all the unused pages */
+ for (page = 0; page < nrpages; page++) {
+ struct hugepage_file *hp = &hugepg_tbl[page];
+
+ /* find a page that matches the criteria */
+ if ((hp->size == hpi[size].hugepage_sz) &&
+ (hp->socket_id == (int) socket)) {
+
+ /* if we skipped enough pages, unmap the rest */
+ if (pages_found == hpi[size].num_pages[socket]) {
+ uint64_t unmap_len;
+
+ unmap_len = hp->size;
+
+ /* get start addr and len of the remaining segment */
+ munmap(hp->orig_va,
+ (size_t)unmap_len);
+
+ hp->orig_va = NULL;
+ if (unlink(hp->filepath) == -1) {
+ RTE_LOG(ERR, EAL, "%s(): Removing %s failed: %s\n",
+ __func__, hp->filepath, strerror(errno));
+ return -1;
+ }
+ } else {
+						/* this page is needed, so skip it */
+ pages_found++;
+ }
+
+ } /* match page */
+ } /* foreach page */
+ } /* foreach socket */
+ } /* foreach pagesize */
+
+ return 0;
+}
+
+static int
+remap_segment(struct hugepage_file *hugepages, int seg_start, int seg_end)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl;
+ struct rte_fbarray *arr;
+ int cur_page, seg_len;
+ unsigned int msl_idx;
+ int ms_idx;
+ uint64_t page_sz;
+ size_t memseg_len;
+ int socket_id;
+
+ page_sz = hugepages[seg_start].size;
+ socket_id = hugepages[seg_start].socket_id;
+ seg_len = seg_end - seg_start;
+
+ RTE_LOG(DEBUG, EAL, "Attempting to map %" PRIu64 "M on socket %i\n",
+ (seg_len * page_sz) >> 20ULL, socket_id);
+
+ /* find free space in memseg lists */
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ bool empty;
+ msl = &mcfg->memsegs[msl_idx];
+ arr = &msl->memseg_arr;
+
+ if (msl->page_sz != page_sz)
+ continue;
+ if (msl->socket_id != socket_id)
+ continue;
+
+ /* leave space for a hole if array is not empty */
+ empty = arr->count == 0;
+ ms_idx = rte_fbarray_find_next_n_free(arr, 0,
+ seg_len + (empty ? 0 : 1));
+
+ /* memseg list is full? */
+ if (ms_idx < 0)
+ continue;
+
+ /* leave some space between memsegs, they are not IOVA
+ * contiguous, so they shouldn't be VA contiguous either.
+ */
+ if (!empty)
+ ms_idx++;
+ break;
+ }
+ if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
+ RTE_LOG(ERR, EAL, "Could not find space for memseg. Please increase %s and/or %s in configuration.\n",
+ RTE_STR(CONFIG_RTE_MAX_MEMSEG_PER_TYPE),
+				RTE_STR(CONFIG_RTE_MAX_MEM_MB_PER_TYPE));
+ return -1;
+ }
+
+#ifdef RTE_ARCH_PPC_64
+ /* for PPC64 we go through the list backwards */
+ for (cur_page = seg_end - 1; cur_page >= seg_start;
+ cur_page--, ms_idx++) {
+#else
+ for (cur_page = seg_start; cur_page < seg_end; cur_page++, ms_idx++) {
+#endif
+ struct hugepage_file *hfile = &hugepages[cur_page];
+ struct rte_memseg *ms = rte_fbarray_get(arr, ms_idx);
+ void *addr;
+ int fd;
+
+ fd = open(hfile->filepath, O_RDWR);
+ if (fd < 0) {
+ RTE_LOG(ERR, EAL, "Could not open '%s': %s\n",
+ hfile->filepath, strerror(errno));
+ return -1;
+ }
+ /* set shared lock on the file. */
+ if (flock(fd, LOCK_SH) < 0) {
+ RTE_LOG(DEBUG, EAL, "Could not lock '%s': %s\n",
+ hfile->filepath, strerror(errno));
+ close(fd);
+ return -1;
+ }
+ memseg_len = (size_t)page_sz;
+ addr = RTE_PTR_ADD(msl->base_va, ms_idx * memseg_len);
+
+ /* we know this address is already mmapped by memseg list, so
+ * using MAP_FIXED here is safe
+ */
+ addr = mmap(addr, page_sz, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, 0);
+ if (addr == MAP_FAILED) {
+ RTE_LOG(ERR, EAL, "Couldn't remap '%s': %s\n",
+ hfile->filepath, strerror(errno));
+ close(fd);
+ return -1;
+ }
+
+ /* we have a new address, so unmap previous one */
+#ifndef RTE_ARCH_64
+ /* in 32-bit legacy mode, we have already unmapped the page */
+ if (!internal_config.legacy_mem)
+ munmap(hfile->orig_va, page_sz);
+#else
+ munmap(hfile->orig_va, page_sz);
+#endif
+
+ hfile->orig_va = NULL;
+ hfile->final_va = addr;
+
+ /* rewrite physical addresses in IOVA as VA mode */
+ if (rte_eal_iova_mode() == RTE_IOVA_VA)
+ hfile->physaddr = (uintptr_t)addr;
+
+ /* set up memseg data */
+ ms->addr = addr;
+ ms->hugepage_sz = page_sz;
+ ms->len = memseg_len;
+ ms->iova = hfile->physaddr;
+ ms->socket_id = hfile->socket_id;
+ ms->nchannel = rte_memory_get_nchannel();
+ ms->nrank = rte_memory_get_nrank();
+
+ rte_fbarray_set_used(arr, ms_idx);
+
+ /* store segment fd internally */
+ if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0)
+ RTE_LOG(ERR, EAL, "Could not store segment fd: %s\n",
+ rte_strerror(rte_errno));
+ }
+ RTE_LOG(DEBUG, EAL, "Allocated %" PRIu64 "M on socket %i\n",
+ (seg_len * page_sz) >> 20, socket_id);
+ return 0;
+}
+
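+/*
+ * Compute how much memory one memseg list may cover for a given page size.
+ * Example, assuming the default build-time limits of 8192 segments and
+ * 32768 MB per list: for 2 MiB pages a list covers
+ * min(8192 * 2 MiB, 32 GiB) = 16 GiB; for 1 GiB pages it covers 32 GiB.
+ */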
+static uint64_t
+get_mem_amount(uint64_t page_sz, uint64_t max_mem)
+{
+ uint64_t area_sz, max_pages;
+
+ /* limit to RTE_MAX_MEMSEG_PER_LIST pages or RTE_MAX_MEM_MB_PER_LIST */
+ max_pages = RTE_MAX_MEMSEG_PER_LIST;
+ max_mem = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20, max_mem);
+
+ area_sz = RTE_MIN(page_sz * max_pages, max_mem);
+
+ /* make sure the list isn't smaller than the page size */
+ area_sz = RTE_MAX(area_sz, page_sz);
+
+ return RTE_ALIGN(area_sz, page_sz);
+}
+
+static int
+free_memseg_list(struct rte_memseg_list *msl)
+{
+ if (rte_fbarray_destroy(&msl->memseg_arr)) {
+ RTE_LOG(ERR, EAL, "Cannot destroy memseg list\n");
+ return -1;
+ }
+ memset(msl, 0, sizeof(*msl));
+ return 0;
+}
+
+#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i"
+static int
+alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz,
+ int n_segs, int socket_id, int type_msl_idx)
+{
+ char name[RTE_FBARRAY_NAME_LEN];
+
+ snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id,
+ type_msl_idx);
+ if (rte_fbarray_init(&msl->memseg_arr, name, n_segs,
+ sizeof(struct rte_memseg))) {
+ RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n",
+ rte_strerror(rte_errno));
+ return -1;
+ }
+
+ msl->page_sz = page_sz;
+ msl->socket_id = socket_id;
+ msl->base_va = NULL;
+ msl->heap = 1; /* mark it as a heap segment */
+
+ RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n",
+ (size_t)page_sz >> 10, socket_id);
+
+ return 0;
+}
+
+static int
+alloc_va_space(struct rte_memseg_list *msl)
+{
+ uint64_t page_sz;
+ size_t mem_sz;
+ void *addr;
+ int flags = 0;
+
+ page_sz = msl->page_sz;
+ mem_sz = page_sz * msl->memseg_arr.len;
+
+ addr = eal_get_virtual_area(msl->base_va, &mem_sz, page_sz, 0, flags);
+ if (addr == NULL) {
+ if (rte_errno == EADDRNOTAVAIL)
+ RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - "
+ "please use '--" OPT_BASE_VIRTADDR "' option\n",
+ (unsigned long long)mem_sz, msl->base_va);
+ else
+ RTE_LOG(ERR, EAL, "Cannot reserve memory\n");
+ return -1;
+ }
+ msl->base_va = addr;
+ msl->len = mem_sz;
+
+ return 0;
+}
+
+/*
+ * Our VA space is not preallocated yet, so preallocate it here. We need to know
+ * how many segments there are in order to map all pages into one address space,
+ * and leave appropriate holes between segments so that rte_malloc does not
+ * concatenate them into one big segment.
+ *
+ * We also need to unmap the original pages to free up address space.
+ */
+static int __rte_unused
+prealloc_segments(struct hugepage_file *hugepages, int n_pages)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int cur_page, seg_start_page, end_seg, new_memseg;
+ unsigned int hpi_idx, socket, i;
+ int n_contig_segs, n_segs;
+ int msl_idx;
+
+ /* before we preallocate segments, we need to free up our VA space.
+ * we're not removing files, and we already have information about
+ * PA-contiguousness, so it is safe to unmap everything.
+ */
+ for (cur_page = 0; cur_page < n_pages; cur_page++) {
+ struct hugepage_file *hpi = &hugepages[cur_page];
+ munmap(hpi->orig_va, hpi->size);
+ hpi->orig_va = NULL;
+ }
+
+	/* we do not know in advance which page sizes and sockets hold pages,
+	 * so loop over all of them
+	 */
+ for (hpi_idx = 0; hpi_idx < internal_config.num_hugepage_sizes;
+ hpi_idx++) {
+ uint64_t page_sz =
+ internal_config.hugepage_info[hpi_idx].hugepage_sz;
+
+ for (i = 0; i < rte_socket_count(); i++) {
+ struct rte_memseg_list *msl;
+
+ socket = rte_socket_id_by_idx(i);
+ n_contig_segs = 0;
+ n_segs = 0;
+ seg_start_page = -1;
+
+ for (cur_page = 0; cur_page < n_pages; cur_page++) {
+ struct hugepage_file *prev, *cur;
+ int prev_seg_start_page = -1;
+
+ cur = &hugepages[cur_page];
+ prev = cur_page == 0 ? NULL :
+ &hugepages[cur_page - 1];
+
+ new_memseg = 0;
+ end_seg = 0;
+
+ if (cur->size == 0)
+ end_seg = 1;
+ else if (cur->socket_id != (int) socket)
+ end_seg = 1;
+ else if (cur->size != page_sz)
+ end_seg = 1;
+ else if (cur_page == 0)
+ new_memseg = 1;
+#ifdef RTE_ARCH_PPC_64
+				/* On the PPC64 architecture, mmap always starts
+				 * from higher addresses and works downwards. Here,
+ * physical addresses are in descending order.
+ */
+ else if ((prev->physaddr - cur->physaddr) !=
+ cur->size)
+ new_memseg = 1;
+#else
+ else if ((cur->physaddr - prev->physaddr) !=
+ cur->size)
+ new_memseg = 1;
+#endif
+ if (new_memseg) {
+ /* if we're already inside a segment,
+ * new segment means end of current one
+ */
+ if (seg_start_page != -1) {
+ end_seg = 1;
+ prev_seg_start_page =
+ seg_start_page;
+ }
+ seg_start_page = cur_page;
+ }
+
+ if (end_seg) {
+ if (prev_seg_start_page != -1) {
+ /* we've found a new segment */
+ n_contig_segs++;
+ n_segs += cur_page -
+ prev_seg_start_page;
+ } else if (seg_start_page != -1) {
+ /* we didn't find new segment,
+ * but did end current one
+ */
+ n_contig_segs++;
+ n_segs += cur_page -
+ seg_start_page;
+ seg_start_page = -1;
+ continue;
+ } else {
+ /* we're skipping this page */
+ continue;
+ }
+ }
+ /* segment continues */
+ }
+ /* check if we missed last segment */
+ if (seg_start_page != -1) {
+ n_contig_segs++;
+ n_segs += cur_page - seg_start_page;
+ }
+
+ /* if no segments were found, do not preallocate */
+ if (n_segs == 0)
+ continue;
+
+ /* we now have total number of pages that we will
+ * allocate for this segment list. add separator pages
+ * to the total count, and preallocate VA space.
+ */
+ n_segs += n_contig_segs - 1;
+
+ /* now, preallocate VA space for these segments */
+
+ /* first, find suitable memseg list for this */
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS;
+ msl_idx++) {
+ msl = &mcfg->memsegs[msl_idx];
+
+ if (msl->base_va != NULL)
+ continue;
+ break;
+ }
+ if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
+ RTE_LOG(ERR, EAL, "Not enough space in memseg lists, please increase %s\n",
+ RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
+ return -1;
+ }
+
+ /* now, allocate fbarray itself */
+ if (alloc_memseg_list(msl, page_sz, n_segs, socket,
+ msl_idx) < 0)
+ return -1;
+
+ /* finally, allocate VA space */
+ if (alloc_va_space(msl) < 0)
+ return -1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * We cannot reallocate memseg lists on the fly because PPC64 stores pages
+ * backwards, therefore we have to process the entire memseg first before
+ * remapping it into memseg list VA space.
+ */
+static int
+remap_needed_hugepages(struct hugepage_file *hugepages, int n_pages)
+{
+ int cur_page, seg_start_page, new_memseg, ret;
+
+ seg_start_page = 0;
+ for (cur_page = 0; cur_page < n_pages; cur_page++) {
+ struct hugepage_file *prev, *cur;
+
+ new_memseg = 0;
+
+ cur = &hugepages[cur_page];
+ prev = cur_page == 0 ? NULL : &hugepages[cur_page - 1];
+
+ /* if size is zero, no more pages left */
+ if (cur->size == 0)
+ break;
+
+ if (cur_page == 0)
+ new_memseg = 1;
+ else if (cur->socket_id != prev->socket_id)
+ new_memseg = 1;
+ else if (cur->size != prev->size)
+ new_memseg = 1;
+#ifdef RTE_ARCH_PPC_64
+		/* On the PPC64 architecture, mmap always starts from higher
+		 * addresses and works downwards. Here, physical addresses are in
+ * descending order.
+ */
+ else if ((prev->physaddr - cur->physaddr) != cur->size)
+ new_memseg = 1;
+#else
+ else if ((cur->physaddr - prev->physaddr) != cur->size)
+ new_memseg = 1;
+#endif
+
+ if (new_memseg) {
+ /* if this isn't the first time, remap segment */
+ if (cur_page != 0) {
+ ret = remap_segment(hugepages, seg_start_page,
+ cur_page);
+ if (ret != 0)
+ return -1;
+ }
+ /* remember where we started */
+ seg_start_page = cur_page;
+ }
+ /* continuation of previous memseg */
+ }
+ /* we were stopped, but we didn't remap the last segment, do it now */
+ if (cur_page != 0) {
+ ret = remap_segment(hugepages, seg_start_page,
+ cur_page);
+ if (ret != 0)
+ return -1;
+ }
+ return 0;
+}
+
+__rte_unused /* function is unused on 32-bit builds */
+static inline uint64_t
+get_socket_mem_size(int socket)
+{
+ uint64_t size = 0;
+ unsigned i;
+
+	for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
+ struct hugepage_info *hpi = &internal_config.hugepage_info[i];
+ size += hpi->hugepage_sz * hpi->num_pages[socket];
+ }
+
+ return size;
+}
+
+/*
+ * This function is a NUMA-aware equivalent of calc_num_pages.
+ * It takes in the list of hugepage sizes and the number of pages
+ * thereof, and calculates the best number of pages of each size
+ * to fulfill the request for <memory> RAM.
+ */
+static int
+calc_num_pages_per_socket(uint64_t * memory,
+ struct hugepage_info *hp_info,
+ struct hugepage_info *hp_used,
+ unsigned num_hp_info)
+{
+ unsigned socket, j, i = 0;
+ unsigned requested, available;
+ int total_num_pages = 0;
+ uint64_t remaining_mem, cur_mem;
+ uint64_t total_mem = internal_config.memory;
+
+ if (num_hp_info == 0)
+ return -1;
+
+ /* if specific memory amounts per socket weren't requested */
+ if (internal_config.force_sockets == 0) {
+ size_t total_size;
+#ifdef RTE_ARCH_64
+ int cpu_per_socket[RTE_MAX_NUMA_NODES];
+ size_t default_size;
+ unsigned lcore_id;
+
+ /* Compute number of cores per socket */
+ memset(cpu_per_socket, 0, sizeof(cpu_per_socket));
+ RTE_LCORE_FOREACH(lcore_id) {
+ cpu_per_socket[rte_lcore_to_socket_id(lcore_id)]++;
+ }
+
+ /*
+ * Automatically spread requested memory amongst detected sockets according
+ * to number of cores from cpu mask present on each socket
+ */
+ total_size = internal_config.memory;
+ for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) {
+
+ /* Set memory amount per socket */
+ default_size = (internal_config.memory * cpu_per_socket[socket])
+ / rte_lcore_count();
+
+ /* Limit to maximum available memory on socket */
+ default_size = RTE_MIN(default_size, get_socket_mem_size(socket));
+
+ /* Update sizes */
+ memory[socket] = default_size;
+ total_size -= default_size;
+ }
+
+ /*
+ * If some memory is remaining, try to allocate it by getting all
+ * available memory from sockets, one after the other
+ */
+ for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) {
+ /* take whatever is available */
+ default_size = RTE_MIN(get_socket_mem_size(socket) - memory[socket],
+ total_size);
+
+ /* Update sizes */
+ memory[socket] += default_size;
+ total_size -= default_size;
+ }
+#else
+ /* in 32-bit mode, allocate all of the memory only on master
+ * lcore socket
+ */
+ total_size = internal_config.memory;
+ for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0;
+ socket++) {
+ struct rte_config *cfg = rte_eal_get_configuration();
+ unsigned int master_lcore_socket;
+
+ master_lcore_socket =
+ rte_lcore_to_socket_id(cfg->master_lcore);
+
+ if (master_lcore_socket != socket)
+ continue;
+
+ /* Update sizes */
+ memory[socket] = total_size;
+ break;
+ }
+#endif
+ }
+
+ for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) {
+		/* skip this socket if no memory was requested on it */
+ for (i = 0; i < num_hp_info && memory[socket] != 0; i++){
+ strlcpy(hp_used[i].hugedir, hp_info[i].hugedir,
+ sizeof(hp_used[i].hugedir));
+ hp_used[i].num_pages[socket] = RTE_MIN(
+ memory[socket] / hp_info[i].hugepage_sz,
+ hp_info[i].num_pages[socket]);
+
+ cur_mem = hp_used[i].num_pages[socket] *
+ hp_used[i].hugepage_sz;
+
+ memory[socket] -= cur_mem;
+ total_mem -= cur_mem;
+
+ total_num_pages += hp_used[i].num_pages[socket];
+
+ /* check if we have met all memory requests */
+ if (memory[socket] == 0)
+ break;
+
+			/* if there are no more pages left at this size,
+			 * move on to the next size */
+ if (hp_used[i].num_pages[socket] == hp_info[i].num_pages[socket])
+ continue;
+			/* At this point we know that pages of this size are still
+			 * available, but each one is bigger than the memory we still
+			 * need, so let's see if we can get enough from other page
+			 * sizes.
+			 */
+ remaining_mem = 0;
+ for (j = i+1; j < num_hp_info; j++)
+ remaining_mem += hp_info[j].hugepage_sz *
+ hp_info[j].num_pages[socket];
+
+			/* is there enough memory elsewhere? if not, allocate one more page and quit */
+ if (remaining_mem < memory[socket]){
+ cur_mem = RTE_MIN(memory[socket],
+ hp_info[i].hugepage_sz);
+ memory[socket] -= cur_mem;
+ total_mem -= cur_mem;
+ hp_used[i].num_pages[socket]++;
+ total_num_pages++;
+				break; /* we are done with this socket */
+ }
+ }
+ /* if we didn't satisfy all memory requirements per socket */
+ if (memory[socket] > 0 &&
+ internal_config.socket_mem[socket] != 0) {
+ /* to prevent icc errors */
+ requested = (unsigned) (internal_config.socket_mem[socket] /
+ 0x100000);
+ available = requested -
+ ((unsigned) (memory[socket] / 0x100000));
+ RTE_LOG(ERR, EAL, "Not enough memory available on socket %u! "
+ "Requested: %uMB, available: %uMB\n", socket,
+ requested, available);
+ return -1;
+ }
+ }
+
+ /* if we didn't satisfy total memory requirements */
+ if (total_mem > 0) {
+ requested = (unsigned) (internal_config.memory / 0x100000);
+ available = requested - (unsigned) (total_mem / 0x100000);
+ RTE_LOG(ERR, EAL, "Not enough memory available! Requested: %uMB,"
+ " available: %uMB\n", requested, available);
+ return -1;
+ }
+ return total_num_pages;
+}
+
+static inline size_t
+eal_get_hugepage_mem_size(void)
+{
+ uint64_t size = 0;
+ unsigned i, j;
+
+ for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
+ struct hugepage_info *hpi = &internal_config.hugepage_info[i];
+ if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0) {
+ for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
+ size += hpi->hugepage_sz * hpi->num_pages[j];
+ }
+ }
+ }
+
+ return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX;
+}
+
+static struct sigaction huge_action_old;
+static int huge_need_recover;
+
+static void
+huge_register_sigbus(void)
+{
+ sigset_t mask;
+ struct sigaction action;
+
+ sigemptyset(&mask);
+ sigaddset(&mask, SIGBUS);
+ action.sa_flags = 0;
+ action.sa_mask = mask;
+ action.sa_handler = huge_sigbus_handler;
+
+ huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
+}
+
+static void
+huge_recover_sigbus(void)
+{
+ if (huge_need_recover) {
+ sigaction(SIGBUS, &huge_action_old, NULL);
+ huge_need_recover = 0;
+ }
+}
+
+/*
+ * Prepare physical memory mapping: fill the configuration structure with
+ * this information and return 0 on success.
+ * 1. map N huge pages in separate files in hugetlbfs
+ * 2. find associated physical addr
+ * 3. find associated NUMA socket ID
+ * 4. sort all huge pages by physical address
+ * 5. remap these N huge pages in the correct order
+ * 6. unmap the first mapping
+ * 7. fill memsegs in configuration with contiguous zones
+ */
+static int
+eal_legacy_hugepage_init(void)
+{
+ struct rte_mem_config *mcfg;
+ struct hugepage_file *hugepage = NULL, *tmp_hp = NULL;
+ struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
+ struct rte_fbarray *arr;
+ struct rte_memseg *ms;
+
+ uint64_t memory[RTE_MAX_NUMA_NODES];
+
+ unsigned hp_offset;
+ int i, j;
+ int nr_hugefiles, nr_hugepages = 0;
+ void *addr;
+
+ memset(used_hp, 0, sizeof(used_hp));
+
+ /* get pointer to global configuration */
+ mcfg = rte_eal_get_configuration()->mem_config;
+
+ /* hugetlbfs can be disabled */
+ if (internal_config.no_hugetlbfs) {
+ void *prealloc_addr;
+ size_t mem_sz;
+ struct rte_memseg_list *msl;
+ int n_segs, cur_seg, fd, flags;
+#ifdef MEMFD_SUPPORTED
+ int memfd;
+#endif
+ uint64_t page_sz;
+
+ /* nohuge mode is legacy mode */
+ internal_config.legacy_mem = 1;
+
+ /* nohuge mode is single-file segments mode */
+ internal_config.single_file_segments = 1;
+
+ /* create a memseg list */
+ msl = &mcfg->memsegs[0];
+
+ page_sz = RTE_PGSIZE_4K;
+ n_segs = internal_config.memory / page_sz;
+
+ if (rte_fbarray_init(&msl->memseg_arr, "nohugemem", n_segs,
+ sizeof(struct rte_memseg))) {
+ RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n");
+ return -1;
+ }
+
+ /* set up parameters for anonymous mmap */
+ fd = -1;
+ flags = MAP_PRIVATE | MAP_ANONYMOUS;
+
+#ifdef MEMFD_SUPPORTED
+ /* create a memfd and store it in the segment fd table */
+ memfd = memfd_create("nohuge", 0);
+ if (memfd < 0) {
+ RTE_LOG(DEBUG, EAL, "Cannot create memfd: %s\n",
+ strerror(errno));
+ RTE_LOG(DEBUG, EAL, "Falling back to anonymous map\n");
+ } else {
+ /* we got an fd - now resize it */
+ if (ftruncate(memfd, internal_config.memory) < 0) {
+ RTE_LOG(ERR, EAL, "Cannot resize memfd: %s\n",
+ strerror(errno));
+ RTE_LOG(ERR, EAL, "Falling back to anonymous map\n");
+ close(memfd);
+ } else {
+ /* creating memfd-backed file was successful.
+ * we want changes to memfd to be visible to
+ * other processes (such as vhost backend), so
+ * map it as shared memory.
+ */
+ RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n");
+ fd = memfd;
+ flags = MAP_SHARED;
+ }
+ }
+#endif
+ /* preallocate address space for the memory, so that it can be
+ * fit into the DMA mask.
+ */
+ mem_sz = internal_config.memory;
+ prealloc_addr = eal_get_virtual_area(
+ NULL, &mem_sz, page_sz, 0, 0);
+ if (prealloc_addr == NULL) {
+ RTE_LOG(ERR, EAL,
+ "%s: reserving memory area failed: "
+ "%s\n",
+ __func__, strerror(errno));
+ return -1;
+ }
+ addr = mmap(prealloc_addr, mem_sz, PROT_READ | PROT_WRITE,
+ flags | MAP_FIXED, fd, 0);
+ if (addr == MAP_FAILED || addr != prealloc_addr) {
+ RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__,
+ strerror(errno));
+ munmap(prealloc_addr, mem_sz);
+ return -1;
+ }
+ msl->base_va = addr;
+ msl->page_sz = page_sz;
+ msl->socket_id = 0;
+ msl->len = mem_sz;
+ msl->heap = 1;
+
+ /* we're in single-file segments mode, so only the segment list
+ * fd needs to be set up.
+ */
+ if (fd != -1) {
+ if (eal_memalloc_set_seg_list_fd(0, fd) < 0) {
+ RTE_LOG(ERR, EAL, "Cannot set up segment list fd\n");
+ /* not a serious error, proceed */
+ }
+ }
+
+ /* populate memsegs. each memseg is one page long */
+ for (cur_seg = 0; cur_seg < n_segs; cur_seg++) {
+ arr = &msl->memseg_arr;
+
+ ms = rte_fbarray_get(arr, cur_seg);
+ if (rte_eal_iova_mode() == RTE_IOVA_VA)
+ ms->iova = (uintptr_t)addr;
+ else
+ ms->iova = RTE_BAD_IOVA;
+ ms->addr = addr;
+ ms->hugepage_sz = page_sz;
+ ms->socket_id = 0;
+ ms->len = page_sz;
+
+ rte_fbarray_set_used(arr, cur_seg);
+
+ addr = RTE_PTR_ADD(addr, (size_t)page_sz);
+ }
+ if (mcfg->dma_maskbits &&
+ rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) {
+ RTE_LOG(ERR, EAL,
+ "%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask.\n",
+ __func__);
+ if (rte_eal_iova_mode() == RTE_IOVA_VA &&
+ rte_eal_using_phys_addrs())
+ RTE_LOG(ERR, EAL,
+ "%s(): Please try initializing EAL with --iova-mode=pa parameter.\n",
+ __func__);
+ goto fail;
+ }
+ return 0;
+ }
+
+	/* calculate total number of hugepages available. at this point we haven't
+	 * yet started sorting them, so they are all on socket 0 */
+ for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
+ /* meanwhile, also initialize used_hp hugepage sizes in used_hp */
+ used_hp[i].hugepage_sz = internal_config.hugepage_info[i].hugepage_sz;
+
+ nr_hugepages += internal_config.hugepage_info[i].num_pages[0];
+ }
+
+ /*
+ * allocate a memory area for hugepage table.
+ * this isn't shared memory yet. due to the fact that we need some
+ * processing done on these pages, shared memory will be created
+ * at a later stage.
+ */
+ tmp_hp = malloc(nr_hugepages * sizeof(struct hugepage_file));
+ if (tmp_hp == NULL)
+ goto fail;
+
+ memset(tmp_hp, 0, nr_hugepages * sizeof(struct hugepage_file));
+
+ hp_offset = 0; /* where we start the current page size entries */
+
+ huge_register_sigbus();
+
+ /* make a copy of socket_mem, needed for balanced allocation. */
+ for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
+ memory[i] = internal_config.socket_mem[i];
+
+ /* map all hugepages and sort them */
+	for (i = 0; i < (int)internal_config.num_hugepage_sizes; i++) {
+ unsigned pages_old, pages_new;
+ struct hugepage_info *hpi;
+
+ /*
+ * we don't yet mark hugepages as used at this stage, so
+ * we just map all hugepages available to the system
+ * all hugepages are still located on socket 0
+ */
+ hpi = &internal_config.hugepage_info[i];
+
+ if (hpi->num_pages[0] == 0)
+ continue;
+
+ /* map all hugepages available */
+ pages_old = hpi->num_pages[0];
+ pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, memory);
+ if (pages_new < pages_old) {
+ RTE_LOG(DEBUG, EAL,
+ "%d not %d hugepages of size %u MB allocated\n",
+ pages_new, pages_old,
+ (unsigned)(hpi->hugepage_sz / 0x100000));
+
+ int pages = pages_old - pages_new;
+
+ nr_hugepages -= pages;
+ hpi->num_pages[0] = pages_new;
+ if (pages_new == 0)
+ continue;
+ }
+
+ if (rte_eal_using_phys_addrs() &&
+ rte_eal_iova_mode() != RTE_IOVA_VA) {
+ /* find physical addresses for each hugepage */
+ if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
+ RTE_LOG(DEBUG, EAL, "Failed to find phys addr "
+ "for %u MB pages\n",
+ (unsigned int)(hpi->hugepage_sz / 0x100000));
+ goto fail;
+ }
+ } else {
+ /* set physical addresses for each hugepage */
+ if (set_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
+ RTE_LOG(DEBUG, EAL, "Failed to set phys addr "
+ "for %u MB pages\n",
+ (unsigned int)(hpi->hugepage_sz / 0x100000));
+ goto fail;
+ }
+ }
+
+ if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0){
+ RTE_LOG(DEBUG, EAL, "Failed to find NUMA socket for %u MB pages\n",
+ (unsigned)(hpi->hugepage_sz / 0x100000));
+ goto fail;
+ }
+
+ qsort(&tmp_hp[hp_offset], hpi->num_pages[0],
+ sizeof(struct hugepage_file), cmp_physaddr);
+
+ /* we have processed a num of hugepages of this size, so inc offset */
+ hp_offset += hpi->num_pages[0];
+ }
+
+ huge_recover_sigbus();
+
+ if (internal_config.memory == 0 && internal_config.force_sockets == 0)
+ internal_config.memory = eal_get_hugepage_mem_size();
+
+ nr_hugefiles = nr_hugepages;
+
+ /* clean out the numbers of pages */
+ for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++)
+ for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
+ internal_config.hugepage_info[i].num_pages[j] = 0;
+
+ /* get hugepages for each socket */
+ for (i = 0; i < nr_hugefiles; i++) {
+ int socket = tmp_hp[i].socket_id;
+
+ /* find a hugepage info with right size and increment num_pages */
+ const int nb_hpsizes = RTE_MIN(MAX_HUGEPAGE_SIZES,
+ (int)internal_config.num_hugepage_sizes);
+ for (j = 0; j < nb_hpsizes; j++) {
+ if (tmp_hp[i].size ==
+ internal_config.hugepage_info[j].hugepage_sz) {
+ internal_config.hugepage_info[j].num_pages[socket]++;
+ }
+ }
+ }
+
+ /* make a copy of socket_mem, needed for number of pages calculation */
+ for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
+ memory[i] = internal_config.socket_mem[i];
+
+ /* calculate final number of pages */
+ nr_hugepages = calc_num_pages_per_socket(memory,
+ internal_config.hugepage_info, used_hp,
+ internal_config.num_hugepage_sizes);
+
+ /* error if not enough memory available */
+ if (nr_hugepages < 0)
+ goto fail;
+
+ /* reporting in! */
+ for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
+ for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
+ if (used_hp[i].num_pages[j] > 0) {
+ RTE_LOG(DEBUG, EAL,
+ "Requesting %u pages of size %uMB"
+ " from socket %i\n",
+ used_hp[i].num_pages[j],
+ (unsigned)
+ (used_hp[i].hugepage_sz / 0x100000),
+ j);
+ }
+ }
+ }
+
+ /* create shared memory */
+ hugepage = create_shared_memory(eal_hugepage_data_path(),
+ nr_hugefiles * sizeof(struct hugepage_file));
+
+ if (hugepage == NULL) {
+ RTE_LOG(ERR, EAL, "Failed to create shared memory!\n");
+ goto fail;
+ }
+ memset(hugepage, 0, nr_hugefiles * sizeof(struct hugepage_file));
+
+ /*
+ * unmap pages that we won't need (looks at used_hp).
+ * also, sets final_va to NULL on pages that were unmapped.
+ */
+ if (unmap_unneeded_hugepages(tmp_hp, used_hp,
+ internal_config.num_hugepage_sizes) < 0) {
+ RTE_LOG(ERR, EAL, "Unmapping and locking hugepages failed!\n");
+ goto fail;
+ }
+
+ /*
+	 * copy the hugepage data from the malloc'd array to actual shared
+	 * memory. this procedure only copies hugepages whose orig_va is
+	 * not NULL, and has overflow protection.
+ */
+ if (copy_hugepages_to_shared_mem(hugepage, nr_hugefiles,
+ tmp_hp, nr_hugefiles) < 0) {
+ RTE_LOG(ERR, EAL, "Copying tables to shared memory failed!\n");
+ goto fail;
+ }
+
+#ifndef RTE_ARCH_64
+ /* for legacy 32-bit mode, we did not preallocate VA space, so do it */
+ if (internal_config.legacy_mem &&
+ prealloc_segments(hugepage, nr_hugefiles)) {
+ RTE_LOG(ERR, EAL, "Could not preallocate VA space for hugepages\n");
+ goto fail;
+ }
+#endif
+
+ /* remap all pages we do need into memseg list VA space, so that those
+ * pages become first-class citizens in DPDK memory subsystem
+ */
+ if (remap_needed_hugepages(hugepage, nr_hugefiles)) {
+ RTE_LOG(ERR, EAL, "Couldn't remap hugepage files into memseg lists\n");
+ goto fail;
+ }
+
+ /* free the hugepage backing files */
+ if (internal_config.hugepage_unlink &&
+ unlink_hugepage_files(tmp_hp, internal_config.num_hugepage_sizes) < 0) {
+ RTE_LOG(ERR, EAL, "Unlinking hugepage files failed!\n");
+ goto fail;
+ }
+
+ /* free the temporary hugepage table */
+ free(tmp_hp);
+ tmp_hp = NULL;
+
+ munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));
+ hugepage = NULL;
+
+ /* we're not going to allocate more pages, so release VA space for
+ * unused memseg lists
+ */
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_memseg_list *msl = &mcfg->memsegs[i];
+ size_t mem_sz;
+
+ /* skip inactive lists */
+ if (msl->base_va == NULL)
+ continue;
+ /* skip lists where there is at least one page allocated */
+ if (msl->memseg_arr.count > 0)
+ continue;
+ /* this is an unused list, deallocate it */
+ mem_sz = msl->len;
+ munmap(msl->base_va, mem_sz);
+ msl->base_va = NULL;
+ msl->heap = 0;
+
+ /* destroy backing fbarray */
+ rte_fbarray_destroy(&msl->memseg_arr);
+ }
+
+ if (mcfg->dma_maskbits &&
+ rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) {
+ RTE_LOG(ERR, EAL,
+ "%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask.\n",
+ __func__);
+ goto fail;
+ }
+
+ return 0;
+
+fail:
+ huge_recover_sigbus();
+ free(tmp_hp);
+ if (hugepage != NULL)
+ munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));
+
+ return -1;
+}
+
+static int __rte_unused
+hugepage_count_walk(const struct rte_memseg_list *msl, void *arg)
+{
+ struct hugepage_info *hpi = arg;
+
+ if (msl->page_sz != hpi->hugepage_sz)
+ return 0;
+
+ hpi->num_pages[msl->socket_id] += msl->memseg_arr.len;
+ return 0;
+}
+
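+/*
+ * Validator callback for the --socket-limit option: unconditionally
+ * reject the allocation, which caps heap growth at the limit this
+ * callback is registered with (see eal_hugepage_init() below).
+ */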
+static int
+limits_callback(int socket_id, size_t cur_limit, size_t new_len)
+{
+ RTE_SET_USED(socket_id);
+ RTE_SET_USED(cur_limit);
+ RTE_SET_USED(new_len);
+ return -1;
+}
+
+static int
+eal_hugepage_init(void)
+{
+ struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
+ uint64_t memory[RTE_MAX_NUMA_NODES];
+ int hp_sz_idx, socket_id;
+
+ memset(used_hp, 0, sizeof(used_hp));
+
+ for (hp_sz_idx = 0;
+ hp_sz_idx < (int) internal_config.num_hugepage_sizes;
+ hp_sz_idx++) {
+#ifndef RTE_ARCH_64
+ struct hugepage_info dummy;
+ unsigned int i;
+#endif
+ /* also initialize used_hp hugepage sizes in used_hp */
+ struct hugepage_info *hpi;
+ hpi = &internal_config.hugepage_info[hp_sz_idx];
+ used_hp[hp_sz_idx].hugepage_sz = hpi->hugepage_sz;
+
+#ifndef RTE_ARCH_64
+ /* for 32-bit, limit number of pages on socket to whatever we've
+ * preallocated, as we cannot allocate more.
+ */
+ memset(&dummy, 0, sizeof(dummy));
+ dummy.hugepage_sz = hpi->hugepage_sz;
+ if (rte_memseg_list_walk(hugepage_count_walk, &dummy) < 0)
+ return -1;
+
+ for (i = 0; i < RTE_DIM(dummy.num_pages); i++) {
+ hpi->num_pages[i] = RTE_MIN(hpi->num_pages[i],
+ dummy.num_pages[i]);
+ }
+#endif
+ }
+
+ /* make a copy of socket_mem, needed for balanced allocation. */
+ for (hp_sz_idx = 0; hp_sz_idx < RTE_MAX_NUMA_NODES; hp_sz_idx++)
+ memory[hp_sz_idx] = internal_config.socket_mem[hp_sz_idx];
+
+ /* calculate final number of pages */
+ if (calc_num_pages_per_socket(memory,
+ internal_config.hugepage_info, used_hp,
+ internal_config.num_hugepage_sizes) < 0)
+ return -1;
+
+ for (hp_sz_idx = 0;
+ hp_sz_idx < (int)internal_config.num_hugepage_sizes;
+ hp_sz_idx++) {
+ for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES;
+ socket_id++) {
+ struct rte_memseg **pages;
+ struct hugepage_info *hpi = &used_hp[hp_sz_idx];
+ unsigned int num_pages = hpi->num_pages[socket_id];
+ unsigned int num_pages_alloc;
+
+ if (num_pages == 0)
+ continue;
+
+ RTE_LOG(DEBUG, EAL, "Allocating %u pages of size %" PRIu64 "M on socket %i\n",
+ num_pages, hpi->hugepage_sz >> 20, socket_id);
+
+ /* we may not be able to allocate all pages in one go,
+ * because we break up our memory map into multiple
+ * memseg lists. therefore, try allocating multiple
+ * times and see if we can get the desired number of
+ * pages from multiple allocations.
+ */
+
+ num_pages_alloc = 0;
+ do {
+ int i, cur_pages, needed;
+
+ needed = num_pages - num_pages_alloc;
+
+				pages = malloc(sizeof(*pages) * needed);
+				if (pages == NULL)
+					return -1;
+
+ /* do not request exact number of pages */
+ cur_pages = eal_memalloc_alloc_seg_bulk(pages,
+ needed, hpi->hugepage_sz,
+ socket_id, false);
+ if (cur_pages <= 0) {
+ free(pages);
+ return -1;
+ }
+
+ /* mark preallocated pages as unfreeable */
+ for (i = 0; i < cur_pages; i++) {
+ struct rte_memseg *ms = pages[i];
+ ms->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
+ }
+ free(pages);
+
+ num_pages_alloc += cur_pages;
+ } while (num_pages_alloc != num_pages);
+ }
+ }
+ /* if socket limits were specified, set them */
+ if (internal_config.force_socket_limits) {
+ unsigned int i;
+ for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
+ uint64_t limit = internal_config.socket_limit[i];
+ if (limit == 0)
+ continue;
+ if (rte_mem_alloc_validator_register("socket-limit",
+ limits_callback, i, limit))
+ RTE_LOG(ERR, EAL, "Failed to register socket limits validator callback\n");
+ }
+ }
+ return 0;
+}
+
+/*
+ * uses fstat to report the size of a file on disk
+ */
+static off_t
+getFileSize(int fd)
+{
+ struct stat st;
+ if (fstat(fd, &st) < 0)
+ return 0;
+ return st.st_size;
+}
+
+/*
+ * This creates the memory mappings in the secondary process to match that of
+ * the server process. It goes through each memory segment in the DPDK runtime
+ * configuration and finds the hugepages which form that segment, mapping them
+ * in order to form a contiguous block in the virtual memory space
+ */
+static int
+eal_legacy_hugepage_attach(void)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct hugepage_file *hp = NULL;
+ unsigned int num_hp = 0;
+ unsigned int i = 0;
+ unsigned int cur_seg;
+ off_t size = 0;
+ int fd, fd_hugepage = -1;
+
+ if (aslr_enabled() > 0) {
+ RTE_LOG(WARNING, EAL, "WARNING: Address Space Layout Randomization "
+ "(ASLR) is enabled in the kernel.\n");
+ RTE_LOG(WARNING, EAL, " This may cause issues with mapping memory "
+ "into secondary processes\n");
+ }
+
+ fd_hugepage = open(eal_hugepage_data_path(), O_RDONLY);
+ if (fd_hugepage < 0) {
+ RTE_LOG(ERR, EAL, "Could not open %s\n",
+ eal_hugepage_data_path());
+ goto error;
+ }
+
+ size = getFileSize(fd_hugepage);
+ hp = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd_hugepage, 0);
+ if (hp == MAP_FAILED) {
+ RTE_LOG(ERR, EAL, "Could not mmap %s\n",
+ eal_hugepage_data_path());
+ goto error;
+ }
+
+ num_hp = size / sizeof(struct hugepage_file);
+ RTE_LOG(DEBUG, EAL, "Analysing %u files\n", num_hp);
+
+ /* map all segments into memory to make sure we get the addrs. the
+ * segments themselves are already in memseg list (which is shared and
+ * has its VA space already preallocated), so we just need to map
+ * everything into correct addresses.
+ */
+ for (i = 0; i < num_hp; i++) {
+ struct hugepage_file *hf = &hp[i];
+ size_t map_sz = hf->size;
+ void *map_addr = hf->final_va;
+ int msl_idx, ms_idx;
+ struct rte_memseg_list *msl;
+ struct rte_memseg *ms;
+
+ /* if size is zero, no more pages left */
+ if (map_sz == 0)
+ break;
+
+ fd = open(hf->filepath, O_RDWR);
+ if (fd < 0) {
+ RTE_LOG(ERR, EAL, "Could not open %s: %s\n",
+ hf->filepath, strerror(errno));
+ goto error;
+ }
+
+ map_addr = mmap(map_addr, map_sz, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_FIXED, fd, 0);
+ if (map_addr == MAP_FAILED) {
+ RTE_LOG(ERR, EAL, "Could not map %s: %s\n",
+ hf->filepath, strerror(errno));
+ goto fd_error;
+ }
+
+ /* set shared lock on the file. */
+ if (flock(fd, LOCK_SH) < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): Locking file failed: %s\n",
+ __func__, strerror(errno));
+ goto mmap_error;
+ }
+
+ /* find segment data */
+ msl = rte_mem_virt2memseg_list(map_addr);
+ if (msl == NULL) {
+ RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg list\n",
+ __func__);
+ goto mmap_error;
+ }
+ ms = rte_mem_virt2memseg(map_addr, msl);
+ if (ms == NULL) {
+ RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg\n",
+ __func__);
+ goto mmap_error;
+ }
+
+ msl_idx = msl - mcfg->memsegs;
+ ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
+ if (ms_idx < 0) {
+ RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg idx\n",
+ __func__);
+ goto mmap_error;
+ }
+
+ /* store segment fd internally */
+ if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0)
+ RTE_LOG(ERR, EAL, "Could not store segment fd: %s\n",
+ rte_strerror(rte_errno));
+ }
+ /* unmap the hugepage config file, since we are done using it */
+ munmap(hp, size);
+ close(fd_hugepage);
+ return 0;
+
+mmap_error:
+ munmap(hp[i].final_va, hp[i].size);
+fd_error:
+ close(fd);
+error:
+ /* unwind mmap's done so far */
+ for (cur_seg = 0; cur_seg < i; cur_seg++)
+ munmap(hp[cur_seg].final_va, hp[cur_seg].size);
+
+ if (hp != NULL && hp != MAP_FAILED)
+ munmap(hp, size);
+ if (fd_hugepage >= 0)
+ close(fd_hugepage);
+ return -1;
+}
+
+static int
+eal_hugepage_attach(void)
+{
+ if (eal_memalloc_sync_with_primary()) {
+ RTE_LOG(ERR, EAL, "Could not map memory from primary process\n");
+ if (aslr_enabled() > 0)
+ RTE_LOG(ERR, EAL, "It is recommended to disable ASLR in the kernel and retry running both primary and secondary processes\n");
+ return -1;
+ }
+ return 0;
+}
+
+int
+rte_eal_hugepage_init(void)
+{
+ return internal_config.legacy_mem ?
+ eal_legacy_hugepage_init() :
+ eal_hugepage_init();
+}
+
+int
+rte_eal_hugepage_attach(void)
+{
+ return internal_config.legacy_mem ?
+ eal_legacy_hugepage_attach() :
+ eal_hugepage_attach();
+}
+
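+/*
+ * Report whether physical addresses are available: probe the physical
+ * address of a local variable, which exercises the pagemap lookup in
+ * rte_mem_virt2phy(), and cache the result in phys_addrs_available.
+ */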
+int
+rte_eal_using_phys_addrs(void)
+{
+ if (phys_addrs_available == -1) {
+ uint64_t tmp = 0;
+
+ if (rte_eal_has_hugepages() != 0 &&
+ rte_mem_virt2phy(&tmp) != RTE_BAD_PHYS_ADDR)
+ phys_addrs_available = 1;
+ else
+ phys_addrs_available = 0;
+ }
+ return phys_addrs_available;
+}
+
+static int __rte_unused
+memseg_primary_init_32(void)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int active_sockets, hpi_idx, msl_idx = 0;
+ unsigned int socket_id, i;
+ struct rte_memseg_list *msl;
+ uint64_t extra_mem_per_socket, total_extra_mem, total_requested_mem;
+ uint64_t max_mem;
+
+ /* no-huge does not need this at all */
+ if (internal_config.no_hugetlbfs)
+ return 0;
+
+ /* this is a giant hack, but desperate times call for desperate
+ * measures. in legacy 32-bit mode, we cannot preallocate VA space,
+ * because having upwards of 2 gigabytes of VA space already mapped will
+ * interfere with our ability to map and sort hugepages.
+ *
+ * therefore, in legacy 32-bit mode, we will be initializing memseg
+ * lists much later - in eal_memory.c, right after we unmap all the
+ * unneeded pages. this will not affect secondary processes, as those
+ * should be able to mmap the space without (too many) problems.
+ */
+ if (internal_config.legacy_mem)
+ return 0;
+
+ /* 32-bit mode is a very special case. we cannot know in advance where
+ * the user will want to allocate their memory, so we have to do some
+ * heuristics.
+ */
+ active_sockets = 0;
+ total_requested_mem = 0;
+ if (internal_config.force_sockets)
+ for (i = 0; i < rte_socket_count(); i++) {
+ uint64_t mem;
+
+ socket_id = rte_socket_id_by_idx(i);
+ mem = internal_config.socket_mem[socket_id];
+
+ if (mem == 0)
+ continue;
+
+ active_sockets++;
+ total_requested_mem += mem;
+ }
+ else
+ total_requested_mem = internal_config.memory;
+
+ max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
+ if (total_requested_mem > max_mem) {
+		RTE_LOG(ERR, EAL, "Invalid parameters: 32-bit process can use at most %uM of memory\n",
+ (unsigned int)(max_mem >> 20));
+ return -1;
+ }
+ total_extra_mem = max_mem - total_requested_mem;
+ extra_mem_per_socket = active_sockets == 0 ? total_extra_mem :
+ total_extra_mem / active_sockets;
+
+ /* the allocation logic is a little bit convoluted, but here's how it
+ * works, in a nutshell:
+ * - if user hasn't specified on which sockets to allocate memory via
+ * --socket-mem, we allocate all of our memory on master core socket.
+ * - if user has specified sockets to allocate memory on, there may be
+ * some "unused" memory left (e.g. if user has specified --socket-mem
+ * such that not all memory adds up to 2 gigabytes), so add it to all
+ * sockets that are in use equally.
+ *
+ * page sizes are sorted by size in descending order, so we can safely
+ * assume that we dispense with bigger page sizes first.
+ */
+
+ /* create memseg lists */
+ for (i = 0; i < rte_socket_count(); i++) {
+ int hp_sizes = (int) internal_config.num_hugepage_sizes;
+ uint64_t max_socket_mem, cur_socket_mem;
+ unsigned int master_lcore_socket;
+ struct rte_config *cfg = rte_eal_get_configuration();
+ bool skip;
+
+ socket_id = rte_socket_id_by_idx(i);
+
+#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
+ /* we can still sort pages by socket in legacy mode */
+ if (!internal_config.legacy_mem && socket_id > 0)
+ break;
+#endif
+
+ /* if we didn't specifically request memory on this socket */
+ skip = active_sockets != 0 &&
+ internal_config.socket_mem[socket_id] == 0;
+ /* ...or if we didn't specifically request memory on *any*
+ * socket, and this is not master lcore
+ */
+ master_lcore_socket = rte_lcore_to_socket_id(cfg->master_lcore);
+ skip |= active_sockets == 0 && socket_id != master_lcore_socket;
+
+ if (skip) {
+ RTE_LOG(DEBUG, EAL, "Will not preallocate memory on socket %u\n",
+ socket_id);
+ continue;
+ }
+
+ /* max amount of memory on this socket */
+ max_socket_mem = (active_sockets != 0 ?
+ internal_config.socket_mem[socket_id] :
+ internal_config.memory) +
+ extra_mem_per_socket;
+ cur_socket_mem = 0;
+
+ for (hpi_idx = 0; hpi_idx < hp_sizes; hpi_idx++) {
+ uint64_t max_pagesz_mem, cur_pagesz_mem = 0;
+ uint64_t hugepage_sz;
+ struct hugepage_info *hpi;
+ int type_msl_idx, max_segs, total_segs = 0;
+
+ hpi = &internal_config.hugepage_info[hpi_idx];
+ hugepage_sz = hpi->hugepage_sz;
+
+ /* check if pages are actually available */
+ if (hpi->num_pages[socket_id] == 0)
+ continue;
+
+ max_segs = RTE_MAX_MEMSEG_PER_TYPE;
+ max_pagesz_mem = max_socket_mem - cur_socket_mem;
+
+ /* make it multiple of page size */
+ max_pagesz_mem = RTE_ALIGN_FLOOR(max_pagesz_mem,
+ hugepage_sz);
+
+ RTE_LOG(DEBUG, EAL, "Attempting to preallocate "
+ "%" PRIu64 "M on socket %i\n",
+ max_pagesz_mem >> 20, socket_id);
+
+ type_msl_idx = 0;
+ while (cur_pagesz_mem < max_pagesz_mem &&
+ total_segs < max_segs) {
+ uint64_t cur_mem;
+ unsigned int n_segs;
+
+ if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
+ RTE_LOG(ERR, EAL,
+ "No more space in memseg lists, please increase %s\n",
+ RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
+ return -1;
+ }
+
+ msl = &mcfg->memsegs[msl_idx];
+
+ cur_mem = get_mem_amount(hugepage_sz,
+ max_pagesz_mem);
+ n_segs = cur_mem / hugepage_sz;
+
+ if (alloc_memseg_list(msl, hugepage_sz, n_segs,
+ socket_id, type_msl_idx)) {
+ /* failing to allocate a memseg list is
+ * a serious error.
+ */
+ RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n");
+ return -1;
+ }
+
+ if (alloc_va_space(msl)) {
+ /* if we couldn't allocate VA space, we
+ * can try with smaller page sizes.
+ */
+ RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list, retrying with different page size\n");
+ /* deallocate memseg list */
+ if (free_memseg_list(msl))
+ return -1;
+ break;
+ }
+
+ total_segs += msl->memseg_arr.len;
+ cur_pagesz_mem = total_segs * hugepage_sz;
+ type_msl_idx++;
+ msl_idx++;
+ }
+ cur_socket_mem += cur_pagesz_mem;
+ }
+ if (cur_socket_mem == 0) {
+ RTE_LOG(ERR, EAL, "Cannot allocate VA space on socket %u\n",
+ socket_id);
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+static int __rte_unused
+memseg_primary_init(void)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ struct memtype {
+ uint64_t page_sz;
+ int socket_id;
+ } *memtypes = NULL;
+ int i, hpi_idx, msl_idx, ret = -1; /* fail unless told to succeed */
+ struct rte_memseg_list *msl;
+ uint64_t max_mem, max_mem_per_type;
+ unsigned int max_seglists_per_type;
+ unsigned int n_memtypes, cur_type;
+
+ /* no-huge does not need this at all */
+ if (internal_config.no_hugetlbfs)
+ return 0;
+
+ /*
+ * figuring out the amount of memory we're going to have is a long and
+ * very involved process. the basic element we're operating with is a
+ * memory type, defined as a combination of NUMA node ID and page size
+ * (so that e.g. 2 sockets with 2 page sizes yield 4 memory types in
+ * total).
+ *
+ * deciding the amount of memory going towards each memory type is a
+ * balancing act between maximum segments per type, maximum memory per
+ * type, and number of detected NUMA nodes. the goal is to make sure
+ * each memory type gets at least one memseg list.
+ *
+ * the total amount of memory is limited by the RTE_MAX_MEM_MB value.
+ *
+ * the total amount of memory per type is limited by either
+ * RTE_MAX_MEM_MB_PER_TYPE, or by RTE_MAX_MEM_MB divided by the number
+ * of detected NUMA nodes. additionally, the maximum number of segments
+ * per type is also limited by RTE_MAX_MEMSEG_PER_TYPE. this is because
+ * for smaller page sizes, it can take hundreds of thousands of segments
+ * to reach the above specified per-type memory limits.
+ *
+ * additionally, each type may have multiple memseg lists associated
+ * with it, each limited by either RTE_MAX_MEM_MB_PER_LIST for bigger
+ * page sizes, or RTE_MAX_MEMSEG_PER_LIST segments for smaller ones.
+ *
+ * the number of memseg lists per type is decided based on the above
+ * limits, also taking the number of detected NUMA nodes into account,
+ * to make sure that we don't run out of memseg lists before we populate
+ * all NUMA nodes with memory.
+ *
+ * we do this in three stages. first, we collect the number of types.
+ * then, we figure out memory constraints and populate the list of
+ * would-be memseg lists. then, we go ahead and allocate the memseg
+ * lists.
+ */
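+ /*
+ * Illustrative example (hypothetical values): 2 NUMA nodes with 2 page
+ * sizes yield 4 memory types, so each type gets at most
+ * min(RTE_MAX_MEM_MB_PER_TYPE, RTE_MAX_MEM_MB / 4) megabytes of memory,
+ * and at most RTE_MAX_MEMSEG_LISTS / 4 memseg lists.
+ */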
+
+ /* create space for mem types */
+ n_memtypes = internal_config.num_hugepage_sizes * rte_socket_count();
+ memtypes = calloc(n_memtypes, sizeof(*memtypes));
+ if (memtypes == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot allocate space for memory types\n");
+ return -1;
+ }
+
+ /* populate mem types */
+ cur_type = 0;
+ for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes;
+ hpi_idx++) {
+ struct hugepage_info *hpi;
+ uint64_t hugepage_sz;
+
+ hpi = &internal_config.hugepage_info[hpi_idx];
+ hugepage_sz = hpi->hugepage_sz;
+
+ for (i = 0; i < (int) rte_socket_count(); i++, cur_type++) {
+ int socket_id = rte_socket_id_by_idx(i);
+
+#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
+ /* we can still sort pages by socket in legacy mode */
+ if (!internal_config.legacy_mem && socket_id > 0)
+ break;
+#endif
+ memtypes[cur_type].page_sz = hugepage_sz;
+ memtypes[cur_type].socket_id = socket_id;
+
+ RTE_LOG(DEBUG, EAL, "Detected memory type: "
+ "socket_id:%u hugepage_sz:%" PRIu64 "\n",
+ socket_id, hugepage_sz);
+ }
+ }
+ /* number of memtypes could have been lower due to no NUMA support */
+ n_memtypes = cur_type;
+
+ /* set up limits for types */
+ max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
+ max_mem_per_type = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20,
+ max_mem / n_memtypes);
+ /*
+ * limit maximum number of segment lists per type to ensure there's
+ * space for memseg lists for all NUMA nodes with all page sizes
+ */
+ max_seglists_per_type = RTE_MAX_MEMSEG_LISTS / n_memtypes;
+
+ if (max_seglists_per_type == 0) {
+ RTE_LOG(ERR, EAL, "Cannot accommodate all memory types, please increase %s\n",
+ RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
+ goto out;
+ }
+
+ /* go through all mem types and create segment lists */
+ msl_idx = 0;
+ for (cur_type = 0; cur_type < n_memtypes; cur_type++) {
+ unsigned int cur_seglist, n_seglists, n_segs;
+ unsigned int max_segs_per_type, max_segs_per_list;
+ struct memtype *type = &memtypes[cur_type];
+ uint64_t max_mem_per_list, pagesz;
+ int socket_id;
+
+ pagesz = type->page_sz;
+ socket_id = type->socket_id;
+
+ /*
+ * we need to create segment lists for this type. we must take
+ * into account the following things:
+ *
+ * 1. total amount of memory we can use for this memory type
+ * 2. total amount of memory per memseg list allowed
+ * 3. number of segments needed to fit the amount of memory
+ * 4. number of segments allowed per type
+ * 5. number of segments allowed per memseg list
+ * 6. number of memseg lists we are allowed to take up
+ */
+
+ /* calculate how many segments we will need in total */
+ max_segs_per_type = max_mem_per_type / pagesz;
+ /* limit number of segments to maximum allowed per type */
+ max_segs_per_type = RTE_MIN(max_segs_per_type,
+ (unsigned int)RTE_MAX_MEMSEG_PER_TYPE);
+ /* limit number of segments to maximum allowed per list */
+ max_segs_per_list = RTE_MIN(max_segs_per_type,
+ (unsigned int)RTE_MAX_MEMSEG_PER_LIST);
+
+ /* calculate how much memory we can have per segment list */
+ max_mem_per_list = RTE_MIN(max_segs_per_list * pagesz,
+ (uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20);
+
+ /* calculate how many segments each segment list will have */
+ n_segs = RTE_MIN(max_segs_per_list, max_mem_per_list / pagesz);
+
+ /* calculate how many segment lists we can have */
+ n_seglists = RTE_MIN(max_segs_per_type / n_segs,
+ max_mem_per_type / max_mem_per_list);
+
+ /* limit number of segment lists according to our maximum */
+ n_seglists = RTE_MIN(n_seglists, max_seglists_per_type);
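+ /* e.g. (illustrative, assuming the per-type and per-list segment caps
+ * are not the binding limits): for 1G pages with 64G allowed per type
+ * and 32G allowed per list, each list holds n_segs = 32 pages, and we
+ * create n_seglists = min(64 / 32, 64G / 32G) = 2 lists, after the
+ * max_seglists_per_type cap applied above.
+ */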
+
+ RTE_LOG(DEBUG, EAL, "Creating %i segment lists: "
+ "n_segs:%i socket_id:%i hugepage_sz:%" PRIu64 "\n",
+ n_seglists, n_segs, socket_id, pagesz);
+
+ /* create all segment lists */
+ for (cur_seglist = 0; cur_seglist < n_seglists; cur_seglist++) {
+ if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
+ RTE_LOG(ERR, EAL,
+ "No more space in memseg lists, please increase %s\n",
+ RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
+ goto out;
+ }
+ msl = &mcfg->memsegs[msl_idx++];
+
+ if (alloc_memseg_list(msl, pagesz, n_segs,
+ socket_id, cur_seglist))
+ goto out;
+
+ if (alloc_va_space(msl)) {
+ RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
+ goto out;
+ }
+ }
+ }
+ /* we're successful */
+ ret = 0;
+out:
+ free(memtypes);
+ return ret;
+}
+
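+/* In secondary processes, memseg lists have already been created by the
+ * primary process; all we need to do here is attach to the shared fbarrays
+ * and reserve matching VA space.
+ */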
+static int
+memseg_secondary_init(void)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int msl_idx = 0;
+ struct rte_memseg_list *msl;
+
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+
+ msl = &mcfg->memsegs[msl_idx];
+
+ /* skip empty memseg lists */
+ if (msl->memseg_arr.len == 0)
+ continue;
+
+ if (rte_fbarray_attach(&msl->memseg_arr)) {
+ RTE_LOG(ERR, EAL, "Cannot attach to primary process memseg lists\n");
+ return -1;
+ }
+
+ /* preallocate VA space */
+ if (alloc_va_space(msl)) {
+ RTE_LOG(ERR, EAL, "Cannot preallocate VA space for hugepage memory\n");
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+int
+rte_eal_memseg_init(void)
+{
+ /* increase rlimit to maximum */
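+ /* (each memseg may end up backed by its own file descriptor, so the
+ * default open-file limit can be too low when using many pages)
+ */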
+ struct rlimit lim;
+
+ if (getrlimit(RLIMIT_NOFILE, &lim) == 0) {
+ /* set limit to maximum */
+ lim.rlim_cur = lim.rlim_max;
+
+ if (setrlimit(RLIMIT_NOFILE, &lim) < 0) {
+ RTE_LOG(DEBUG, EAL, "Setting maximum number of open files failed: %s\n",
+ strerror(errno));
+ } else {
+ RTE_LOG(DEBUG, EAL, "Setting maximum number of open files to %"
+ PRIu64 "\n",
+ (uint64_t)lim.rlim_cur);
+ }
+ } else {
+ RTE_LOG(ERR, EAL, "Cannot get current resource limits\n");
+ }
+#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
+ if (!internal_config.legacy_mem && rte_socket_count() > 1) {
+ RTE_LOG(WARNING, EAL, "DPDK is running on a NUMA system, but is compiled without NUMA support.\n");
+ RTE_LOG(WARNING, EAL, "This will have adverse consequences for performance and usability.\n");
+ RTE_LOG(WARNING, EAL, "Please use --"OPT_LEGACY_MEM" option, or recompile with NUMA support.\n");
+ }
+#endif
+
+ return rte_eal_process_type() == RTE_PROC_PRIMARY ?
+#ifndef RTE_ARCH_64
+ memseg_primary_init_32() :
+#else
+ memseg_primary_init() :
+#endif
+ memseg_secondary_init();
+}
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <sched.h>
+#include <sys/queue.h>
+#include <sys/syscall.h>
+
+#include <rte_debug.h>
+#include <rte_atomic.h>
+#include <rte_launch.h>
+#include <rte_log.h>
+#include <rte_memory.h>
+#include <rte_per_lcore.h>
+#include <rte_eal.h>
+#include <rte_lcore.h>
+
+#include "eal_private.h"
+#include "eal_thread.h"
+
+RTE_DEFINE_PER_LCORE(unsigned, _lcore_id) = LCORE_ID_ANY;
+RTE_DEFINE_PER_LCORE(unsigned, _socket_id) = (unsigned)SOCKET_ID_ANY;
+RTE_DEFINE_PER_LCORE(rte_cpuset_t, _cpuset);
+
+/*
+ * Send a message to a slave lcore identified by slave_id to call a
+ * function f with argument arg. Once the execution is done, the
+ * remote lcore switches to FINISHED state.
+ */
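+/*
+ * Typical usage from the master lcore (sketch; my_func and my_arg are
+ * placeholder names):
+ *
+ *	if (rte_eal_remote_launch(my_func, my_arg, slave_id) == 0)
+ *		ret = rte_eal_wait_lcore(slave_id);
+ *
+ * rte_eal_wait_lcore() blocks until the slave returns to WAIT state and
+ * returns the value my_func returned.
+ */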
+int
+rte_eal_remote_launch(int (*f)(void *), void *arg, unsigned slave_id)
+{
+ int n;
+ char c = 0;
+ int m2s = lcore_config[slave_id].pipe_master2slave[1];
+ int s2m = lcore_config[slave_id].pipe_slave2master[0];
+
+ if (lcore_config[slave_id].state != WAIT)
+ return -EBUSY;
+
+ lcore_config[slave_id].f = f;
+ lcore_config[slave_id].arg = arg;
+
+ /* send message */
+ n = 0;
+ while (n == 0 || (n < 0 && errno == EINTR))
+ n = write(m2s, &c, 1);
+ if (n < 0)
+ rte_panic("cannot write on configuration pipe\n");
+
+ /* wait ack */
+ do {
+ n = read(s2m, &c, 1);
+ } while (n < 0 && errno == EINTR);
+
+ if (n <= 0)
+ rte_panic("cannot read on configuration pipe\n");
+
+ return 0;
+}
+
+/* set affinity for current EAL thread */
+static int
+eal_thread_set_affinity(void)
+{
+ unsigned lcore_id = rte_lcore_id();
+
+ /* acquire system unique id */
+ rte_gettid();
+
+ /* update EAL thread core affinity */
+ return rte_thread_set_affinity(&lcore_config[lcore_id].cpuset);
+}
+
+void eal_thread_init_master(unsigned lcore_id)
+{
+ /* set the lcore ID in per-lcore memory area */
+ RTE_PER_LCORE(_lcore_id) = lcore_id;
+
+ /* set CPU affinity */
+ if (eal_thread_set_affinity() < 0)
+ rte_panic("cannot set affinity\n");
+}
+
+/* main loop of threads */
+__attribute__((noreturn)) void *
+eal_thread_loop(__attribute__((unused)) void *arg)
+{
+ char c;
+ int n, ret;
+ unsigned lcore_id;
+ pthread_t thread_id;
+ int m2s, s2m;
+ char cpuset[RTE_CPU_AFFINITY_STR_LEN];
+
+ thread_id = pthread_self();
+
+ /* retrieve our lcore_id from the configuration structure */
+ RTE_LCORE_FOREACH_SLAVE(lcore_id) {
+ if (thread_id == lcore_config[lcore_id].thread_id)
+ break;
+ }
+ if (lcore_id == RTE_MAX_LCORE)
+ rte_panic("cannot retrieve lcore id\n");
+
+ m2s = lcore_config[lcore_id].pipe_master2slave[0];
+ s2m = lcore_config[lcore_id].pipe_slave2master[1];
+
+ /* set the lcore ID in per-lcore memory area */
+ RTE_PER_LCORE(_lcore_id) = lcore_id;
+
+ /* set CPU affinity */
+ if (eal_thread_set_affinity() < 0)
+ rte_panic("cannot set affinity\n");
+
+ ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset));
+
+ RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%zx;cpuset=[%s%s])\n",
+ lcore_id, (uintptr_t)thread_id, cpuset, ret == 0 ? "" : "...");
+
+ /* read on our pipe to get commands */
+ while (1) {
+ void *fct_arg;
+
+ /* wait command */
+ do {
+ n = read(m2s, &c, 1);
+ } while (n < 0 && errno == EINTR);
+
+ if (n <= 0)
+ rte_panic("cannot read on configuration pipe\n");
+
+ lcore_config[lcore_id].state = RUNNING;
+
+ /* send ack */
+ n = 0;
+ while (n == 0 || (n < 0 && errno == EINTR))
+ n = write(s2m, &c, 1);
+ if (n < 0)
+ rte_panic("cannot write on configuration pipe\n");
+
+ if (lcore_config[lcore_id].f == NULL)
+ rte_panic("NULL function pointer\n");
+
+ /* call the function and store the return value */
+ fct_arg = lcore_config[lcore_id].arg;
+ ret = lcore_config[lcore_id].f(fct_arg);
+ lcore_config[lcore_id].ret = ret;
+ rte_wmb();
+
+ /* when a service core returns, it should go directly to WAIT
+ * state, because the application will not lcore_wait() for it.
+ */
+ if (lcore_config[lcore_id].core_role == ROLE_SERVICE)
+ lcore_config[lcore_id].state = WAIT;
+ else
+ lcore_config[lcore_id].state = FINISHED;
+ }
+
+ /* never reached */
+ /* pthread_exit(NULL); */
+ /* return NULL; */
+}
+
+/* return the tid of the calling thread, as obtained via gettid() */
+int rte_sys_gettid(void)
+{
+ return (int)syscall(SYS_gettid);
+}
+
+int rte_thread_setname(pthread_t id, const char *name)
+{
+ int ret = ENOSYS;
+#if defined(__GLIBC__) && defined(__GLIBC_PREREQ)
+#if __GLIBC_PREREQ(2, 12)
+ ret = pthread_setname_np(id, name);
+#endif
+#endif
+ RTE_SET_USED(id);
+ RTE_SET_USED(name);
+ return -ret;
+}
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation.
+ * Copyright(c) 2012-2013 6WIND S.A.
+ */
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <sys/mman.h>
+#include <sys/queue.h>
+#include <pthread.h>
+#include <errno.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_cycles.h>
+#include <rte_lcore.h>
+#include <rte_memory.h>
+#include <rte_eal.h>
+#include <rte_debug.h>
+
+#include "eal_private.h"
+#include "eal_internal_cfg.h"
+
+enum timer_source eal_timer_source = EAL_TIMER_HPET;
+
+#ifdef RTE_LIBEAL_USE_HPET
+
+#define DEV_HPET "/dev/hpet"
+
+/* Maximum number of counters. */
+#define HPET_TIMER_NUM 3
+
+/* General capabilities register */
+#define CLK_PERIOD_SHIFT 32 /* Clock period shift. */
+#define CLK_PERIOD_MASK 0xffffffff00000000ULL /* Clock period mask. */
+
+/**
+ * HPET timer registers. From the Intel IA-PC HPET (High Precision Event
+ * Timers) Specification.
+ */
+struct eal_hpet_regs {
+ /* Memory-mapped, software visible registers */
+ uint64_t capabilities; /**< RO General Capabilities Register. */
+ uint64_t reserved0; /**< Reserved for future use. */
+ uint64_t config; /**< RW General Configuration Register. */
+ uint64_t reserved1; /**< Reserved for future use. */
+ uint64_t isr; /**< RW Clear General Interrupt Status. */
+ uint64_t reserved2[25]; /**< Reserved for future use. */
+ union {
+ uint64_t counter; /**< RW Main Counter Value Register. */
+ struct {
+ uint32_t counter_l; /**< RW Main Counter Low. */
+ uint32_t counter_h; /**< RW Main Counter High. */
+ };
+ };
+ uint64_t reserved3; /**< Reserved for future use. */
+ struct {
+ uint64_t config; /**< RW Timer Config and Capability Reg. */
+ uint64_t comp; /**< RW Timer Comparator Value Register. */
+ uint64_t fsb; /**< RW FSB Interrupt Route Register. */
+ uint64_t reserved4; /**< Reserved for future use. */
+ } timers[HPET_TIMER_NUM]; /**< Set of HPET timers. */
+};
+
+/* Mmap'd hpet registers */
+static volatile struct eal_hpet_regs *eal_hpet = NULL;
+
+/* Period at which the HPET counter increments in
+ * femtoseconds (10^-15 seconds). */
+static uint32_t eal_hpet_resolution_fs = 0;
+
+/* Frequency of the HPET counter in Hz */
+static uint64_t eal_hpet_resolution_hz = 0;
+
+/* Incremented 4 times during one 32bits hpet full count */
+static uint32_t eal_hpet_msb;
+
+static pthread_t msb_inc_thread_id;
+
+/*
+ * This function runs on a specific thread to update a global variable
+ * containing the MSB of the HPET counter (unfortunately, we need this
+ * because the hpet counter is 32 bits by default under linux).
+ */
+static void *
+hpet_msb_inc(__attribute__((unused)) void *arg)
+{
+ uint32_t t;
+
+ while (1) {
+ t = (eal_hpet->counter_l >> 30);
+ if (t != (eal_hpet_msb & 3))
+ eal_hpet_msb++;
+ sleep(10);
+ }
+ return NULL;
+}
+
+uint64_t
+rte_get_hpet_hz(void)
+{
+ if (internal_config.no_hpet)
+ rte_panic("Error, HPET called, but no HPET present\n");
+
+ return eal_hpet_resolution_hz;
+}
+
+uint64_t
+rte_get_hpet_cycles(void)
+{
+ uint32_t t, msb;
+ uint64_t ret;
+
+ if (internal_config.no_hpet)
+ rte_panic("Error, HPET called, but no HPET present\n");
+
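+ /* reconstruct the number of full 32-bit wraps: eal_hpet_msb advances once
+ * per quarter of a wrap, so dividing by 4 yields full wraps; the
+ * "+ 2 - (t >> 30)" term keeps the result correct even if the updater
+ * thread is up to one quarter-wrap out of sync with the counter.
+ */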
+ t = eal_hpet->counter_l;
+ msb = eal_hpet_msb;
+ ret = (msb + 2 - (t >> 30)) / 4;
+ ret <<= 32;
+ ret += t;
+ return ret;
+}
+
+#endif
+
+#ifdef RTE_LIBEAL_USE_HPET
+/*
+ * Open and mmap /dev/hpet (high precision event timer) that will
+ * provide our time reference.
+ */
+int
+rte_eal_hpet_init(int make_default)
+{
+ int fd, ret;
+
+ if (internal_config.no_hpet) {
+ RTE_LOG(NOTICE, EAL, "HPET is disabled\n");
+ return -1;
+ }
+
+ fd = open(DEV_HPET, O_RDONLY);
+ if (fd < 0) {
+ RTE_LOG(ERR, EAL, "ERROR: Cannot open "DEV_HPET": %s!\n",
+ strerror(errno));
+ internal_config.no_hpet = 1;
+ return -1;
+ }
+ eal_hpet = mmap(NULL, 1024, PROT_READ, MAP_SHARED, fd, 0);
+ if (eal_hpet == MAP_FAILED) {
+ RTE_LOG(ERR, EAL, "ERROR: Cannot mmap "DEV_HPET"!\n"
+ "Please enable CONFIG_HPET_MMAP in your kernel configuration "
+ "to allow HPET support.\n"
+ "To run without using HPET, set CONFIG_RTE_LIBEAL_USE_HPET=n "
+ "in your build configuration or use '--no-hpet' EAL flag.\n");
+ close(fd);
+ internal_config.no_hpet = 1;
+ return -1;
+ }
+ close(fd);
+
+ eal_hpet_resolution_fs = (uint32_t)((eal_hpet->capabilities &
+ CLK_PERIOD_MASK) >>
+ CLK_PERIOD_SHIFT);
+
+ eal_hpet_resolution_hz = (1000ULL*1000ULL*1000ULL*1000ULL*1000ULL) /
+ (uint64_t)eal_hpet_resolution_fs;
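+ /* e.g. a typical HPET period of 69841279 fs corresponds to
+ * 10^15 / 69841279 ~= 14318180 Hz, i.e. the classic 14.318 MHz clock.
+ */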
+
+ RTE_LOG(INFO, EAL, "HPET frequency is ~%"PRIu64" kHz\n",
+ eal_hpet_resolution_hz/1000);
+
+ eal_hpet_msb = (eal_hpet->counter_l >> 30);
+
+ /* create a thread that will increment a global variable for
+ * msb (hpet is 32 bits by default under linux) */
+ ret = rte_ctrl_thread_create(&msb_inc_thread_id, "hpet-msb-inc", NULL,
+ hpet_msb_inc, NULL);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "ERROR: Cannot create HPET timer thread!\n");
+ internal_config.no_hpet = 1;
+ return -1;
+ }
+
+ if (make_default)
+ eal_timer_source = EAL_TIMER_HPET;
+ return 0;
+}
+#endif
+
+uint64_t
+get_tsc_freq(void)
+{
+#ifdef CLOCK_MONOTONIC_RAW
+#define NS_PER_SEC 1E9
+#define CYC_PER_10MHZ 1E7
+
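+ /*
+ * Estimate the TSC frequency by bracketing a ~100ms nanosleep with rdtsc
+ * and CLOCK_MONOTONIC_RAW readings: tsc_hz ~= tsc_delta / elapsed_seconds.
+ */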
+ struct timespec sleeptime = {.tv_nsec = NS_PER_SEC / 10 }; /* 1/10 second */
+
+ struct timespec t_start, t_end;
+ uint64_t tsc_hz;
+
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &t_start) == 0) {
+ uint64_t ns, end, start = rte_rdtsc();
+ nanosleep(&sleeptime, NULL);
+ clock_gettime(CLOCK_MONOTONIC_RAW, &t_end);
+ end = rte_rdtsc();
+ ns = ((t_end.tv_sec - t_start.tv_sec) * NS_PER_SEC);
+ ns += (t_end.tv_nsec - t_start.tv_nsec);
+
+ double secs = (double)ns/NS_PER_SEC;
+ tsc_hz = (uint64_t)((end - start)/secs);
+ /* Round to the nearest multiple of 10MHz. 1E7 ~ 10MHz */
+ return RTE_ALIGN_MUL_NEAR(tsc_hz, CYC_PER_10MHZ);
+ }
+#endif
+ return 0;
+}
+
+int
+rte_eal_timer_init(void)
+{
+
+ eal_timer_source = EAL_TIMER_TSC;
+
+ set_tsc_freq();
+ return 0;
+}
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2018 Intel Corporation
+ */
+
+#include <inttypes.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+
+#include <rte_errno.h>
+#include <rte_log.h>
+#include <rte_memory.h>
+#include <rte_eal_memconfig.h>
+#include <rte_vfio.h>
+
+#include "eal_filesystem.h"
+#include "eal_memcfg.h"
+#include "eal_vfio.h"
+#include "eal_private.h"
+
+#ifdef VFIO_PRESENT
+
+#define VFIO_MEM_EVENT_CLB_NAME "vfio_mem_event_clb"
+
+/* hot plug/unplug of VFIO groups may cause all DMA maps to be dropped. we can
+ * recreate the mappings for DPDK segments, but we cannot do so for memory that
+ * was registered by the user themselves, so we need to store the user mappings
+ * somewhere, to recreate them later.
+ */
+#define VFIO_MAX_USER_MEM_MAPS 256
+struct user_mem_map {
+ uint64_t addr;
+ uint64_t iova;
+ uint64_t len;
+};
+
+struct user_mem_maps {
+ rte_spinlock_recursive_t lock;
+ int n_maps;
+ struct user_mem_map maps[VFIO_MAX_USER_MEM_MAPS];
+};
+
+struct vfio_config {
+ int vfio_enabled;
+ int vfio_container_fd;
+ int vfio_active_groups;
+ const struct vfio_iommu_type *vfio_iommu_type;
+ struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
+ struct user_mem_maps mem_maps;
+};
+
+/* per-process VFIO config */
+static struct vfio_config vfio_cfgs[VFIO_MAX_CONTAINERS];
+static struct vfio_config *default_vfio_cfg = &vfio_cfgs[0];
+
+static int vfio_type1_dma_map(int);
+static int vfio_type1_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
+static int vfio_spapr_dma_map(int);
+static int vfio_spapr_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
+static int vfio_noiommu_dma_map(int);
+static int vfio_noiommu_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
+static int vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr,
+ uint64_t iova, uint64_t len, int do_map);
+
+/* IOMMU types we support */
+static const struct vfio_iommu_type iommu_types[] = {
+ /* x86 IOMMU, otherwise known as type 1 */
+ {
+ .type_id = RTE_VFIO_TYPE1,
+ .name = "Type 1",
+ .dma_map_func = &vfio_type1_dma_map,
+ .dma_user_map_func = &vfio_type1_dma_mem_map
+ },
+ /* ppc64 IOMMU, otherwise known as spapr */
+ {
+ .type_id = RTE_VFIO_SPAPR,
+ .name = "sPAPR",
+ .dma_map_func = &vfio_spapr_dma_map,
+ .dma_user_map_func = &vfio_spapr_dma_mem_map
+ },
+ /* IOMMU-less mode */
+ {
+ .type_id = RTE_VFIO_NOIOMMU,
+ .name = "No-IOMMU",
+ .dma_map_func = &vfio_noiommu_dma_map,
+ .dma_user_map_func = &vfio_noiommu_dma_mem_map
+ },
+};
+
+static int
+is_null_map(const struct user_mem_map *map)
+{
+ return map->addr == 0 && map->iova == 0 && map->len == 0;
+}
+
+/* we may need to merge user mem maps together in case of user mapping/unmapping
+ * chunks of memory, so we'll need a comparator function to sort segments.
+ */
+static int
+user_mem_map_cmp(const void *a, const void *b)
+{
+ const struct user_mem_map *umm_a = a;
+ const struct user_mem_map *umm_b = b;
+
+ /* move null entries to end */
+ if (is_null_map(umm_a))
+ return 1;
+ if (is_null_map(umm_b))
+ return -1;
+
+ /* sort by iova first */
+ if (umm_a->iova < umm_b->iova)
+ return -1;
+ if (umm_a->iova > umm_b->iova)
+ return 1;
+
+ if (umm_a->addr < umm_b->addr)
+ return -1;
+ if (umm_a->addr > umm_b->addr)
+ return 1;
+
+ if (umm_a->len < umm_b->len)
+ return -1;
+ if (umm_a->len > umm_b->len)
+ return 1;
+
+ return 0;
+}
+
+/* adjust a user map entry. this may result in shortening the existing map,
+ * or in splitting the existing map in two pieces.
+ */
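+/*
+ * For example, removing [0x2000, 0x3000) from a map covering
+ * [0x1000, 0x4000) shrinks src to [0x1000, 0x2000) and produces
+ * end = [0x3000, 0x4000).
+ */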
+static void
+adjust_map(struct user_mem_map *src, struct user_mem_map *end,
+ uint64_t remove_va_start, uint64_t remove_len)
+{
+ /* if va start is same as start address, we're simply moving start */
+ if (remove_va_start == src->addr) {
+ src->addr += remove_len;
+ src->iova += remove_len;
+ src->len -= remove_len;
+ } else if (remove_va_start + remove_len == src->addr + src->len) {
+ /* we're shrinking mapping from the end */
+ src->len -= remove_len;
+ } else {
+ /* we're blowing a hole in the middle */
+ struct user_mem_map tmp;
+ uint64_t total_len = src->len;
+
+ /* adjust source segment length */
+ src->len = remove_va_start - src->addr;
+
+ /* create temporary segment in the middle */
+ tmp.addr = src->addr + src->len;
+ tmp.iova = src->iova + src->len;
+ tmp.len = remove_len;
+
+ /* populate end segment - this one we will be keeping */
+ end->addr = tmp.addr + tmp.len;
+ end->iova = tmp.iova + tmp.len;
+ end->len = total_len - src->len - tmp.len;
+ }
+}
+
+/* try merging two maps into one, return 1 if succeeded */
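+/* e.g. [0x1000, 0x2000) followed by [0x2000, 0x3000) with matching IOVAs
+ * merges into [0x1000, 0x3000); the right-hand entry is zeroed out and
+ * becomes a null map.
+ */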
+static int
+merge_map(struct user_mem_map *left, struct user_mem_map *right)
+{
+ if (left->addr + left->len != right->addr)
+ return 0;
+ if (left->iova + left->len != right->iova)
+ return 0;
+
+ left->len += right->len;
+
+ memset(right, 0, sizeof(*right));
+
+ return 1;
+}
+
+static struct user_mem_map *
+find_user_mem_map(struct user_mem_maps *user_mem_maps, uint64_t addr,
+ uint64_t iova, uint64_t len)
+{
+ uint64_t va_end = addr + len;
+ uint64_t iova_end = iova + len;
+ int i;
+
+ for (i = 0; i < user_mem_maps->n_maps; i++) {
+ struct user_mem_map *map = &user_mem_maps->maps[i];
+ uint64_t map_va_end = map->addr + map->len;
+ uint64_t map_iova_end = map->iova + map->len;
+
+ /* check start VA */
+ if (addr < map->addr || addr >= map_va_end)
+ continue;
+ /* check if VA end is within boundaries */
+ if (va_end <= map->addr || va_end > map_va_end)
+ continue;
+
+ /* check start IOVA */
+ if (iova < map->iova || iova >= map_iova_end)
+ continue;
+ /* check if IOVA end is within boundaries */
+ if (iova_end <= map->iova || iova_end > map_iova_end)
+ continue;
+
+ /* we've found our map */
+ return map;
+ }
+ return NULL;
+}
+
+/* this will sort all user maps, and merge/compact any adjacent maps */
+static void
+compact_user_maps(struct user_mem_maps *user_mem_maps)
+{
+ int i, n_merged, cur_idx;
+
+ qsort(user_mem_maps->maps, user_mem_maps->n_maps,
+ sizeof(user_mem_maps->maps[0]), user_mem_map_cmp);
+
+ /* we'll go over the list backwards when merging */
+ n_merged = 0;
+ for (i = user_mem_maps->n_maps - 2; i >= 0; i--) {
+ struct user_mem_map *l, *r;
+
+ l = &user_mem_maps->maps[i];
+ r = &user_mem_maps->maps[i + 1];
+
+ if (is_null_map(l) || is_null_map(r))
+ continue;
+
+ if (merge_map(l, r))
+ n_merged++;
+ }
+
+ /* the entries are still sorted, but now they have holes in them, so
+ * walk through the list and remove the holes
+ */
+ if (n_merged > 0) {
+ cur_idx = 0;
+ for (i = 0; i < user_mem_maps->n_maps; i++) {
+ if (!is_null_map(&user_mem_maps->maps[i])) {
+ struct user_mem_map *src, *dst;
+
+ src = &user_mem_maps->maps[i];
+ dst = &user_mem_maps->maps[cur_idx++];
+
+ if (src != dst) {
+ memcpy(dst, src, sizeof(*src));
+ memset(src, 0, sizeof(*src));
+ }
+ }
+ }
+ user_mem_maps->n_maps = cur_idx;
+ }
+}
+
+static int
+vfio_open_group_fd(int iommu_group_num)
+{
+ int vfio_group_fd;
+ char filename[PATH_MAX];
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply = {0};
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
+
+ /* if primary, try to open the group */
+ if (internal_config.process_type == RTE_PROC_PRIMARY) {
+ /* try regular group format */
+ snprintf(filename, sizeof(filename),
+ VFIO_GROUP_FMT, iommu_group_num);
+ vfio_group_fd = open(filename, O_RDWR);
+ if (vfio_group_fd < 0) {
+ /* if file not found, it's not an error */
+ if (errno != ENOENT) {
+ RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename,
+ strerror(errno));
+ return -1;
+ }
+
+ /* special case: try no-IOMMU path as well */
+ snprintf(filename, sizeof(filename),
+ VFIO_NOIOMMU_GROUP_FMT,
+ iommu_group_num);
+ vfio_group_fd = open(filename, O_RDWR);
+ if (vfio_group_fd < 0) {
+ if (errno != ENOENT) {
+ RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename,
+ strerror(errno));
+ return -1;
+ }
+ return 0;
+ }
+ /* noiommu group found */
+ }
+
+ return vfio_group_fd;
+ }
+ /* if we're in a secondary process, request group fd from the primary
+ * process via mp channel.
+ */
+ p->req = SOCKET_REQ_GROUP;
+ p->group_num = iommu_group_num;
+ strcpy(mp_req.name, EAL_VFIO_MP);
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;
+
+ vfio_group_fd = -1;
+ if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+ vfio_group_fd = mp_rep->fds[0];
+ } else if (p->result == SOCKET_NO_FD) {
+ RTE_LOG(ERR, EAL, " bad VFIO group fd\n");
+ vfio_group_fd = 0;
+ }
+ }
+
+ free(mp_reply.msgs);
+ if (vfio_group_fd < 0)
+ RTE_LOG(ERR, EAL, " cannot request group fd\n");
+ return vfio_group_fd;
+}
+
+static struct vfio_config *
+get_vfio_cfg_by_group_num(int iommu_group_num)
+{
+ struct vfio_config *vfio_cfg;
+ int i, j;
+
+ for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
+ vfio_cfg = &vfio_cfgs[i];
+ for (j = 0; j < VFIO_MAX_GROUPS; j++) {
+ if (vfio_cfg->vfio_groups[j].group_num ==
+ iommu_group_num)
+ return vfio_cfg;
+ }
+ }
+
+ return NULL;
+}
+
+static int
+vfio_get_group_fd(struct vfio_config *vfio_cfg,
+ int iommu_group_num)
+{
+ int i;
+ int vfio_group_fd;
+ struct vfio_group *cur_grp;
+
+ /* check if we already have the group descriptor open */
+ for (i = 0; i < VFIO_MAX_GROUPS; i++)
+ if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num)
+ return vfio_cfg->vfio_groups[i].fd;
+
+ /* Let's first see if there is room for a new group */
+ if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) {
+ RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
+ return -1;
+ }
+
+ /* Now let's get an index for the new group */
+ for (i = 0; i < VFIO_MAX_GROUPS; i++)
+ if (vfio_cfg->vfio_groups[i].group_num == -1) {
+ cur_grp = &vfio_cfg->vfio_groups[i];
+ break;
+ }
+
+ /* This should not happen */
+ if (i == VFIO_MAX_GROUPS) {
+ RTE_LOG(ERR, EAL, "No VFIO group free slot found\n");
+ return -1;
+ }
+
+ vfio_group_fd = vfio_open_group_fd(iommu_group_num);
+ if (vfio_group_fd < 0) {
+ RTE_LOG(ERR, EAL, "Failed to open group %d\n", iommu_group_num);
+ return -1;
+ }
+
+ cur_grp->group_num = iommu_group_num;
+ cur_grp->fd = vfio_group_fd;
+ vfio_cfg->vfio_active_groups++;
+
+ return vfio_group_fd;
+}
+
+static struct vfio_config *
+get_vfio_cfg_by_group_fd(int vfio_group_fd)
+{
+ struct vfio_config *vfio_cfg;
+ int i, j;
+
+ for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
+ vfio_cfg = &vfio_cfgs[i];
+ for (j = 0; j < VFIO_MAX_GROUPS; j++)
+ if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
+ return vfio_cfg;
+ }
+
+ return NULL;
+}
+
+static struct vfio_config *
+get_vfio_cfg_by_container_fd(int container_fd)
+{
+ int i;
+
+ if (container_fd == RTE_VFIO_DEFAULT_CONTAINER_FD)
+ return default_vfio_cfg;
+
+ for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
+ if (vfio_cfgs[i].vfio_container_fd == container_fd)
+ return &vfio_cfgs[i];
+ }
+
+ return NULL;
+}
+
+int
+rte_vfio_get_group_fd(int iommu_group_num)
+{
+ struct vfio_config *vfio_cfg;
+
+ /* get the vfio_config it belongs to */
+ vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
+ vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
+
+ return vfio_get_group_fd(vfio_cfg, iommu_group_num);
+}
+
+static int
+get_vfio_group_idx(int vfio_group_fd)
+{
+ struct vfio_config *vfio_cfg;
+ int i, j;
+
+ for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
+ vfio_cfg = &vfio_cfgs[i];
+ for (j = 0; j < VFIO_MAX_GROUPS; j++)
+ if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
+ return j;
+ }
+
+ return -1;
+}
+
+static void
+vfio_group_device_get(int vfio_group_fd)
+{
+ struct vfio_config *vfio_cfg;
+ int i;
+
+ vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
+ if (vfio_cfg == NULL) {
+ RTE_LOG(ERR, EAL, " invalid group fd!\n");
+ return;
+ }
+
+ i = get_vfio_group_idx(vfio_group_fd);
+ if (i < 0 || i > (VFIO_MAX_GROUPS - 1))
+ RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i);
+ else
+ vfio_cfg->vfio_groups[i].devices++;
+}
+
+static void
+vfio_group_device_put(int vfio_group_fd)
+{
+ struct vfio_config *vfio_cfg;
+ int i;
+
+ vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
+ if (vfio_cfg == NULL) {
+ RTE_LOG(ERR, EAL, " invalid group fd!\n");
+ return;
+ }
+
+ i = get_vfio_group_idx(vfio_group_fd);
+ if (i < 0 || i > (VFIO_MAX_GROUPS - 1))
+ RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i);
+ else
+ vfio_cfg->vfio_groups[i].devices--;
+}
+
+static int
+vfio_group_device_count(int vfio_group_fd)
+{
+ struct vfio_config *vfio_cfg;
+ int i;
+
+ vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
+ if (vfio_cfg == NULL) {
+ RTE_LOG(ERR, EAL, " invalid group fd!\n");
+ return -1;
+ }
+
+ i = get_vfio_group_idx(vfio_group_fd);
+ if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) {
+ RTE_LOG(ERR, EAL, " wrong vfio_group index (%d)\n", i);
+ return -1;
+ }
+
+ return vfio_cfg->vfio_groups[i].devices;
+}
+
+static void
+vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len,
+ void *arg __rte_unused)
+{
+ rte_iova_t iova_start, iova_expected;
+ struct rte_memseg_list *msl;
+ struct rte_memseg *ms;
+ size_t cur_len = 0;
+ uint64_t va_start;
+
+ msl = rte_mem_virt2memseg_list(addr);
+
+ /* for IOVA as VA mode, no need to care for IOVA addresses */
+ if (rte_eal_iova_mode() == RTE_IOVA_VA && msl->external == 0) {
+ uint64_t vfio_va = (uint64_t)(uintptr_t)addr;
+ if (type == RTE_MEM_EVENT_ALLOC)
+ vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va,
+ len, 1);
+ else
+ vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va,
+ len, 0);
+ return;
+ }
+
+#ifdef RTE_ARCH_PPC_64
+ ms = rte_mem_virt2memseg(addr, msl);
+ while (cur_len < len) {
+ int idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
+
+ rte_fbarray_set_free(&msl->memseg_arr, idx);
+ cur_len += ms->len;
+ ++ms;
+ }
+ cur_len = 0;
+#endif
+ /* memseg structs are contiguous in the list, so we can simply iterate */
+ ms = rte_mem_virt2memseg(addr, msl);
+
+ /*
+ * This memory is not guaranteed to be contiguous, but it still could
+ * be, or it could have some small contiguous chunks. Since the number
+ * of VFIO mappings is limited, and VFIO appears to not concatenate
+ * adjacent mappings, we have to do this ourselves.
+ *
+ * So, find contiguous chunks, then map them.
+ */
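+ /* e.g. (illustrative): three 2M segments at IOVA 0x100000000, 0x100200000
+ * and 0x140000000 result in two VFIO mappings: a single 4M mapping for the
+ * first contiguous pair, and a 2M mapping for the last segment.
+ */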
+ va_start = ms->addr_64;
+ iova_start = iova_expected = ms->iova;
+ while (cur_len < len) {
+ bool new_contig_area = ms->iova != iova_expected;
+ bool last_seg = (len - cur_len) == ms->len;
+ bool skip_last = false;
+
+ /* only do mappings when current contiguous area ends */
+ if (new_contig_area) {
+ if (type == RTE_MEM_EVENT_ALLOC)
+ vfio_dma_mem_map(default_vfio_cfg, va_start,
+ iova_start,
+ iova_expected - iova_start, 1);
+ else
+ vfio_dma_mem_map(default_vfio_cfg, va_start,
+ iova_start,
+ iova_expected - iova_start, 0);
+ va_start = ms->addr_64;
+ iova_start = ms->iova;
+ }
+ /* some memory segments may have invalid IOVA */
+ if (ms->iova == RTE_BAD_IOVA) {
+ RTE_LOG(DEBUG, EAL, "Memory segment at %p has bad IOVA, skipping\n",
+ ms->addr);
+ skip_last = true;
+ }
+ iova_expected = ms->iova + ms->len;
+ cur_len += ms->len;
+ ++ms;
+
+ /*
+ * don't count previous segment, and don't attempt to
+ * dereference a potentially invalid pointer.
+ */
+ if (skip_last && !last_seg) {
+ iova_expected = iova_start = ms->iova;
+ va_start = ms->addr_64;
+ } else if (!skip_last && last_seg) {
+ /* this is the last segment and we're not skipping */
+ if (type == RTE_MEM_EVENT_ALLOC)
+ vfio_dma_mem_map(default_vfio_cfg, va_start,
+ iova_start,
+ iova_expected - iova_start, 1);
+ else
+ vfio_dma_mem_map(default_vfio_cfg, va_start,
+ iova_start,
+ iova_expected - iova_start, 0);
+ }
+ }
+#ifdef RTE_ARCH_PPC_64
+ cur_len = 0;
+ ms = rte_mem_virt2memseg(addr, msl);
+ while (cur_len < len) {
+ int idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
+
+ rte_fbarray_set_used(&msl->memseg_arr, idx);
+ cur_len += ms->len;
+ ++ms;
+ }
+#endif
+}
+
+static int
+vfio_sync_default_container(void)
+{
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply = {0};
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
+ int iommu_type_id;
+ unsigned int i;
+
+ /* cannot be called from primary */
+ if (rte_eal_process_type() != RTE_PROC_SECONDARY)
+ return -1;
+
+ /* default container fd should have been opened in rte_vfio_enable() */
+ if (!default_vfio_cfg->vfio_enabled ||
+ default_vfio_cfg->vfio_container_fd < 0) {
+ RTE_LOG(ERR, EAL, "VFIO support is not initialized\n");
+ return -1;
+ }
+
+ /* find default container's IOMMU type */
+ p->req = SOCKET_REQ_IOMMU_TYPE;
+ strcpy(mp_req.name, EAL_VFIO_MP);
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;
+
+ iommu_type_id = -1;
+ if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK)
+ iommu_type_id = p->iommu_type_id;
+ }
+ free(mp_reply.msgs);
+ if (iommu_type_id < 0) {
+ RTE_LOG(ERR, EAL, "Could not get IOMMU type for default container\n");
+ return -1;
+ }
+
+ /* we now have an fd for default container, as well as its IOMMU type.
+ * now, set up default VFIO container config to match.
+ */
+ for (i = 0; i < RTE_DIM(iommu_types); i++) {
+ const struct vfio_iommu_type *t = &iommu_types[i];
+ if (t->type_id != iommu_type_id)
+ continue;
+
+ /* we found our IOMMU type */
+ default_vfio_cfg->vfio_iommu_type = t;
+
+ return 0;
+ }
+ RTE_LOG(ERR, EAL, "Could not find IOMMU type id (%i)\n",
+ iommu_type_id);
+ return -1;
+}
+
+int
+rte_vfio_clear_group(int vfio_group_fd)
+{
+ int i;
+ struct vfio_config *vfio_cfg;
+
+ vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
+ if (vfio_cfg == NULL) {
+ RTE_LOG(ERR, EAL, " invalid group fd!\n");
+ return -1;
+ }
+
+ i = get_vfio_group_idx(vfio_group_fd);
+ if (i < 0)
+ return -1;
+ vfio_cfg->vfio_groups[i].group_num = -1;
+ vfio_cfg->vfio_groups[i].fd = -1;
+ vfio_cfg->vfio_groups[i].devices = 0;
+ vfio_cfg->vfio_active_groups--;
+
+ return 0;
+}
+
+int
+rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
+ int *vfio_dev_fd, struct vfio_device_info *device_info)
+{
+ struct vfio_group_status group_status = {
+ .argsz = sizeof(group_status)
+ };
+ struct vfio_config *vfio_cfg;
+ struct user_mem_maps *user_mem_maps;
+ int vfio_container_fd;
+ int vfio_group_fd;
+ int iommu_group_num;
+ int i, ret;
+
+ /* get group number */
+ ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num);
+ if (ret == 0) {
+ RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n",
+ dev_addr);
+ return 1;
+ }
+
+ /* if negative, something failed */
+ if (ret < 0)
+ return -1;
+
+ /* get the actual group fd */
+ vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num);
+ if (vfio_group_fd < 0)
+ return -1;
+
+ /* if group_fd == 0, that means the device isn't managed by VFIO */
+ if (vfio_group_fd == 0) {
+ RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n",
+ dev_addr);
+ return 1;
+ }
+
+ /*
+ * at this point, we know that this group is viable (meaning, all devices
+ * are either bound to VFIO or not bound to anything)
+ */
+
+ /* check if the group is viable */
+ ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " %s cannot get group status, "
+ "error %i (%s)\n", dev_addr, errno, strerror(errno));
+ close(vfio_group_fd);
+ rte_vfio_clear_group(vfio_group_fd);
+ return -1;
+ } else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
+ RTE_LOG(ERR, EAL, " %s VFIO group is not viable! "
+ "Not all devices in IOMMU group bound to VFIO or unbound\n",
+ dev_addr);
+ close(vfio_group_fd);
+ rte_vfio_clear_group(vfio_group_fd);
+ return -1;
+ }
+
+ /* get the vfio_config it belongs to */
+ vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
+ vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
+ vfio_container_fd = vfio_cfg->vfio_container_fd;
+ user_mem_maps = &vfio_cfg->mem_maps;
+
+ /* check if group does not have a container yet */
+ if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) {
+
+ /* add group to a container */
+ ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER,
+ &vfio_container_fd);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " %s cannot add VFIO group to container, "
+ "error %i (%s)\n", dev_addr, errno, strerror(errno));
+ close(vfio_group_fd);
+ rte_vfio_clear_group(vfio_group_fd);
+ return -1;
+ }
+
+ /*
+ * pick an IOMMU type and set up DMA mappings for container
+ *
+ * needs to be done only once, only when first group is
+ * assigned to a container and only in primary process.
+ * Note this can happen several times with the hotplug
+ * functionality.
+ */
+ if (internal_config.process_type == RTE_PROC_PRIMARY &&
+ vfio_cfg->vfio_active_groups == 1 &&
+ vfio_group_device_count(vfio_group_fd) == 0) {
+ const struct vfio_iommu_type *t;
+
+ /* select an IOMMU type which we will be using */
+ t = vfio_set_iommu_type(vfio_container_fd);
+ if (!t) {
+ RTE_LOG(ERR, EAL,
+ " %s failed to select IOMMU type\n",
+ dev_addr);
+ close(vfio_group_fd);
+ rte_vfio_clear_group(vfio_group_fd);
+ return -1;
+ }
+ /* lock memory hotplug before mapping and release it
+ * after registering callback, to prevent races
+ */
+ rte_mcfg_mem_read_lock();
+ if (vfio_cfg == default_vfio_cfg)
+ ret = t->dma_map_func(vfio_container_fd);
+ else
+ ret = 0;
+ if (ret) {
+ RTE_LOG(ERR, EAL,
+ " %s DMA remapping failed, error %i (%s)\n",
+ dev_addr, errno, strerror(errno));
+ close(vfio_group_fd);
+ rte_vfio_clear_group(vfio_group_fd);
+ rte_mcfg_mem_read_unlock();
+ return -1;
+ }
+
+ vfio_cfg->vfio_iommu_type = t;
+
+ /* re-map all user-mapped segments */
+ rte_spinlock_recursive_lock(&user_mem_maps->lock);
+
+ /* this IOMMU type may not support DMA mapping, but
+ * if we have mappings in the list - that means we have
+ * previously mapped something successfully, so we can
+ * be sure that DMA mapping is supported.
+ */
+ for (i = 0; i < user_mem_maps->n_maps; i++) {
+ struct user_mem_map *map;
+ map = &user_mem_maps->maps[i];
+
+ ret = t->dma_user_map_func(
+ vfio_container_fd,
+ map->addr, map->iova, map->len,
+ 1);
+ if (ret) {
+ RTE_LOG(ERR, EAL, "Couldn't map user memory for DMA: "
+ "va: 0x%" PRIx64 " "
+ "iova: 0x%" PRIx64 " "
+ "len: 0x%" PRIu64 "\n",
+ map->addr, map->iova,
+ map->len);
+ rte_spinlock_recursive_unlock(
+ &user_mem_maps->lock);
+ rte_mcfg_mem_read_unlock();
+ return -1;
+ }
+ }
+ rte_spinlock_recursive_unlock(&user_mem_maps->lock);
+
+ /* register callback for mem events */
+ if (vfio_cfg == default_vfio_cfg)
+ ret = rte_mem_event_callback_register(
+ VFIO_MEM_EVENT_CLB_NAME,
+ vfio_mem_event_callback, NULL);
+ else
+ ret = 0;
+ /* unlock memory hotplug */
+ rte_mcfg_mem_read_unlock();
+
+ if (ret && rte_errno != ENOTSUP) {
+ RTE_LOG(ERR, EAL, "Could not install memory event callback for VFIO\n");
+ return -1;
+ }
+ if (ret)
+ RTE_LOG(DEBUG, EAL, "Memory event callbacks not supported\n");
+ else
+ RTE_LOG(DEBUG, EAL, "Installed memory event callback for VFIO\n");
+ }
+ } else if (rte_eal_process_type() != RTE_PROC_PRIMARY &&
+ vfio_cfg == default_vfio_cfg &&
+ vfio_cfg->vfio_iommu_type == NULL) {
+ /* if we're not a primary process, we do not set up the VFIO
+ * container because it's already been set up by the primary
+ * process. instead, we simply ask the primary about VFIO type
+ * we are using, and set the VFIO config up appropriately.
+ */
+ ret = vfio_sync_default_container();
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, "Could not sync default VFIO container\n");
+ close(vfio_group_fd);
+ rte_vfio_clear_group(vfio_group_fd);
+ return -1;
+ }
+ /* we have successfully initialized VFIO, notify user */
+ const struct vfio_iommu_type *t =
+ default_vfio_cfg->vfio_iommu_type;
+ RTE_LOG(NOTICE, EAL, " using IOMMU type %d (%s)\n",
+ t->type_id, t->name);
+ }
+
+ /* get a file descriptor for the device */
+ *vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr);
+ if (*vfio_dev_fd < 0) {
+ /* if we cannot get a device fd, this implies a problem with
+ * the VFIO group or the container not having IOMMU configured.
+ */
+
+ RTE_LOG(WARNING, EAL, "Getting a vfio_dev_fd for %s failed\n",
+ dev_addr);
+ close(vfio_group_fd);
+ rte_vfio_clear_group(vfio_group_fd);
+ return -1;
+ }
+
+ /* test and setup the device */
+ ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " %s cannot get device info, "
+ "error %i (%s)\n", dev_addr, errno,
+ strerror(errno));
+ close(*vfio_dev_fd);
+ close(vfio_group_fd);
+ rte_vfio_clear_group(vfio_group_fd);
+ return -1;
+ }
+ vfio_group_device_get(vfio_group_fd);
+
+ return 0;
+}
+
+int
+rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
+ int vfio_dev_fd)
+{
+ struct vfio_group_status group_status = {
+ .argsz = sizeof(group_status)
+ };
+ struct vfio_config *vfio_cfg;
+ int vfio_group_fd;
+ int iommu_group_num;
+ int ret;
+
+ /* we don't want any DMA mapping messages to come while we're detaching
+ * VFIO device, because this might be the last device and we might need
+ * to unregister the callback.
+ */
+ rte_mcfg_mem_read_lock();
+
+ /* get group number */
+ ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num);
+ if (ret <= 0) {
+ RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver\n",
+ dev_addr);
+ /* This is an error at this point. */
+ ret = -1;
+ goto out;
+ }
+
+ /* get the actual group fd */
+ vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num);
+ if (vfio_group_fd <= 0) {
+ RTE_LOG(INFO, EAL, "rte_vfio_get_group_fd failed for %s\n",
+ dev_addr);
+ ret = -1;
+ goto out;
+ }
+
+ /* get the vfio_config it belongs to */
+ vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
+ vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
+
+ /* At this point we have an active group. Closing it will detach it from
+ * the container. If this is the last active group, the VFIO kernel code
+ * will unset the container and tear down the IOMMU mappings.
+ */
+
+ /* Closing a device */
+ if (close(vfio_dev_fd) < 0) {
+ RTE_LOG(INFO, EAL, "Error when closing vfio_dev_fd for %s\n",
+ dev_addr);
+ ret = -1;
+ goto out;
+ }
+
+ /* A VFIO group can have several devices attached. Only when no devices
+ * remain should the group be closed.
+ */
+ vfio_group_device_put(vfio_group_fd);
+ if (!vfio_group_device_count(vfio_group_fd)) {
+
+ if (close(vfio_group_fd) < 0) {
+ RTE_LOG(INFO, EAL, "Error when closing vfio_group_fd for %s\n",
+ dev_addr);
+ ret = -1;
+ goto out;
+ }
+
+ if (rte_vfio_clear_group(vfio_group_fd) < 0) {
+ RTE_LOG(INFO, EAL, "Error when clearing group for %s\n",
+ dev_addr);
+ ret = -1;
+ goto out;
+ }
+ }
+
+ /* if there are no active device groups, unregister the callback to
+ * avoid spurious attempts to map/unmap memory from VFIO.
+ */
+ if (vfio_cfg == default_vfio_cfg && vfio_cfg->vfio_active_groups == 0 &&
+ rte_eal_process_type() != RTE_PROC_SECONDARY)
+ rte_mem_event_callback_unregister(VFIO_MEM_EVENT_CLB_NAME,
+ NULL);
+
+ /* success */
+ ret = 0;
+
+out:
+ rte_mcfg_mem_read_unlock();
+ return ret;
+}
+
+int
+rte_vfio_enable(const char *modname)
+{
+ /* initialize group list */
+ int i, j;
+ int vfio_available;
+
+ rte_spinlock_recursive_t lock = RTE_SPINLOCK_RECURSIVE_INITIALIZER;
+
+ for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
+ vfio_cfgs[i].vfio_container_fd = -1;
+ vfio_cfgs[i].vfio_active_groups = 0;
+ vfio_cfgs[i].vfio_iommu_type = NULL;
+ vfio_cfgs[i].mem_maps.lock = lock;
+
+ for (j = 0; j < VFIO_MAX_GROUPS; j++) {
+ vfio_cfgs[i].vfio_groups[j].fd = -1;
+ vfio_cfgs[i].vfio_groups[j].group_num = -1;
+ vfio_cfgs[i].vfio_groups[j].devices = 0;
+ }
+ }
+
+ /* inform the user that we are probing for VFIO */
+ RTE_LOG(INFO, EAL, "Probing VFIO support...\n");
+
+ /* check if vfio module is loaded */
+ vfio_available = rte_eal_check_module(modname);
+
+ /* return error directly */
+ if (vfio_available == -1) {
+ RTE_LOG(INFO, EAL, "Could not get loaded module details!\n");
+ return -1;
+ }
+
+ /* return 0 if VFIO modules not loaded */
+ if (vfio_available == 0) {
+ RTE_LOG(DEBUG, EAL, "VFIO modules not loaded, "
+ "skipping VFIO support...\n");
+ return 0;
+ }
+
+ if (internal_config.process_type == RTE_PROC_PRIMARY) {
+ /* open a new container */
+ default_vfio_cfg->vfio_container_fd =
+ rte_vfio_get_container_fd();
+ } else {
+ /* get the default container from the primary process */
+ default_vfio_cfg->vfio_container_fd =
+ vfio_get_default_container_fd();
+ }
+
+ /* check if we have VFIO driver enabled */
+ if (default_vfio_cfg->vfio_container_fd != -1) {
+ RTE_LOG(NOTICE, EAL, "VFIO support initialized\n");
+ default_vfio_cfg->vfio_enabled = 1;
+ } else {
+ RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n");
+ }
+
+ return 0;
+}
+
+int
+rte_vfio_is_enabled(const char *modname)
+{
+ const int mod_available = rte_eal_check_module(modname) > 0;
+ return default_vfio_cfg->vfio_enabled && mod_available;
+}
+
+int
+vfio_get_default_container_fd(void)
+{
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply = {0};
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
+
+ if (default_vfio_cfg->vfio_enabled)
+ return default_vfio_cfg->vfio_container_fd;
+
+ if (internal_config.process_type == RTE_PROC_PRIMARY) {
+ /* if we were a secondary process, we would try requesting the
+ * container fd from the primary, but we're the primary process,
+ * so just exit here
+ */
+ return -1;
+ }
+
+ p->req = SOCKET_REQ_DEFAULT_CONTAINER;
+ strcpy(mp_req.name, EAL_VFIO_MP);
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;
+
+ if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+ free(mp_reply.msgs);
+ return mp_rep->fds[0];
+ }
+ }
+
+ free(mp_reply.msgs);
+ RTE_LOG(ERR, EAL, " cannot request default container fd\n");
+ return -1;
+}
+
+int
+vfio_get_iommu_type(void)
+{
+ if (default_vfio_cfg->vfio_iommu_type == NULL)
+ return -1;
+
+ return default_vfio_cfg->vfio_iommu_type->type_id;
+}
+
+const struct vfio_iommu_type *
+vfio_set_iommu_type(int vfio_container_fd)
+{
+ unsigned idx;
+ for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
+ const struct vfio_iommu_type *t = &iommu_types[idx];
+
+ int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
+ t->type_id);
+ if (!ret) {
+ RTE_LOG(NOTICE, EAL, " using IOMMU type %d (%s)\n",
+ t->type_id, t->name);
+ return t;
+ }
+ /* not an error, there may be more supported IOMMU types */
+ RTE_LOG(DEBUG, EAL, " set IOMMU type %d (%s) failed, "
+ "error %i (%s)\n", t->type_id, t->name, errno,
+ strerror(errno));
+ }
+ /* if we didn't find a suitable IOMMU type, fail */
+ return NULL;
+}
+
+int
+vfio_has_supported_extensions(int vfio_container_fd)
+{
+ int ret;
+ unsigned idx, n_extensions = 0;
+ for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
+ const struct vfio_iommu_type *t = &iommu_types[idx];
+
+ ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION,
+ t->type_id);
+ if (ret < 0) {
+ RTE_LOG(ERR, EAL, " could not get IOMMU type, "
+ "error %i (%s)\n", errno,
+ strerror(errno));
+ close(vfio_container_fd);
+ return -1;
+ } else if (ret == 1) {
+ /* we found a supported extension */
+ n_extensions++;
+ }
+ RTE_LOG(DEBUG, EAL, " IOMMU type %d (%s) is %s\n",
+ t->type_id, t->name,
+ ret ? "supported" : "not supported");
+ }
+
+ /* if we didn't find any supported IOMMU types, fail */
+ if (!n_extensions) {
+ close(vfio_container_fd);
+ return -1;
+ }
+
+ return 0;
+}
+
+int
+rte_vfio_get_container_fd(void)
+{
+ int ret, vfio_container_fd;
+ struct rte_mp_msg mp_req, *mp_rep;
+ struct rte_mp_reply mp_reply = {0};
+ struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+ struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
+
+
+ /* if we're in a primary process, try to open the container */
+ if (internal_config.process_type == RTE_PROC_PRIMARY) {
+ vfio_container_fd = open(VFIO_CONTAINER_PATH, O_RDWR);
+ if (vfio_container_fd < 0) {
+ RTE_LOG(ERR, EAL, " cannot open VFIO container, "
+ "error %i (%s)\n", errno, strerror(errno));
+ return -1;
+ }
+
+ /* check VFIO API version */
+ ret = ioctl(vfio_container_fd, VFIO_GET_API_VERSION);
+ if (ret != VFIO_API_VERSION) {
+ if (ret < 0)
+ RTE_LOG(ERR, EAL, " could not get VFIO API version, "
+ "error %i (%s)\n", errno, strerror(errno));
+ else
+ RTE_LOG(ERR, EAL, " unsupported VFIO API version!\n");
+ close(vfio_container_fd);
+ return -1;
+ }
+
+ ret = vfio_has_supported_extensions(vfio_container_fd);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " no supported IOMMU "
+ "extensions found!\n");
+ return -1;
+ }
+
+ return vfio_container_fd;
+ }
+ /*
+ * if we're in a secondary process, request container fd from the
+ * primary process via mp channel
+ */
+ p->req = SOCKET_REQ_CONTAINER;
+ strcpy(mp_req.name, EAL_VFIO_MP);
+ mp_req.len_param = sizeof(*p);
+ mp_req.num_fds = 0;
+
+ vfio_container_fd = -1;
+ if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
+ mp_reply.nb_received == 1) {
+ mp_rep = &mp_reply.msgs[0];
+ p = (struct vfio_mp_param *)mp_rep->param;
+ if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+ vfio_container_fd = mp_rep->fds[0];
+ free(mp_reply.msgs);
+ return vfio_container_fd;
+ }
+ }
+
+ free(mp_reply.msgs);
+ RTE_LOG(ERR, EAL, " cannot request container fd\n");
+ return -1;
+}
+
+int
+rte_vfio_get_group_num(const char *sysfs_base,
+ const char *dev_addr, int *iommu_group_num)
+{
+ char linkname[PATH_MAX];
+ char filename[PATH_MAX];
+ char *tok[16], *group_tok, *end;
+ int ret;
+
+ memset(linkname, 0, sizeof(linkname));
+ memset(filename, 0, sizeof(filename));
+
+ /* try to find out IOMMU group for this device */
+ snprintf(linkname, sizeof(linkname),
+ "%s/%s/iommu_group", sysfs_base, dev_addr);
+
+ ret = readlink(linkname, filename, sizeof(filename));
+
+ /* if the link doesn't exist, no VFIO for us */
+ if (ret < 0)
+ return 0;
+
+ ret = rte_strsplit(filename, sizeof(filename),
+ tok, RTE_DIM(tok), '/');
+
+ if (ret <= 0) {
+ RTE_LOG(ERR, EAL, " %s cannot get IOMMU group\n", dev_addr);
+ return -1;
+ }
+
+ /* IOMMU group is always the last token */
+ errno = 0;
+ group_tok = tok[ret - 1];
+ end = group_tok;
+ *iommu_group_num = strtol(group_tok, &end, 10);
+ if (end == group_tok || *end != '\0' || errno != 0) {
+ RTE_LOG(ERR, EAL, " %s error parsing IOMMU number!\n", dev_addr);
+ return -1;
+ }
+
+ return 1;
+}
+
+static int
+type1_map_contig(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
+ size_t len, void *arg)
+{
+ int *vfio_container_fd = arg;
+
+ if (msl->external)
+ return 0;
+
+ return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova,
+ len, 1);
+}
+
+static int
+type1_map(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
+ void *arg)
+{
+ int *vfio_container_fd = arg;
+
+ /* skip external memory that isn't a heap */
+ if (msl->external && !msl->heap)
+ return 0;
+
+ /* skip any segments with invalid IOVA addresses */
+ if (ms->iova == RTE_BAD_IOVA)
+ return 0;
+
+ /* if IOVA mode is VA, we've already mapped the internal segments */
+ if (!msl->external && rte_eal_iova_mode() == RTE_IOVA_VA)
+ return 0;
+
+ return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova,
+ ms->len, 1);
+}
+
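+/* perform a single type1 map or unmap: do_map selects between the
+ * VFIO_IOMMU_MAP_DMA and VFIO_IOMMU_UNMAP_DMA ioctls.
+ */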
+static int
+vfio_type1_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
+ uint64_t len, int do_map)
+{
+ struct vfio_iommu_type1_dma_map dma_map;
+ struct vfio_iommu_type1_dma_unmap dma_unmap;
+ int ret;
+
+ if (do_map != 0) {
+ memset(&dma_map, 0, sizeof(dma_map));
+ dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+ dma_map.vaddr = vaddr;
+ dma_map.size = len;
+ dma_map.iova = iova;
+ dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
+ VFIO_DMA_MAP_FLAG_WRITE;
+
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+ if (ret) {
+ /**
+ * In case the mapping was already done, EEXIST will be
+ * returned from the kernel.
+ */
+ if (errno == EEXIST) {
+ RTE_LOG(DEBUG, EAL,
+ " Memory segment is already mapped,"
+ " skipping\n");
+ } else {
+ RTE_LOG(ERR, EAL,
+ " cannot set up DMA remapping,"
+ " error %i (%s)\n",
+ errno, strerror(errno));
+ return -1;
+ }
+ }
+ } else {
+ memset(&dma_unmap, 0, sizeof(dma_unmap));
+ dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
+ dma_unmap.size = len;
+ dma_unmap.iova = iova;
+
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
+ &dma_unmap);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot clear DMA remapping, error %i (%s)\n",
+ errno, strerror(errno));
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+static int
+vfio_type1_dma_map(int vfio_container_fd)
+{
+ if (rte_eal_iova_mode() == RTE_IOVA_VA) {
+ /* with IOVA as VA mode, we can get away with mapping contiguous
+ * chunks rather than going page-by-page.
+ */
+ int ret = rte_memseg_contig_walk(type1_map_contig,
+ &vfio_container_fd);
+ if (ret)
+ return ret;
+ /* we have to continue the walk because we've skipped the
+ * external segments during the contig walk.
+ */
+ }
+ return rte_memseg_walk(type1_map, &vfio_container_fd);
+}
+
+static int
+vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
+ uint64_t len, int do_map)
+{
+ struct vfio_iommu_type1_dma_map dma_map;
+ struct vfio_iommu_type1_dma_unmap dma_unmap;
+ int ret;
+ struct vfio_iommu_spapr_register_memory reg = {
+ .argsz = sizeof(reg),
+ .flags = 0
+ };
+ reg.vaddr = (uintptr_t) vaddr;
+ reg.size = len;
+
+ if (do_map != 0) {
+ ret = ioctl(vfio_container_fd,
+ VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot register vaddr for IOMMU, "
+ "error %i (%s)\n", errno, strerror(errno));
+ return -1;
+ }
+
+ memset(&dma_map, 0, sizeof(dma_map));
+ dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+ dma_map.vaddr = vaddr;
+ dma_map.size = len;
+ dma_map.iova = iova;
+ dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
+ VFIO_DMA_MAP_FLAG_WRITE;
+
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+ if (ret) {
+ /**
+ * In case the mapping was already done EBUSY will be
+ * returned from kernel.
+ */
+ if (errno == EBUSY) {
+ RTE_LOG(DEBUG, EAL,
+ " Memory segment is already mapped,"
+ " skipping");
+ } else {
+ RTE_LOG(ERR, EAL,
+ " cannot set up DMA remapping,"
+ " error %i (%s)\n", errno,
+ strerror(errno));
+ return -1;
+ }
+ }
+
+ } else {
+ memset(&dma_unmap, 0, sizeof(dma_unmap));
+ dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
+ dma_unmap.size = len;
+ dma_unmap.iova = iova;
+
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
+ &dma_unmap);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot clear DMA remapping, error %i (%s)\n",
+ errno, strerror(errno));
+ return -1;
+ }
+
+ ret = ioctl(vfio_container_fd,
+ VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot unregister vaddr for IOMMU, error %i (%s)\n",
+ errno, strerror(errno));
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+static int
+vfio_spapr_map_walk(const struct rte_memseg_list *msl,
+ const struct rte_memseg *ms, void *arg)
+{
+ int *vfio_container_fd = arg;
+
+ /* skip external memory that isn't a heap */
+ if (msl->external && !msl->heap)
+ return 0;
+
+ /* skip any segments with invalid IOVA addresses */
+ if (ms->iova == RTE_BAD_IOVA)
+ return 0;
+
+ return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova,
+ ms->len, 1);
+}
+
+static int
+vfio_spapr_unmap_walk(const struct rte_memseg_list *msl,
+ const struct rte_memseg *ms, void *arg)
+{
+ int *vfio_container_fd = arg;
+
+ /* skip external memory that isn't a heap */
+ if (msl->external && !msl->heap)
+ return 0;
+
+ /* skip any segments with invalid IOVA addresses */
+ if (ms->iova == RTE_BAD_IOVA)
+ return 0;
+
+ return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova,
+ ms->len, 0);
+}
+
+struct spapr_walk_param {
+ uint64_t window_size;
+ uint64_t hugepage_sz;
+};
+
+static int
+vfio_spapr_window_size_walk(const struct rte_memseg_list *msl,
+ const struct rte_memseg *ms, void *arg)
+{
+ struct spapr_walk_param *param = arg;
+ uint64_t max = ms->iova + ms->len;
+
+ /* skip external memory that isn't a heap */
+ if (msl->external && !msl->heap)
+ return 0;
+
+ /* skip any segments with invalid IOVA addresses */
+ if (ms->iova == RTE_BAD_IOVA)
+ return 0;
+
+ if (max > param->window_size) {
+ param->hugepage_sz = ms->hugepage_sz;
+ param->window_size = max;
+ }
+
+ return 0;
+}
+
+static int
+vfio_spapr_create_new_dma_window(int vfio_container_fd,
+ struct vfio_iommu_spapr_tce_create *create) {
+ struct vfio_iommu_spapr_tce_remove remove = {
+ .argsz = sizeof(remove),
+ };
+ struct vfio_iommu_spapr_tce_info info = {
+ .argsz = sizeof(info),
+ };
+ int ret;
+
+ /* query spapr iommu info */
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot get iommu info, "
+ "error %i (%s)\n", errno, strerror(errno));
+ return -1;
+ }
+
+ /* remove default DMA of 32 bit window */
+ remove.start_addr = info.dma32_window_start;
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot remove default DMA window, "
+ "error %i (%s)\n", errno, strerror(errno));
+ return -1;
+ }
+
+ /* create new DMA window */
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, create);
+ if (ret) {
+#ifdef VFIO_IOMMU_SPAPR_INFO_DDW
+ /* try possible page_shift and levels for workaround */
+ uint32_t levels;
+
+ for (levels = create->levels + 1;
+ ret && levels <= info.ddw.levels; levels++) {
+ create->levels = levels;
+ ret = ioctl(vfio_container_fd,
+ VFIO_IOMMU_SPAPR_TCE_CREATE, create);
+ }
+#endif
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot create new DMA window, "
+ "error %i (%s)\n", errno, strerror(errno));
+ return -1;
+ }
+ }
+
+ if (create->start_addr != 0) {
+ RTE_LOG(ERR, EAL, " DMA window start address != 0\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
+ uint64_t len, int do_map)
+{
+ struct spapr_walk_param param;
+ struct vfio_iommu_spapr_tce_create create = {
+ .argsz = sizeof(create),
+ };
+ struct vfio_config *vfio_cfg;
+ struct user_mem_maps *user_mem_maps;
+ int i, ret = 0;
+
+ vfio_cfg = get_vfio_cfg_by_container_fd(vfio_container_fd);
+ if (vfio_cfg == NULL) {
+ RTE_LOG(ERR, EAL, " invalid container fd!\n");
+ return -1;
+ }
+
+ user_mem_maps = &vfio_cfg->mem_maps;
+ rte_spinlock_recursive_lock(&user_mem_maps->lock);
+
+ /* check if window size needs to be adjusted */
+ memset(&param, 0, sizeof(param));
+
+ /* we're inside a callback so use thread-unsafe version */
+ if (rte_memseg_walk_thread_unsafe(vfio_spapr_window_size_walk,
+ ¶m) < 0) {
+ RTE_LOG(ERR, EAL, "Could not get window size\n");
+ ret = -1;
+ goto out;
+ }
+
+ /* also check user maps */
+ for (i = 0; i < user_mem_maps->n_maps; i++) {
+ uint64_t max = user_mem_maps->maps[i].iova +
+ user_mem_maps->maps[i].len;
+ param.window_size = RTE_MAX(param.window_size, max);
+ }
+
+ /* sPAPR requires window size to be a power of 2 */
+ create.window_size = rte_align64pow2(param.window_size);
+ create.page_shift = __builtin_ctzll(param.hugepage_sz);
+ create.levels = 1;
+
+ if (do_map) {
+ /* re-create window and remap the entire memory */
+ if (iova + len > create.window_size) {
+ /* release all maps before recreating the window */
+ if (rte_memseg_walk_thread_unsafe(vfio_spapr_unmap_walk,
+ &vfio_container_fd) < 0) {
+ RTE_LOG(ERR, EAL, "Could not release DMA maps\n");
+ ret = -1;
+ goto out;
+ }
+ /* release all user maps */
+ for (i = 0; i < user_mem_maps->n_maps; i++) {
+ struct user_mem_map *map =
+ &user_mem_maps->maps[i];
+ if (vfio_spapr_dma_do_map(vfio_container_fd,
+ map->addr, map->iova, map->len,
+ 0)) {
+ RTE_LOG(ERR, EAL, "Could not release user DMA maps\n");
+ ret = -1;
+ goto out;
+ }
+ }
+ create.window_size = rte_align64pow2(iova + len);
+ if (vfio_spapr_create_new_dma_window(vfio_container_fd,
+ &create) < 0) {
+ RTE_LOG(ERR, EAL, "Could not create new DMA window\n");
+ ret = -1;
+ goto out;
+ }
+ /* we're inside a callback, so use thread-unsafe version */
+ if (rte_memseg_walk_thread_unsafe(vfio_spapr_map_walk,
+ &vfio_container_fd) < 0) {
+ RTE_LOG(ERR, EAL, "Could not recreate DMA maps\n");
+ ret = -1;
+ goto out;
+ }
+ /* remap all user maps */
+ for (i = 0; i < user_mem_maps->n_maps; i++) {
+ struct user_mem_map *map =
+ &user_mem_maps->maps[i];
+ if (vfio_spapr_dma_do_map(vfio_container_fd,
+ map->addr, map->iova, map->len,
+ 1)) {
+ RTE_LOG(ERR, EAL, "Could not recreate user DMA maps\n");
+ ret = -1;
+ goto out;
+ }
+ }
+ }
+ if (vfio_spapr_dma_do_map(vfio_container_fd, vaddr, iova, len, 1)) {
+ RTE_LOG(ERR, EAL, "Failed to map DMA\n");
+ ret = -1;
+ goto out;
+ }
+ } else {
+ /* for unmap, check if iova within DMA window */
+ if (iova > create.window_size) {
+ RTE_LOG(ERR, EAL, "iova beyond DMA window for unmap");
+ ret = -1;
+ goto out;
+ }
+
+ vfio_spapr_dma_do_map(vfio_container_fd, vaddr, iova, len, 0);
+ }
+out:
+ rte_spinlock_recursive_unlock(&user_mem_maps->lock);
+ return ret;
+}
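For illustration, here is a minimal, self-contained sketch of the window-sizing arithmetic used above; the byte values are hypothetical:

#include <stdint.h>
#include <rte_common.h>	/* rte_align64pow2() */

static void
spapr_window_math_example(void)
{
	uint64_t window_end = 0x180000000ULL;	/* 6 GiB span, made up */
	uint64_t hugepage_sz = 1ULL << 21;	/* 2 MiB pages */

	/* sPAPR needs a power-of-two window: 6 GiB rounds up to 8 GiB */
	uint64_t window_size = rte_align64pow2(window_end); /* 0x200000000 */

	/* page_shift is log2 of the hugepage size (21 for 2 MiB pages) */
	uint32_t page_shift = __builtin_ctzll(hugepage_sz); /* 21 */

	(void)window_size;
	(void)page_shift;
}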
+
+static int
+vfio_spapr_dma_map(int vfio_container_fd)
+{
+ struct vfio_iommu_spapr_tce_create create = {
+ .argsz = sizeof(create),
+ };
+ struct spapr_walk_param param;
+
+ memset(&param, 0, sizeof(param));
+
+ /* create DMA window from 0 to max(phys_addr + len) */
+ rte_memseg_walk(vfio_spapr_window_size_walk, &param);
+
+ /* sPAPR requires window size to be a power of 2 */
+ create.window_size = rte_align64pow2(param.window_size);
+ create.page_shift = __builtin_ctzll(param.hugepage_sz);
+ create.levels = 1;
+
+ if (vfio_spapr_create_new_dma_window(vfio_container_fd, &create) < 0) {
+ RTE_LOG(ERR, EAL, "Could not create new DMA window\n");
+ return -1;
+ }
+
+ /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
+ if (rte_memseg_walk(vfio_spapr_map_walk, &vfio_container_fd) < 0)
+ return -1;
+
+ return 0;
+}
+
+static int
+vfio_noiommu_dma_map(int __rte_unused vfio_container_fd)
+{
+ /* No-IOMMU mode does not need DMA mapping */
+ return 0;
+}
+
+static int
+vfio_noiommu_dma_mem_map(int __rte_unused vfio_container_fd,
+ uint64_t __rte_unused vaddr,
+ uint64_t __rte_unused iova, uint64_t __rte_unused len,
+ int __rte_unused do_map)
+{
+ /* No-IOMMU mode does not need DMA mapping */
+ return 0;
+}
+
+static int
+vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
+ uint64_t len, int do_map)
+{
+ const struct vfio_iommu_type *t = vfio_cfg->vfio_iommu_type;
+
+ if (!t) {
+ RTE_LOG(ERR, EAL, " VFIO support not initialized\n");
+ rte_errno = ENODEV;
+ return -1;
+ }
+
+ if (!t->dma_user_map_func) {
+ RTE_LOG(ERR, EAL,
+ " VFIO custom DMA region maping not supported by IOMMU %s\n",
+ t->name);
+ rte_errno = ENOTSUP;
+ return -1;
+ }
+
+ return t->dma_user_map_func(vfio_cfg->vfio_container_fd, vaddr, iova,
+ len, do_map);
+}
+
+static int
+container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
+ uint64_t len)
+{
+ struct user_mem_map *new_map;
+ struct user_mem_maps *user_mem_maps;
+ int ret = 0;
+
+ user_mem_maps = &vfio_cfg->mem_maps;
+ rte_spinlock_recursive_lock(&user_mem_maps->lock);
+ if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
+ RTE_LOG(ERR, EAL, "No more space for user mem maps\n");
+ rte_errno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+ /* map the entry */
+ if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 1)) {
+ /* technically, this will fail if there are currently no devices
+ * plugged in, even though the mapping might have succeeded had a
+ * device been added later. however, since we cannot verify that
+ * the mapping is valid without a device attached, consider it
+ * unsupported, because we can't just store any old mapping and
+ * pollute the list of active mappings willy-nilly.
+ */
+ RTE_LOG(ERR, EAL, "Couldn't map new region for DMA\n");
+ ret = -1;
+ goto out;
+ }
+ /* create new user mem map entry */
+ new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
+ new_map->addr = vaddr;
+ new_map->iova = iova;
+ new_map->len = len;
+
+ compact_user_maps(user_mem_maps);
+out:
+ rte_spinlock_recursive_unlock(&user_mem_maps->lock);
+ return ret;
+}
+
+static int
+container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
+ uint64_t len)
+{
+ struct user_mem_map *map, *new_map = NULL;
+ struct user_mem_maps *user_mem_maps;
+ int ret = 0;
+
+ user_mem_maps = &vfio_cfg->mem_maps;
+ rte_spinlock_recursive_lock(&user_mem_maps->lock);
+
+ /* find our mapping */
+ map = find_user_mem_map(user_mem_maps, vaddr, iova, len);
+ if (!map) {
+ RTE_LOG(ERR, EAL, "Couldn't find previously mapped region\n");
+ rte_errno = EINVAL;
+ ret = -1;
+ goto out;
+ }
+ if (map->addr != vaddr || map->iova != iova || map->len != len) {
+ /* we're partially unmapping a previously mapped region, so we
+ * need to split entry into two.
+ */
+ if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
+ RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n");
+ rte_errno = ENOMEM;
+ ret = -1;
+ goto out;
+ }
+ new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
+ }
+
+ /* unmap the entry */
+ if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 0)) {
+ /* there may not be any devices plugged in, so unmapping will
+ * fail with ENODEV/ENOTSUP rte_errno values, but that doesn't
+ * stop us from removing the mapping, as the assumption is we
+ * won't be needing this memory any more and thus will want to
+ * prevent it from being remapped again on hotplug. so, only
+ * fail if we indeed failed to unmap (e.g. if the mapping was
+ * within our mapped range but had invalid alignment).
+ */
+ if (rte_errno != ENODEV && rte_errno != ENOTSUP) {
+ RTE_LOG(ERR, EAL, "Couldn't unmap region for DMA\n");
+ ret = -1;
+ goto out;
+ } else {
+ RTE_LOG(DEBUG, EAL, "DMA unmapping failed, but removing mappings anyway\n");
+ }
+ }
+ /* remove map from the list of active mappings */
+ if (new_map != NULL) {
+ adjust_map(map, new_map, vaddr, len);
+
+ /* if we've created a new map by splitting, sort everything */
+ if (!is_null_map(new_map)) {
+ compact_user_maps(user_mem_maps);
+ } else {
+ /* we've created a new mapping, but it was unused */
+ user_mem_maps->n_maps--;
+ }
+ } else {
+ memset(map, 0, sizeof(*map));
+ compact_user_maps(user_mem_maps);
+ user_mem_maps->n_maps--;
+ }
+
+out:
+ rte_spinlock_recursive_unlock(&user_mem_maps->lock);
+ return ret;
+}
+
+int
+rte_vfio_noiommu_is_enabled(void)
+{
+ int fd;
+ ssize_t cnt;
+ char c;
+
+ fd = open(VFIO_NOIOMMU_MODE, O_RDONLY);
+ if (fd < 0) {
+ if (errno != ENOENT) {
+ RTE_LOG(ERR, EAL, " cannot open vfio noiommu file %i (%s)\n",
+ errno, strerror(errno));
+ return -1;
+ }
+ /*
+ * else the file does not exist,
+ * i.e. noiommu is not enabled
+ */
+ return 0;
+ }
+
+ cnt = read(fd, &c, 1);
+ close(fd);
+ if (cnt != 1) {
+ RTE_LOG(ERR, EAL, " unable to read from vfio noiommu "
+ "file %i (%s)\n", errno, strerror(errno));
+ return -1;
+ }
+
+ return c == 'Y';
+}
+
+int
+rte_vfio_container_create(void)
+{
+ int i;
+
+ /* Find an empty slot to store new vfio config */
+ for (i = 1; i < VFIO_MAX_CONTAINERS; i++) {
+ if (vfio_cfgs[i].vfio_container_fd == -1)
+ break;
+ }
+
+ if (i == VFIO_MAX_CONTAINERS) {
+ RTE_LOG(ERR, EAL, "exceed max vfio container limit\n");
+ return -1;
+ }
+
+ vfio_cfgs[i].vfio_container_fd = rte_vfio_get_container_fd();
+ if (vfio_cfgs[i].vfio_container_fd < 0) {
+ RTE_LOG(NOTICE, EAL, "fail to create a new container\n");
+ return -1;
+ }
+
+ return vfio_cfgs[i].vfio_container_fd;
+}
+
+int
+rte_vfio_container_destroy(int container_fd)
+{
+ struct vfio_config *vfio_cfg;
+ int i;
+
+ vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
+ if (vfio_cfg == NULL) {
+ RTE_LOG(ERR, EAL, "Invalid container fd\n");
+ return -1;
+ }
+
+ for (i = 0; i < VFIO_MAX_GROUPS; i++)
+ if (vfio_cfg->vfio_groups[i].group_num != -1)
+ rte_vfio_container_group_unbind(container_fd,
+ vfio_cfg->vfio_groups[i].group_num);
+
+ close(container_fd);
+ vfio_cfg->vfio_container_fd = -1;
+ vfio_cfg->vfio_active_groups = 0;
+ vfio_cfg->vfio_iommu_type = NULL;
+
+ return 0;
+}
+
+int
+rte_vfio_container_group_bind(int container_fd, int iommu_group_num)
+{
+ struct vfio_config *vfio_cfg;
+
+ vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
+ if (vfio_cfg == NULL) {
+ RTE_LOG(ERR, EAL, "Invalid container fd\n");
+ return -1;
+ }
+
+ return vfio_get_group_fd(vfio_cfg, iommu_group_num);
+}
+
+int
+rte_vfio_container_group_unbind(int container_fd, int iommu_group_num)
+{
+ struct vfio_config *vfio_cfg;
+ struct vfio_group *cur_grp = NULL;
+ int i;
+
+ vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
+ if (vfio_cfg == NULL) {
+ RTE_LOG(ERR, EAL, "Invalid container fd\n");
+ return -1;
+ }
+
+ for (i = 0; i < VFIO_MAX_GROUPS; i++) {
+ if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num) {
+ cur_grp = &vfio_cfg->vfio_groups[i];
+ break;
+ }
+ }
+
+ /* This should not happen */
+ if (i == VFIO_MAX_GROUPS || cur_grp == NULL) {
+ RTE_LOG(ERR, EAL, "Specified group number not found\n");
+ return -1;
+ }
+
+ if (cur_grp->fd >= 0 && close(cur_grp->fd) < 0) {
+ RTE_LOG(ERR, EAL, "Error when closing vfio_group_fd for"
+ " iommu_group_num %d\n", iommu_group_num);
+ return -1;
+ }
+ cur_grp->group_num = -1;
+ cur_grp->fd = -1;
+ cur_grp->devices = 0;
+ vfio_cfg->vfio_active_groups--;
+
+ return 0;
+}
+
+int
+rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova,
+ uint64_t len)
+{
+ struct vfio_config *vfio_cfg;
+
+ if (len == 0) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
+ if (vfio_cfg == NULL) {
+ RTE_LOG(ERR, EAL, "Invalid container fd\n");
+ return -1;
+ }
+
+ return container_dma_map(vfio_cfg, vaddr, iova, len);
+}
+
+int
+rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova,
+ uint64_t len)
+{
+ struct vfio_config *vfio_cfg;
+
+ if (len == 0) {
+ rte_errno = EINVAL;
+ return -1;
+ }
+
+ vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
+ if (vfio_cfg == NULL) {
+ RTE_LOG(ERR, EAL, "Invalid container fd\n");
+ return -1;
+ }
+
+ return container_dma_unmap(vfio_cfg, vaddr, iova, len);
+}
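As a usage sketch of the container API implemented above (hedged: the group number 42 and the caller-chosen IOVA are hypothetical; real code derives the group via rte_vfio_get_group_num() for a specific device):

#include <stdint.h>
#include <rte_vfio.h>

/* sketch: map an externally allocated buffer for DMA in a dedicated
 * VFIO container; returns the container fd, or -1 on error
 */
static int
map_ext_buf_for_dma(void *buf, uint64_t iova, uint64_t len)
{
	int cfd = rte_vfio_container_create();

	if (cfd < 0)
		return -1;
	/* group 42 is a placeholder for the device's IOMMU group */
	if (rte_vfio_container_group_bind(cfd, 42) < 0 ||
			rte_vfio_container_dma_map(cfd,
				(uint64_t)(uintptr_t)buf, iova, len) < 0) {
		rte_vfio_container_destroy(cfd);
		return -1;
	}
	return cfd;
}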
+
+#else
+
+int
+rte_vfio_setup_device(__rte_unused const char *sysfs_base,
+ __rte_unused const char *dev_addr,
+ __rte_unused int *vfio_dev_fd,
+ __rte_unused struct vfio_device_info *device_info)
+{
+ return -1;
+}
+
+int
+rte_vfio_release_device(__rte_unused const char *sysfs_base,
+ __rte_unused const char *dev_addr, __rte_unused int fd)
+{
+ return -1;
+}
+
+int
+rte_vfio_enable(__rte_unused const char *modname)
+{
+ return -1;
+}
+
+int
+rte_vfio_is_enabled(__rte_unused const char *modname)
+{
+ return -1;
+}
+
+int
+rte_vfio_noiommu_is_enabled(void)
+{
+ return -1;
+}
+
+int
+rte_vfio_clear_group(__rte_unused int vfio_group_fd)
+{
+ return -1;
+}
+
+int
+rte_vfio_get_group_num(__rte_unused const char *sysfs_base,
+ __rte_unused const char *dev_addr,
+ __rte_unused int *iommu_group_num)
+{
+ return -1;
+}
+
+int
+rte_vfio_get_container_fd(void)
+{
+ return -1;
+}
+
+int
+rte_vfio_get_group_fd(__rte_unused int iommu_group_num)
+{
+ return -1;
+}
+
+int
+rte_vfio_container_create(void)
+{
+ return -1;
+}
+
+int
+rte_vfio_container_destroy(__rte_unused int container_fd)
+{
+ return -1;
+}
+
+int
+rte_vfio_container_group_bind(__rte_unused int container_fd,
+ __rte_unused int iommu_group_num)
+{
+ return -1;
+}
+
+int
+rte_vfio_container_group_unbind(__rte_unused int container_fd,
+ __rte_unused int iommu_group_num)
+{
+ return -1;
+}
+
+int
+rte_vfio_container_dma_map(__rte_unused int container_fd,
+ __rte_unused uint64_t vaddr,
+ __rte_unused uint64_t iova,
+ __rte_unused uint64_t len)
+{
+ return -1;
+}
+
+int
+rte_vfio_container_dma_unmap(__rte_unused int container_fd,
+ __rte_unused uint64_t vaddr,
+ __rte_unused uint64_t iova,
+ __rte_unused uint64_t len)
+{
+ return -1;
+}
+
+#endif /* VFIO_PRESENT */
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#ifndef EAL_VFIO_H_
+#define EAL_VFIO_H_
+
+#include <rte_common.h>
+
+/*
+ * determine if VFIO is present on the system
+ */
+#if !defined(VFIO_PRESENT) && defined(RTE_EAL_VFIO)
+#include <linux/version.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
+#define VFIO_PRESENT
+#else
+#pragma message("VFIO configured but not supported by this kernel, disabling.")
+#endif /* kernel version >= 3.6.0 */
+#endif /* RTE_EAL_VFIO */
+
+#ifdef VFIO_PRESENT
+
+#include <stdint.h>
+#include <linux/vfio.h>
+
+#define RTE_VFIO_TYPE1 VFIO_TYPE1_IOMMU
+
+#ifndef VFIO_SPAPR_TCE_v2_IOMMU
+#define RTE_VFIO_SPAPR 7
+#define VFIO_IOMMU_SPAPR_REGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 17)
+#define VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 18)
+#define VFIO_IOMMU_SPAPR_TCE_CREATE _IO(VFIO_TYPE, VFIO_BASE + 19)
+#define VFIO_IOMMU_SPAPR_TCE_REMOVE _IO(VFIO_TYPE, VFIO_BASE + 20)
+
+struct vfio_iommu_spapr_register_memory {
+ uint32_t argsz;
+ uint32_t flags;
+ uint64_t vaddr;
+ uint64_t size;
+};
+
+struct vfio_iommu_spapr_tce_create {
+ uint32_t argsz;
+ uint32_t flags;
+ /* in */
+ uint32_t page_shift;
+ uint32_t __resv1;
+ uint64_t window_size;
+ uint32_t levels;
+ uint32_t __resv2;
+ /* out */
+ uint64_t start_addr;
+};
+
+struct vfio_iommu_spapr_tce_remove {
+ uint32_t argsz;
+ uint32_t flags;
+ /* in */
+ uint64_t start_addr;
+};
+
+struct vfio_iommu_spapr_tce_ddw_info {
+ uint64_t pgsizes;
+ uint32_t max_dynamic_windows_supported;
+ uint32_t levels;
+};
+
+/* SPAPR_v2 is not present, but SPAPR might be */
+#ifndef VFIO_SPAPR_TCE_IOMMU
+#define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
+
+struct vfio_iommu_spapr_tce_info {
+ uint32_t argsz;
+ uint32_t flags;
+ uint32_t dma32_window_start;
+ uint32_t dma32_window_size;
+ struct vfio_iommu_spapr_tce_ddw_info ddw;
+};
+#endif /* VFIO_SPAPR_TCE_IOMMU */
+
+#else /* VFIO_SPAPR_TCE_v2_IOMMU */
+#define RTE_VFIO_SPAPR VFIO_SPAPR_TCE_v2_IOMMU
+#endif
+
+#define VFIO_MAX_GROUPS RTE_MAX_VFIO_GROUPS
+#define VFIO_MAX_CONTAINERS RTE_MAX_VFIO_CONTAINERS
+
+/*
+ * we don't need to store device fd's anywhere since they can be obtained from
+ * the group fd via an ioctl() call.
+ */
+struct vfio_group {
+ int group_num;
+ int fd;
+ int devices;
+};
+
+/* DMA mapping function prototype.
+ * Takes VFIO container fd as a parameter.
+ * Returns 0 on success, -1 on error.
+ */
+typedef int (*vfio_dma_func_t)(int);
+
+/* Custom memory region DMA mapping function prototype.
+ * Takes VFIO container fd, virtual address, physical address, length and
+ * operation type (0 to unmap, 1 to map) as parameters.
+ * Returns 0 on success, -1 on error.
+ */
+typedef int (*vfio_dma_user_func_t)(int fd, uint64_t vaddr, uint64_t iova,
+ uint64_t len, int do_map);
+
+struct vfio_iommu_type {
+ int type_id;
+ const char *name;
+ vfio_dma_user_func_t dma_user_map_func;
+ vfio_dma_func_t dma_map_func;
+};
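For context, eal_vfio.c fills this ops table in roughly the following shape (a sketch built from the type 1 and no-IOMMU handlers defined there; see that file for the authoritative list):

static const struct vfio_iommu_type iommu_types[] = {
	/* x86 IOMMU, otherwise known as type 1 */
	{
		.type_id = RTE_VFIO_TYPE1,
		.name = "Type 1",
		.dma_map_func = &vfio_type1_dma_map,
		.dma_user_map_func = &vfio_type1_dma_mem_map
	},
	/* IOMMU-less mode */
	{
		.type_id = RTE_VFIO_NOIOMMU,
		.name = "No-IOMMU",
		.dma_map_func = &vfio_noiommu_dma_map,
		.dma_user_map_func = &vfio_noiommu_dma_mem_map
	},
};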
+
+/* get the vfio container that devices are bound to by default */
+int vfio_get_default_container_fd(void);
+
+/* pick IOMMU type. returns a pointer to vfio_iommu_type or NULL for error */
+const struct vfio_iommu_type *
+vfio_set_iommu_type(int vfio_container_fd);
+
+int
+vfio_get_iommu_type(void);
+
+/* check if we have any supported extensions */
+int
+vfio_has_supported_extensions(int vfio_container_fd);
+
+int vfio_mp_sync_setup(void);
+
+#define EAL_VFIO_MP "eal_vfio_mp_sync"
+
+#define SOCKET_REQ_CONTAINER 0x100
+#define SOCKET_REQ_GROUP 0x200
+#define SOCKET_REQ_DEFAULT_CONTAINER 0x400
+#define SOCKET_REQ_IOMMU_TYPE 0x800
+#define SOCKET_OK 0x0
+#define SOCKET_NO_FD 0x1
+#define SOCKET_ERR 0xFF
+
+struct vfio_mp_param {
+ int req;
+ int result;
+ RTE_STD_C11
+ union {
+ int group_num;
+ int iommu_type_id;
+ };
+};
+
+#endif /* VFIO_PRESENT */
+
+#endif /* EAL_VFIO_H_ */
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2018 Intel Corporation
+ */
+
+#include <unistd.h>
+#include <string.h>
+
+#include <rte_compat.h>
+#include <rte_errno.h>
+#include <rte_log.h>
+#include <rte_vfio.h>
+#include <rte_eal.h>
+
+#include "eal_vfio.h"
+
+/**
+ * @file
+ * VFIO socket for communication between primary and secondary processes.
+ *
+ * This file is only compiled if CONFIG_RTE_EAL_VFIO is set to "y".
+ */
+
+#ifdef VFIO_PRESENT
+
+static int
+vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer)
+{
+ int fd = -1;
+ int ret;
+ struct rte_mp_msg reply;
+ struct vfio_mp_param *r = (struct vfio_mp_param *)reply.param;
+ const struct vfio_mp_param *m =
+ (const struct vfio_mp_param *)msg->param;
+
+ if (msg->len_param != sizeof(*m)) {
+ RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
+ return -1;
+ }
+
+ memset(&reply, 0, sizeof(reply));
+
+ switch (m->req) {
+ case SOCKET_REQ_GROUP:
+ r->req = SOCKET_REQ_GROUP;
+ r->group_num = m->group_num;
+ fd = rte_vfio_get_group_fd(m->group_num);
+ if (fd < 0)
+ r->result = SOCKET_ERR;
+ else if (fd == 0)
+ /* if VFIO group exists but isn't bound to VFIO driver */
+ r->result = SOCKET_NO_FD;
+ else {
+ /* if group exists and is bound to VFIO driver */
+ r->result = SOCKET_OK;
+ reply.num_fds = 1;
+ reply.fds[0] = fd;
+ }
+ break;
+ case SOCKET_REQ_CONTAINER:
+ r->req = SOCKET_REQ_CONTAINER;
+ fd = rte_vfio_get_container_fd();
+ if (fd < 0)
+ r->result = SOCKET_ERR;
+ else {
+ r->result = SOCKET_OK;
+ reply.num_fds = 1;
+ reply.fds[0] = fd;
+ }
+ break;
+ case SOCKET_REQ_DEFAULT_CONTAINER:
+ r->req = SOCKET_REQ_DEFAULT_CONTAINER;
+ fd = vfio_get_default_container_fd();
+ if (fd < 0)
+ r->result = SOCKET_ERR;
+ else {
+ r->result = SOCKET_OK;
+ reply.num_fds = 1;
+ reply.fds[0] = fd;
+ }
+ break;
+ case SOCKET_REQ_IOMMU_TYPE:
+ {
+ int iommu_type_id;
+
+ r->req = SOCKET_REQ_IOMMU_TYPE;
+
+ iommu_type_id = vfio_get_iommu_type();
+
+ if (iommu_type_id < 0)
+ r->result = SOCKET_ERR;
+ else {
+ r->iommu_type_id = iommu_type_id;
+ r->result = SOCKET_OK;
+ }
+ break;
+ }
+ default:
+ RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
+ return -1;
+ }
+
+ strcpy(reply.name, EAL_VFIO_MP);
+ reply.len_param = sizeof(*r);
+
+ ret = rte_mp_reply(&reply, peer);
+ if (m->req == SOCKET_REQ_CONTAINER && fd >= 0)
+ close(fd);
+ return ret;
+}
+
+int
+vfio_mp_sync_setup(void)
+{
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ int ret = rte_mp_action_register(EAL_VFIO_MP, vfio_mp_primary);
+ if (ret && rte_errno != ENOTSUP)
+ return -1;
+ }
+
+ return 0;
+}
+
+#endif
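For reference, a hedged sketch of the secondary-process side of this protocol: requesting a group fd from the primary over the EAL IPC channel (error handling trimmed; modelled on the request path in eal_vfio.c):

#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <rte_eal.h>
#include "eal_vfio.h"

/* sketch: ask the primary process for the fd of a VFIO group */
static int
request_group_fd(int iommu_group_num)
{
	struct rte_mp_msg mp_req;
	struct rte_mp_reply mp_reply = {0};
	struct timespec ts = { .tv_sec = 5, .tv_nsec = 0 };
	struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
	int fd = -1;

	memset(&mp_req, 0, sizeof(mp_req));
	strcpy(mp_req.name, EAL_VFIO_MP);
	mp_req.len_param = sizeof(*p);
	p->req = SOCKET_REQ_GROUP;
	p->group_num = iommu_group_num;

	if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
			mp_reply.nb_received == 1) {
		struct vfio_mp_param *r =
			(struct vfio_mp_param *)mp_reply.msgs[0].param;

		if (r->result == SOCKET_OK)
			fd = mp_reply.msgs[0].fds[0];
		free(mp_reply.msgs);
	}
	return fd;
}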
--- /dev/null
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright 2020 Mellanox Technologies, Ltd
+
+includes += include_directories('.')
+
+headers += files(
+ 'rte_kni_common.h',
+ 'rte_os.h',
+)
--- /dev/null
+/* SPDX-License-Identifier: (BSD-3-Clause OR LGPL-2.1) */
+/*
+ * Copyright(c) 2007-2014 Intel Corporation.
+ */
+
+#ifndef _RTE_KNI_COMMON_H_
+#define _RTE_KNI_COMMON_H_
+
+#ifdef __KERNEL__
+#include <linux/if.h>
+#include <asm/barrier.h>
+#define RTE_STD_C11
+#else
+#include <rte_common.h>
+#include <rte_config.h>
+#endif
+
+/*
+ * KNI name is part of memzone name. Must not exceed IFNAMSIZ.
+ */
+#define RTE_KNI_NAMESIZE 16
+
+#define RTE_CACHE_LINE_MIN_SIZE 64
+
+/*
+ * Request id.
+ */
+enum rte_kni_req_id {
+ RTE_KNI_REQ_UNKNOWN = 0,
+ RTE_KNI_REQ_CHANGE_MTU,
+ RTE_KNI_REQ_CFG_NETWORK_IF,
+ RTE_KNI_REQ_CHANGE_MAC_ADDR,
+ RTE_KNI_REQ_CHANGE_PROMISC,
+ RTE_KNI_REQ_CHANGE_ALLMULTI,
+ RTE_KNI_REQ_MAX,
+};
+
+/*
+ * Structure for KNI request.
+ */
+struct rte_kni_request {
+ uint32_t req_id; /**< Request id */
+ RTE_STD_C11
+ union {
+ uint32_t new_mtu; /**< New MTU */
+ uint8_t if_up; /**< 1: interface up, 0: interface down */
+ uint8_t mac_addr[6]; /**< MAC address for interface */
+ uint8_t promiscusity;/**< 1: promisc mode enable, 0: disable */
+ uint8_t allmulti; /**< 1: all-multicast mode enable, 0: disable */
+ };
+ int32_t result; /**< Result for processing request */
+} __attribute__((__packed__));
+
+/*
+ * FIFO struct mapped in shared memory. It describes a circular buffer FIFO.
+ * Write and read wrap around; the FIFO is empty when write == read.
+ * Writing must never overwrite the read position.
+ */
+struct rte_kni_fifo {
+#ifdef RTE_USE_C11_MEM_MODEL
+ unsigned write; /**< Next position to be written*/
+ unsigned read; /**< Next position to be read */
+#else
+ volatile unsigned write; /**< Next position to be written*/
+ volatile unsigned read; /**< Next position to be read */
+#endif
+ unsigned len; /**< Circular buffer length */
+ unsigned elem_size; /**< Pointer size - for 32/64 bit OS */
+ void *volatile buffer[]; /**< The buffer contains mbuf pointers */
+};
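Because the enqueue discipline is only described in the comment above, here is a hedged sketch of the userspace put logic it implies (mirroring the kni library's fifo helper; fifo_put is an illustrative name):

/* sketch: enqueue up to num pointers; the buffer holds at most
 * len - 1 elements so that write == read always means "empty"
 */
static unsigned int
fifo_put(struct rte_kni_fifo *fifo, void **data, unsigned int num)
{
	unsigned int i;
	unsigned int fifo_write = fifo->write;
	unsigned int new_write = fifo_write;
	unsigned int fifo_read = fifo->read;

	for (i = 0; i < num; i++) {
		new_write = (new_write + 1) % fifo->len;
		if (new_write == fifo_read)
			break;	/* full: would overwrite the read position */
		fifo->buffer[fifo_write] = data[i];
		fifo_write = new_write;
	}
	fifo->write = fifo_write;
	return i;	/* number of elements actually enqueued */
}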
+
+/*
+ * The kernel image of the rte_mbuf struct, with only the relevant fields.
+ * Padding is necessary to preserve the offsets of these fields.
+ */
+struct rte_kni_mbuf {
+ void *buf_addr __attribute__((__aligned__(RTE_CACHE_LINE_SIZE)));
+ uint64_t buf_physaddr;
+ uint16_t data_off; /**< Start address of data in segment buffer. */
+ char pad1[2];
+ uint16_t nb_segs; /**< Number of segments. */
+ char pad4[2];
+ uint64_t ol_flags; /**< Offload features. */
+ char pad2[4];
+ uint32_t pkt_len; /**< Total pkt len: sum of all segment data_len. */
+ uint16_t data_len; /**< Amount of data in segment buffer. */
+
+ /* fields on second cache line */
+ char pad3[8] __attribute__((__aligned__(RTE_CACHE_LINE_MIN_SIZE)));
+ void *pool;
+ void *next; /**< Physical address of next mbuf in kernel. */
+};
+
+/*
+ * Struct used to create a KNI device. Passed to the kernel in an IOCTL call.
+ */
+
+struct rte_kni_device_info {
+ char name[RTE_KNI_NAMESIZE]; /**< Network device name for KNI */
+
+ phys_addr_t tx_phys;
+ phys_addr_t rx_phys;
+ phys_addr_t alloc_phys;
+ phys_addr_t free_phys;
+
+ /* Used by Ethtool */
+ phys_addr_t req_phys;
+ phys_addr_t resp_phys;
+ phys_addr_t sync_phys;
+ void * sync_va;
+
+ /* mbuf mempool */
+ void * mbuf_va;
+ phys_addr_t mbuf_phys;
+
+ uint16_t group_id; /**< Group ID */
+ uint32_t core_id; /**< core ID to bind for kernel thread */
+
+ __extension__
+ uint8_t force_bind : 1; /**< Flag for kernel thread binding */
+
+ /* mbuf size */
+ unsigned mbuf_size;
+ unsigned int mtu;
+ unsigned int min_mtu;
+ unsigned int max_mtu;
+ uint8_t mac_addr[6];
+ uint8_t iova_mode;
+};
+
+#define KNI_DEVICE "kni"
+
+#define RTE_KNI_IOCTL_TEST _IOWR(0, 1, int)
+#define RTE_KNI_IOCTL_CREATE _IOWR(0, 2, struct rte_kni_device_info)
+#define RTE_KNI_IOCTL_RELEASE _IOWR(0, 3, struct rte_kni_device_info)
+
+#endif /* _RTE_KNI_COMMON_H_ */
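As a hedged sketch of how these definitions are consumed from userspace, the kni library opens the device node and issues the create ioctl roughly like this (error handling simplified):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>

/* sketch: create a KNI interface via the kernel module */
static int
kni_create_example(const struct rte_kni_device_info *info)
{
	int fd = open("/dev/" KNI_DEVICE, O_RDWR);

	if (fd < 0)
		return -1;
	if (ioctl(fd, RTE_KNI_IOCTL_CREATE, info) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}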
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2019 Intel Corporation
+ */
+
+#ifndef _RTE_OS_H_
+#define _RTE_OS_H_
+
+/**
+ * This header should contain any function/macro definitions
+ * that are not supported natively, or are named differently,
+ * in the Linux OS. Functions will be added in future releases.
+ */
+
+#include <sched.h>
+
+typedef cpu_set_t rte_cpuset_t;
+#define RTE_CPU_AND(dst, src1, src2) CPU_AND(dst, src1, src2)
+#define RTE_CPU_OR(dst, src1, src2) CPU_OR(dst, src1, src2)
+#define RTE_CPU_FILL(set) do \
+{ \
+ unsigned int i; \
+ CPU_ZERO(set); \
+ for (i = 0; i < CPU_SETSIZE; i++) \
+ CPU_SET(i, set); \
+} while (0)
+#define RTE_CPU_NOT(dst, src) do \
+{ \
+ cpu_set_t tmp; \
+ RTE_CPU_FILL(&tmp); \
+ CPU_XOR(dst, &tmp, src); \
+} while (0)
+
+#endif /* _RTE_OS_H_ */
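A small usage sketch of the helpers above (self-contained, assuming only this header is included):

static void
cpuset_example(void)
{
	rte_cpuset_t used, unused;

	CPU_ZERO(&used);
	CPU_SET(0, &used);		/* mark CPU 0 as taken */
	RTE_CPU_NOT(&unused, &used);	/* unused = every CPU except 0 */
	(void)unused;
}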
--- /dev/null
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2017 Intel Corporation
+
+subdir('include')
+
+sources += files('eal_alarm.c',
+ 'eal_cpuflags.c',
+ 'eal_debug.c',
+ 'eal_hugepage_info.c',
+ 'eal_interrupts.c',
+ 'eal_memalloc.c',
+ 'eal_lcore.c',
+ 'eal_log.c',
+ 'eal_thread.c',
+ 'eal_timer.c',
+ 'eal_vfio.c',
+ 'eal_vfio_mp_sync.c',
+ 'eal.c',
+ 'eal_memory.c',
+ 'eal_dev.c',
+)
+
+deps += ['kvargs']
+if has_libnuma == 1
+ dpdk_conf.set10('RTE_EAL_NUMA_AWARE_HUGEPAGES', true)
+endif
# SPDX-License-Identifier: BSD-3-Clause
# Copyright(c) 2017-2019 Intel Corporation
-# Custom EAL processing. EAL is complicated enough that it can't just
-# have a straight list of headers and source files.
-# Initially pull in common settings
-eal_inc = [global_inc]
+includes += global_inc
subdir('include')
subdir('common')
-# Now do OS/exec-env specific settings, including building kernel modules
-# The <exec-env>/eal/meson.build file should define env_sources, etc.
dpdk_conf.set('RTE_EXEC_ENV_' + exec_env.to_upper(), 1)
-subdir(exec_env + '/eal')
+subdir(exec_env)
subdir(arch_subdir)
if cc.has_header('getopt.h')
cflags += ['-DHAVE_GETOPT_H', '-DHAVE_GETOPT', '-DHAVE_GETOPT_LONG']
endif
-sources += env_sources
-objs = env_objs
-headers += env_headers
-includes += eal_inc
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Intel Corporation
+ */
+
+#include <fcntl.h>
+#include <io.h>
+#include <share.h>
+#include <sys/stat.h>
+
+#include <rte_debug.h>
+#include <rte_eal.h>
+#include <eal_memcfg.h>
+#include <rte_errno.h>
+#include <rte_lcore.h>
+#include <eal_thread.h>
+#include <eal_internal_cfg.h>
+#include <eal_filesystem.h>
+#include <eal_options.h>
+#include <eal_private.h>
+
+ /* Allow the application to print its usage message too if set */
+static rte_usage_hook_t rte_application_usage_hook;
+
+/* define fd variable here, because file needs to be kept open for the
+ * duration of the program, as we hold a write lock on it in the primary proc
+ */
+static int mem_cfg_fd = -1;
+
+/* early configuration structure, when memory config is not mmapped */
+static struct rte_mem_config early_mem_config;
+
+/* Address of global and public configuration */
+static struct rte_config rte_config = {
+ .mem_config = &early_mem_config,
+};
+
+/* internal configuration (per-core) */
+struct lcore_config lcore_config[RTE_MAX_LCORE];
+
+/* internal configuration */
+struct internal_config internal_config;
+
+/* platform-specific runtime dir */
+static char runtime_dir[PATH_MAX];
+
+const char *
+rte_eal_get_runtime_dir(void)
+{
+ return runtime_dir;
+}
+
+/* Return a pointer to the configuration structure */
+struct rte_config *
+rte_eal_get_configuration(void)
+{
+ return &rte_config;
+}
+
+/* Detect if we are a primary or a secondary process */
+enum rte_proc_type_t
+eal_proc_type_detect(void)
+{
+ enum rte_proc_type_t ptype = RTE_PROC_PRIMARY;
+ const char *pathname = eal_runtime_config_path();
+
+ /* if we can open the file but not get a write-lock we are a secondary
+ * process. NOTE: if we get a file handle back, we keep that open
+ * and don't close it to prevent a race condition between multiple opens
+ */
+ errno_t err = _sopen_s(&mem_cfg_fd, pathname,
+ _O_RDWR, _SH_DENYNO, _S_IREAD | _S_IWRITE);
+ if (err == 0) {
+ OVERLAPPED soverlapped = { 0 };
+ soverlapped.Offset = sizeof(*rte_config.mem_config);
+ soverlapped.OffsetHigh = 0;
+
+ HANDLE hwinfilehandle = (HANDLE)_get_osfhandle(mem_cfg_fd);
+
+ if (!LockFileEx(hwinfilehandle,
+ LOCKFILE_EXCLUSIVE_LOCK | LOCKFILE_FAIL_IMMEDIATELY, 0,
+ sizeof(*rte_config.mem_config), 0, &soverlapped))
+ ptype = RTE_PROC_SECONDARY;
+ }
+
+ RTE_LOG(INFO, EAL, "Auto-detected process type: %s\n",
+ ptype == RTE_PROC_PRIMARY ? "PRIMARY" : "SECONDARY");
+
+ return ptype;
+}
+
+/* display usage */
+static void
+eal_usage(const char *prgname)
+{
+ printf("\nUsage: %s ", prgname);
+ eal_common_usage();
+ /* Allow the application to print its usage message too
+ * if hook is set
+ */
+ if (rte_application_usage_hook) {
+ printf("===== Application Usage =====\n\n");
+ rte_application_usage_hook(prgname);
+ }
+}
+
+/* Parse the arguments for --log-level only */
+static void
+eal_log_level_parse(int argc, char **argv)
+{
+ int opt;
+ char **argvopt;
+ int option_index;
+
+ argvopt = argv;
+
+ eal_reset_internal_config(&internal_config);
+
+ while ((opt = getopt_long(argc, argvopt, eal_short_options,
+ eal_long_options, &option_index)) != EOF) {
+
+ int ret;
+
+ /* getopt is not happy, stop right now */
+ if (opt == '?')
+ break;
+
+ ret = (opt == OPT_LOG_LEVEL_NUM) ?
+ eal_parse_common_option(opt, optarg,
+ &internal_config) : 0;
+
+ /* common parser is not happy */
+ if (ret < 0)
+ break;
+ }
+
+ optind = 0; /* reset getopt lib */
+}
+
+/* Parse the argument given in the command line of the application */
+__attribute__((optnone)) static int
+eal_parse_args(int argc, char **argv)
+{
+ int opt, ret;
+ char **argvopt;
+ int option_index;
+ char *prgname = argv[0];
+
+ argvopt = argv;
+
+ while ((opt = getopt_long(argc, argvopt, eal_short_options,
+ eal_long_options, &option_index)) != EOF) {
+
+ int ret;
+
+ /* getopt is not happy, stop right now */
+ if (opt == '?') {
+ eal_usage(prgname);
+ return -1;
+ }
+
+ ret = eal_parse_common_option(opt, optarg, &internal_config);
+ /* common parser is not happy */
+ if (ret < 0) {
+ eal_usage(prgname);
+ return -1;
+ }
+ /* common parser handled this option */
+ if (ret == 0)
+ continue;
+
+ switch (opt) {
+ case 'h':
+ eal_usage(prgname);
+ exit(EXIT_SUCCESS);
+ default:
+ if (opt < OPT_LONG_MIN_NUM && isprint(opt)) {
+ RTE_LOG(ERR, EAL, "Option %c is not supported "
+ "on Windows\n", opt);
+ } else if (opt >= OPT_LONG_MIN_NUM &&
+ opt < OPT_LONG_MAX_NUM) {
+ RTE_LOG(ERR, EAL, "Option %s is not supported "
+ "on Windows\n",
+ eal_long_options[option_index].name);
+ } else {
+ RTE_LOG(ERR, EAL, "Option %d is not supported "
+ "on Windows\n", opt);
+ }
+ eal_usage(prgname);
+ return -1;
+ }
+ }
+
+ if (eal_adjust_config(&internal_config) != 0)
+ return -1;
+
+ /* sanity checks */
+ if (eal_check_common_options(&internal_config) != 0) {
+ eal_usage(prgname);
+ return -1;
+ }
+
+ if (optind >= 0)
+ argv[optind - 1] = prgname;
+ ret = optind - 1;
+ optind = 0; /* reset getopt lib */
+ return ret;
+}
+
+static int
+sync_func(void *arg __rte_unused)
+{
+ return 0;
+}
+
+static void
+rte_eal_init_alert(const char *msg)
+{
+ fprintf(stderr, "EAL: FATAL: %s\n", msg);
+ RTE_LOG(ERR, EAL, "%s\n", msg);
+}
+
+ /* Launch threads, called at application init(). */
+int
+rte_eal_init(int argc, char **argv)
+{
+ int i, fctret;
+
+ eal_log_level_parse(argc, argv);
+
+ /* create a map of all processors in the system */
+ eal_create_cpu_map();
+
+ if (rte_eal_cpu_init() < 0) {
+ rte_eal_init_alert("Cannot detect lcores.");
+ rte_errno = ENOTSUP;
+ return -1;
+ }
+
+ fctret = eal_parse_args(argc, argv);
+ if (fctret < 0)
+ exit(1);
+
+ eal_thread_init_master(rte_config.master_lcore);
+
+ RTE_LCORE_FOREACH_SLAVE(i) {
+
+ /*
+ * create communication pipes between master thread
+ * and children
+ */
+ if (_pipe(lcore_config[i].pipe_master2slave,
+ sizeof(char), _O_BINARY) < 0)
+ rte_panic("Cannot create pipe\n");
+ if (_pipe(lcore_config[i].pipe_slave2master,
+ sizeof(char), _O_BINARY) < 0)
+ rte_panic("Cannot create pipe\n");
+
+ lcore_config[i].state = WAIT;
+
+ /* create a thread for each lcore */
+ if (eal_thread_create(&lcore_config[i].thread_id) != 0)
+ rte_panic("Cannot create thread\n");
+ }
+
+ /*
+ * Launch a dummy function on all slave lcores, so that master lcore
+ * knows they are all ready when this function returns.
+ */
+ rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER);
+ rte_eal_mp_wait_lcore();
+ return fctret;
+}
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2019 Intel Corporation
- */
-
-#include <fcntl.h>
-#include <io.h>
-#include <share.h>
-#include <sys/stat.h>
-
-#include <rte_debug.h>
-#include <rte_eal.h>
-#include <eal_memcfg.h>
-#include <rte_errno.h>
-#include <rte_lcore.h>
-#include <eal_thread.h>
-#include <eal_internal_cfg.h>
-#include <eal_filesystem.h>
-#include <eal_options.h>
-#include <eal_private.h>
-
- /* Allow the application to print its usage message too if set */
-static rte_usage_hook_t rte_application_usage_hook;
-
-/* define fd variable here, because file needs to be kept open for the
- * duration of the program, as we hold a write lock on it in the primary proc
- */
-static int mem_cfg_fd = -1;
-
-/* early configuration structure, when memory config is not mmapped */
-static struct rte_mem_config early_mem_config;
-
-/* Address of global and public configuration */
-static struct rte_config rte_config = {
- .mem_config = &early_mem_config,
-};
-
-/* internal configuration (per-core) */
-struct lcore_config lcore_config[RTE_MAX_LCORE];
-
-/* internal configuration */
-struct internal_config internal_config;
-
-/* platform-specific runtime dir */
-static char runtime_dir[PATH_MAX];
-
-const char *
-rte_eal_get_runtime_dir(void)
-{
- return runtime_dir;
-}
-
-/* Return a pointer to the configuration structure */
-struct rte_config *
-rte_eal_get_configuration(void)
-{
- return &rte_config;
-}
-
-/* Detect if we are a primary or a secondary process */
-enum rte_proc_type_t
-eal_proc_type_detect(void)
-{
- enum rte_proc_type_t ptype = RTE_PROC_PRIMARY;
- const char *pathname = eal_runtime_config_path();
-
- /* if we can open the file but not get a write-lock we are a secondary
- * process. NOTE: if we get a file handle back, we keep that open
- * and don't close it to prevent a race condition between multiple opens
- */
- errno_t err = _sopen_s(&mem_cfg_fd, pathname,
- _O_RDWR, _SH_DENYNO, _S_IREAD | _S_IWRITE);
- if (err == 0) {
- OVERLAPPED soverlapped = { 0 };
- soverlapped.Offset = sizeof(*rte_config.mem_config);
- soverlapped.OffsetHigh = 0;
-
- HANDLE hwinfilehandle = (HANDLE)_get_osfhandle(mem_cfg_fd);
-
- if (!LockFileEx(hwinfilehandle,
- LOCKFILE_EXCLUSIVE_LOCK | LOCKFILE_FAIL_IMMEDIATELY, 0,
- sizeof(*rte_config.mem_config), 0, &soverlapped))
- ptype = RTE_PROC_SECONDARY;
- }
-
- RTE_LOG(INFO, EAL, "Auto-detected process type: %s\n",
- ptype == RTE_PROC_PRIMARY ? "PRIMARY" : "SECONDARY");
-
- return ptype;
-}
-
-/* display usage */
-static void
-eal_usage(const char *prgname)
-{
- printf("\nUsage: %s ", prgname);
- eal_common_usage();
- /* Allow the application to print its usage message too
- * if hook is set
- */
- if (rte_application_usage_hook) {
- printf("===== Application Usage =====\n\n");
- rte_application_usage_hook(prgname);
- }
-}
-
-/* Parse the arguments for --log-level only */
-static void
-eal_log_level_parse(int argc, char **argv)
-{
- int opt;
- char **argvopt;
- int option_index;
-
- argvopt = argv;
-
- eal_reset_internal_config(&internal_config);
-
- while ((opt = getopt_long(argc, argvopt, eal_short_options,
- eal_long_options, &option_index)) != EOF) {
-
- int ret;
-
- /* getopt is not happy, stop right now */
- if (opt == '?')
- break;
-
- ret = (opt == OPT_LOG_LEVEL_NUM) ?
- eal_parse_common_option(opt, optarg,
- &internal_config) : 0;
-
- /* common parser is not happy */
- if (ret < 0)
- break;
- }
-
- optind = 0; /* reset getopt lib */
-}
-
-/* Parse the argument given in the command line of the application */
-__attribute__((optnone)) static int
-eal_parse_args(int argc, char **argv)
-{
- int opt, ret;
- char **argvopt;
- int option_index;
- char *prgname = argv[0];
-
- argvopt = argv;
-
- while ((opt = getopt_long(argc, argvopt, eal_short_options,
- eal_long_options, &option_index)) != EOF) {
-
- int ret;
-
- /* getopt is not happy, stop right now */
- if (opt == '?') {
- eal_usage(prgname);
- return -1;
- }
-
- ret = eal_parse_common_option(opt, optarg, &internal_config);
- /* common parser is not happy */
- if (ret < 0) {
- eal_usage(prgname);
- return -1;
- }
- /* common parser handled this option */
- if (ret == 0)
- continue;
-
- switch (opt) {
- case 'h':
- eal_usage(prgname);
- exit(EXIT_SUCCESS);
- default:
- if (opt < OPT_LONG_MIN_NUM && isprint(opt)) {
- RTE_LOG(ERR, EAL, "Option %c is not supported "
- "on Windows\n", opt);
- } else if (opt >= OPT_LONG_MIN_NUM &&
- opt < OPT_LONG_MAX_NUM) {
- RTE_LOG(ERR, EAL, "Option %s is not supported "
- "on Windows\n",
- eal_long_options[option_index].name);
- } else {
- RTE_LOG(ERR, EAL, "Option %d is not supported "
- "on Windows\n", opt);
- }
- eal_usage(prgname);
- return -1;
- }
- }
-
- if (eal_adjust_config(&internal_config) != 0)
- return -1;
-
- /* sanity checks */
- if (eal_check_common_options(&internal_config) != 0) {
- eal_usage(prgname);
- return -1;
- }
-
- if (optind >= 0)
- argv[optind - 1] = prgname;
- ret = optind - 1;
- optind = 0; /* reset getopt lib */
- return ret;
-}
-
-static int
-sync_func(void *arg __rte_unused)
-{
- return 0;
-}
-
-static void
-rte_eal_init_alert(const char *msg)
-{
- fprintf(stderr, "EAL: FATAL: %s\n", msg);
- RTE_LOG(ERR, EAL, "%s\n", msg);
-}
-
- /* Launch threads, called at application init(). */
-int
-rte_eal_init(int argc, char **argv)
-{
- int i, fctret;
-
- eal_log_level_parse(argc, argv);
-
- /* create a map of all processors in the system */
- eal_create_cpu_map();
-
- if (rte_eal_cpu_init() < 0) {
- rte_eal_init_alert("Cannot detect lcores.");
- rte_errno = ENOTSUP;
- return -1;
- }
-
- fctret = eal_parse_args(argc, argv);
- if (fctret < 0)
- exit(1);
-
- eal_thread_init_master(rte_config.master_lcore);
-
- RTE_LCORE_FOREACH_SLAVE(i) {
-
- /*
- * create communication pipes between master thread
- * and children
- */
- if (_pipe(lcore_config[i].pipe_master2slave,
- sizeof(char), _O_BINARY) < 0)
- rte_panic("Cannot create pipe\n");
- if (_pipe(lcore_config[i].pipe_slave2master,
- sizeof(char), _O_BINARY) < 0)
- rte_panic("Cannot create pipe\n");
-
- lcore_config[i].state = WAIT;
-
- /* create a thread for each lcore */
- if (eal_thread_create(&lcore_config[i].thread_id) != 0)
- rte_panic("Cannot create thread\n");
- }
-
- /*
- * Launch a dummy function on all slave lcores, so that master lcore
- * knows they are all ready when this function returns.
- */
- rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER);
- rte_eal_mp_wait_lcore();
- return fctret;
-}
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2019 Intel Corporation
- */
-
-#include <stdarg.h>
-#include <rte_log.h>
-#include <rte_debug.h>
-
- /* call abort(), it will generate a coredump if enabled */
-void
-__rte_panic(const char *funcname, const char *format, ...)
-{
- va_list ap;
-
- rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, "PANIC in %s():\n", funcname);
- va_start(ap, format);
- rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap);
- va_end(ap);
- abort();
-}
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2019 Intel Corporation
- */
-
-#include <stdint.h>
-
-#include <rte_common.h>
-
-#include "eal_private.h"
-#include "eal_thread.h"
-
-/* global data structure that contains the CPU map */
-static struct _wcpu_map {
- unsigned int total_procs;
- unsigned int proc_sockets;
- unsigned int proc_cores;
- unsigned int reserved;
- struct _win_lcore_map {
- uint8_t socket_id;
- uint8_t core_id;
- } wlcore_map[RTE_MAX_LCORE];
-} wcpu_map = { 0 };
-
-/*
- * Create a map of all processors and associated cores on the system
- */
-void
-eal_create_cpu_map()
-{
- wcpu_map.total_procs =
- GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
-
- LOGICAL_PROCESSOR_RELATIONSHIP lprocRel;
- DWORD lprocInfoSize = 0;
- BOOL ht_enabled = FALSE;
-
- /* First get the processor package information */
- lprocRel = RelationProcessorPackage;
- /* Determine the size of buffer we need (pass NULL) */
- GetLogicalProcessorInformationEx(lprocRel, NULL, &lprocInfoSize);
- wcpu_map.proc_sockets = lprocInfoSize / 48;
-
- lprocInfoSize = 0;
- /* Next get the processor core information */
- lprocRel = RelationProcessorCore;
- GetLogicalProcessorInformationEx(lprocRel, NULL, &lprocInfoSize);
- wcpu_map.proc_cores = lprocInfoSize / 48;
-
- if (wcpu_map.total_procs > wcpu_map.proc_cores)
- ht_enabled = TRUE;
-
- /* Distribute the socket and core ids appropriately
- * across the logical cores. For now, split the cores
- * equally across the sockets.
- */
- unsigned int lcore = 0;
- for (unsigned int socket = 0; socket <
- wcpu_map.proc_sockets; ++socket) {
- for (unsigned int core = 0;
- core < (wcpu_map.proc_cores / wcpu_map.proc_sockets);
- ++core) {
- wcpu_map.wlcore_map[lcore]
- .socket_id = socket;
- wcpu_map.wlcore_map[lcore]
- .core_id = core;
- lcore++;
- if (ht_enabled) {
- wcpu_map.wlcore_map[lcore]
- .socket_id = socket;
- wcpu_map.wlcore_map[lcore]
- .core_id = core;
- lcore++;
- }
- }
- }
-}
-
-/*
- * Check if a cpu is present by the presence of the cpu information for it
- */
-int
-eal_cpu_detected(unsigned int lcore_id)
-{
- return (lcore_id < wcpu_map.total_procs);
-}
-
-/*
- * Get CPU socket id for a logical core
- */
-unsigned
-eal_cpu_socket_id(unsigned int lcore_id)
-{
- return wcpu_map.wlcore_map[lcore_id].socket_id;
-}
-
-/*
- * Get CPU socket id (NUMA node) for a logical core
- */
-unsigned
-eal_cpu_core_id(unsigned int lcore_id)
-{
- return wcpu_map.wlcore_map[lcore_id].core_id;
-}
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2019 Intel Corporation
- */
-
-#include <io.h>
-
-#include <rte_atomic.h>
-#include <rte_debug.h>
-#include <rte_launch.h>
-#include <rte_lcore.h>
-#include <rte_per_lcore.h>
-#include <rte_common.h>
-#include <rte_memory.h>
-#include <eal_thread.h>
-
-#include "eal_private.h"
-
-RTE_DEFINE_PER_LCORE(unsigned int, _lcore_id) = LCORE_ID_ANY;
-RTE_DEFINE_PER_LCORE(unsigned int, _socket_id) = (unsigned int)SOCKET_ID_ANY;
-RTE_DEFINE_PER_LCORE(rte_cpuset_t, _cpuset);
-
-/*
- * Send a message to a slave lcore identified by slave_id to call a
- * function f with argument arg. Once the execution is done, the
- * remote lcore switch in FINISHED state.
- */
-int
-rte_eal_remote_launch(lcore_function_t *f, void *arg, unsigned int slave_id)
-{
- int n;
- char c = 0;
- int m2s = lcore_config[slave_id].pipe_master2slave[1];
- int s2m = lcore_config[slave_id].pipe_slave2master[0];
-
- if (lcore_config[slave_id].state != WAIT)
- return -EBUSY;
-
- lcore_config[slave_id].f = f;
- lcore_config[slave_id].arg = arg;
-
- /* send message */
- n = 0;
- while (n == 0 || (n < 0 && errno == EINTR))
- n = _write(m2s, &c, 1);
- if (n < 0)
- rte_panic("cannot write on configuration pipe\n");
-
- /* wait ack */
- do {
- n = _read(s2m, &c, 1);
- } while (n < 0 && errno == EINTR);
-
- if (n <= 0)
- rte_panic("cannot read on configuration pipe\n");
-
- return 0;
-}
-
-void
-eal_thread_init_master(unsigned int lcore_id)
-{
- /* set the lcore ID in per-lcore memory area */
- RTE_PER_LCORE(_lcore_id) = lcore_id;
-}
-
-static inline pthread_t
-eal_thread_self(void)
-{
- return GetCurrentThreadId();
-}
-
-/* main loop of threads */
-void *
-eal_thread_loop(void *arg __rte_unused)
-{
- char c;
- int n, ret;
- unsigned int lcore_id;
- pthread_t thread_id;
- int m2s, s2m;
- char cpuset[RTE_CPU_AFFINITY_STR_LEN];
-
- thread_id = eal_thread_self();
-
- /* retrieve our lcore_id from the configuration structure */
- RTE_LCORE_FOREACH_SLAVE(lcore_id) {
- if (thread_id == lcore_config[lcore_id].thread_id)
- break;
- }
- if (lcore_id == RTE_MAX_LCORE)
- rte_panic("cannot retrieve lcore id\n");
-
- m2s = lcore_config[lcore_id].pipe_master2slave[0];
- s2m = lcore_config[lcore_id].pipe_slave2master[1];
-
- /* set the lcore ID in per-lcore memory area */
- RTE_PER_LCORE(_lcore_id) = lcore_id;
-
- RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%zx;cpuset=[%s])\n",
- lcore_id, (uintptr_t)thread_id, cpuset);
-
- /* read on our pipe to get commands */
- while (1) {
- void *fct_arg;
-
- /* wait command */
- do {
- n = _read(m2s, &c, 1);
- } while (n < 0 && errno == EINTR);
-
- if (n <= 0)
- rte_panic("cannot read on configuration pipe\n");
-
- lcore_config[lcore_id].state = RUNNING;
-
- /* send ack */
- n = 0;
- while (n == 0 || (n < 0 && errno == EINTR))
- n = _write(s2m, &c, 1);
- if (n < 0)
- rte_panic("cannot write on configuration pipe\n");
-
- if (lcore_config[lcore_id].f == NULL)
- rte_panic("NULL function pointer\n");
-
- /* call the function and store the return value */
- fct_arg = lcore_config[lcore_id].arg;
- ret = lcore_config[lcore_id].f(fct_arg);
- lcore_config[lcore_id].ret = ret;
- rte_wmb();
-
- /* when a service core returns, it should go directly to WAIT
- * state, because the application will not lcore_wait() for it.
- */
- if (lcore_config[lcore_id].core_role == ROLE_SERVICE)
- lcore_config[lcore_id].state = WAIT;
- else
- lcore_config[lcore_id].state = FINISHED;
- }
-}
-
-/* function to create threads */
-int
-eal_thread_create(pthread_t *thread)
-{
- HANDLE th;
-
- th = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)eal_thread_loop,
- NULL, 0, (LPDWORD)thread);
- if (!th)
- return -1;
-
- SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS);
- SetThreadPriority(th, THREAD_PRIORITY_TIME_CRITICAL);
-
- return 0;
-}
-
-int
-rte_thread_setname(__rte_unused pthread_t id, __rte_unused const char *name)
-{
- /* TODO */
- /* This is a stub, not the expected result */
- return 0;
-}
+++ /dev/null
-/* SPDX-License-Identifier: ISC AND BSD-2-Clause
- * Copyright (c) 2002 Todd C. Miller <Todd.Miller@courtesan.com>
- *
- * Sponsored in part by the Defense Advanced Research Projects
- * Agency (DARPA) and Air Force Research Laboratory, Air Force
- * Materiel Command, USAF, under agreement number F39502-99-1-0512.
- */
-/*
- * Copyright (c) 2000 The NetBSD Foundation, Inc.
- * All rights reserved.
- *
- * This code is derived from software contributed to The NetBSD Foundation
- * by Dieter Baron and Thomas Klausner.
- */
-
-#include <getopt.h>
-
-#ifdef NEED_USUAL_GETOPT
-
-#include <string.h>
-#include <stdlib.h>
-
-const char *optarg; /* argument associated with option */
-int opterr = 1; /* if error message should be printed */
-int optind = 1; /* index into parent argv vector */
-int optopt = '?'; /* character checked for validity */
-
-static void pass(void) {}
-#define warnx(a, ...) pass()
-
-#define PRINT_ERROR ((opterr) && (*options != ':'))
-
-#define FLAG_PERMUTE 0x01 /* permute non-options to the end of argv */
-#define FLAG_ALLARGS 0x02 /* treat non-options as args to option "-1" */
-#define FLAG_LONGONLY 0x04 /* operate as getopt_long_only */
-
-/* return values */
-#define BADCH ((int)'?')
-#define BADARG ((*options == ':') ? (int)':' : (int)'?')
-#define INORDER 1
-
-#define EMSG ""
-
-static const char *place = EMSG; /* option letter processing */
-
-/* XXX: set optreset to 1 rather than these two */
-static int nonopt_start = -1; /* first non option argument (for permute) */
-static int nonopt_end = -1; /* first option after non options (for permute) */
-
-/* Error messages */
-static const char recargchar[] = "option requires an argument -- %c";
-static const char recargstring[] = "option requires an argument -- %s";
-static const char ambig[] = "ambiguous option -- %.*s";
-static const char noarg[] = "option doesn't take an argument -- %.*s";
-static const char illoptchar[] = "unknown option -- %c";
-static const char illoptstring[] = "unknown option -- %s";
-
-/*
- * Compute the greatest common divisor of a and b.
- */
-static int
-gcd(int a, int b)
-{
- int c;
-
- c = a % b;
- while (c != 0) {
- a = b;
- b = c;
- c = a % b;
- }
-
- return (b);
-}
-
-/*
- * Exchange the block from nonopt_start to nonopt_end with the block
- * from nonopt_end to opt_end (keeping the same order of arguments
- * in each block).
- */
-static void
-permute_args(int panonopt_start, int panonopt_end, int opt_end,
- char **nargv)
-{
- int cstart, cyclelen, i, j, ncycle, nnonopts, nopts, pos;
- char *swap;
-
- /*
- * compute lengths of blocks and number and size of cycles
- */
- nnonopts = panonopt_end - panonopt_start;
- nopts = opt_end - panonopt_end;
- ncycle = gcd(nnonopts, nopts);
- cyclelen = (opt_end - panonopt_start) / ncycle;
-
- for (i = 0; i < ncycle; i++) {
- cstart = panonopt_end+i;
- pos = cstart;
- for (j = 0; j < cyclelen; j++) {
- if (pos >= panonopt_end)
- pos -= nnonopts;
- else
- pos += nopts;
- swap = nargv[pos];
- /* LINTED const cast */
- ((char **) nargv)[pos] = nargv[cstart];
- /* LINTED const cast */
- ((char **)nargv)[cstart] = swap;
- }
- }
-}
-
-/*
- * parse_long_options --
- * Parse long options in argc/argv argument vector.
- * Returns -1 if short_too is set and the option does not match long_options.
- */
-static int
-parse_long_options(char **nargv, const char *options,
- const struct option *long_options, int *idx, int short_too)
-{
- const char *current_argv;
- char *has_equal;
- size_t current_argv_len;
- int i, match;
-
- current_argv = place;
- match = -1;
-
- optind++;
-
- has_equal = strchr(current_argv, '=');
- if (has_equal != NULL) {
- /* argument found (--option=arg) */
- current_argv_len = has_equal - current_argv;
- has_equal++;
- } else
- current_argv_len = strlen(current_argv);
-
- for (i = 0; long_options[i].name; i++) {
- /* find matching long option */
- if (strncmp(current_argv, long_options[i].name,
- current_argv_len))
- continue;
-
- if (strlen(long_options[i].name) == current_argv_len) {
- /* exact match */
- match = i;
- break;
- }
- /*
- * If this is a known short option, don't allow
- * a partial match of a single character.
- */
- if (short_too && current_argv_len == 1)
- continue;
-
- if (match == -1) /* partial match */
- match = i;
- else {
- /* ambiguous abbreviation */
- if (PRINT_ERROR)
- warnx(ambig, (int)current_argv_len,
- current_argv);
- optopt = 0;
- return BADCH;
- }
- }
- if (match != -1) { /* option found */
- if (long_options[match].has_arg == no_argument
- && has_equal) {
- if (PRINT_ERROR)
- warnx(noarg, (int)current_argv_len,
- current_argv);
- /*
- * XXX: GNU sets optopt to val regardless of flag
- */
- if (long_options[match].flag == NULL)
- optopt = long_options[match].val;
- else
- optopt = 0;
- return BADARG;
- }
- if (long_options[match].has_arg == required_argument ||
- long_options[match].has_arg == optional_argument) {
- if (has_equal)
- optarg = has_equal;
- else if (long_options[match].has_arg ==
- required_argument) {
- /*
- * optional argument doesn't use next nargv
- */
- optarg = nargv[optind++];
- }
- }
- if ((long_options[match].has_arg == required_argument)
- && (optarg == NULL)) {
- /*
- * Missing argument; leading ':' indicates no error
- * should be generated.
- */
- if (PRINT_ERROR)
- warnx(recargstring,
- current_argv);
- /*
- * XXX: GNU sets optopt to val regardless of flag
- */
- if (long_options[match].flag == NULL)
- optopt = long_options[match].val;
- else
- optopt = 0;
- --optind;
- return BADARG;
- }
- } else { /* unknown option */
- if (short_too) {
- --optind;
- return (-1);
- }
- if (PRINT_ERROR)
- warnx(illoptstring, current_argv);
- optopt = 0;
- return BADCH;
- }
- if (idx)
- *idx = match;
- if (long_options[match].flag) {
- *long_options[match].flag = long_options[match].val;
- return 0;
- } else
- return (long_options[match].val);
-}
-
-/*
- * getopt_internal --
- * Parse argc/argv argument vector. Called by user level routines.
- */
-static int
-getopt_internal(int nargc, char **nargv, const char *options,
- const struct option *long_options, int *idx, int flags)
-{
- char *oli; /* option letter list index */
- int optchar, short_too;
- static int posixly_correct = -1;
- char *buf;
- size_t len;
- int optreset = 0;
-
- if (options == NULL)
- return (-1);
-
- /*
- * Disable GNU extensions if POSIXLY_CORRECT is set or options
- * string begins with a '+'.
- */
- if (posixly_correct == -1)
- posixly_correct = _dupenv_s(&buf, &len, "POSIXLY_CORRECT");
- if (!posixly_correct || *options == '+')
- flags &= ~FLAG_PERMUTE;
- else if (*options == '-')
- flags |= FLAG_ALLARGS;
- if (*options == '+' || *options == '-')
- options++;
- if (!posixly_correct)
- free(buf);
- /*
- * reset if requested
- */
- if (optind == 0)
- optind = optreset = 1;
-
- optarg = NULL;
- if (optreset)
- nonopt_start = nonopt_end = -1;
-start:
- if (optreset || !*place) { /* update scanning pointer */
- optreset = 0;
- if (optind >= nargc) { /* end of argument vector */
- place = EMSG;
- if (nonopt_end != -1) {
- /* do permutation, if we have to */
- permute_args(nonopt_start, nonopt_end,
- optind, nargv);
- optind -= nonopt_end - nonopt_start;
- } else if (nonopt_start != -1) {
- /*
- * If we skipped non-options, set optind
- * to the first of them.
- */
- optind = nonopt_start;
- }
- nonopt_start = nonopt_end = -1;
- return (-1);
- }
- place = nargv[optind];
- if (*place != '-' ||
- (place[1] == '\0' && strchr(options, '-') == NULL)) {
- place = EMSG; /* found non-option */
- if (flags & FLAG_ALLARGS) {
- /*
- * GNU extension:
- * return non-option as argument to option 1
- */
- optarg = nargv[optind++];
- return INORDER;
- }
- if (!(flags & FLAG_PERMUTE)) {
- /*
- * If no permutation wanted, stop parsing
- * at first non-option.
- */
- return (-1);
- }
- /* do permutation */
- if (nonopt_start == -1)
- nonopt_start = optind;
- else if (nonopt_end != -1) {
- permute_args(nonopt_start, nonopt_end,
- optind, nargv);
- nonopt_start = optind -
- (nonopt_end - nonopt_start);
- nonopt_end = -1;
- }
- optind++;
- /* process next argument */
- goto start;
- }
- if (nonopt_start != -1 && nonopt_end == -1)
- nonopt_end = optind;
-
- /*
- * If we have "-" do nothing, if "--" we are done.
- */
- if (place[1] != '\0' && *++place == '-' && place[1] == '\0') {
- optind++;
- place = EMSG;
- /*
- * We found an option (--), so if we skipped
- * non-options, we have to permute.
- */
- if (nonopt_end != -1) {
- permute_args(nonopt_start, nonopt_end,
- optind, nargv);
- optind -= nonopt_end - nonopt_start;
- }
- nonopt_start = nonopt_end = -1;
- return (-1);
- }
- }
-
- /*
- * Check long options if:
- * 1) we were passed some
- * 2) the arg is not just "-"
- * 3) either the arg starts with "--" or we are getopt_long_only()
- */
- if (long_options != NULL && place != nargv[optind] &&
- (*place == '-' || (flags & FLAG_LONGONLY))) {
- short_too = 0;
- if (*place == '-')
- place++; /* --foo long option */
- else if (*place != ':' && strchr(options, *place) != NULL)
- short_too = 1; /* could be short option too */
-
- optchar = parse_long_options(nargv, options, long_options,
- idx, short_too);
- if (optchar != -1) {
- place = EMSG;
- return optchar;
- }
- }
-
- optchar = (int)*place++;
- oli = strchr(options, optchar);
- if (optchar == (int)':' ||
- (optchar == (int)'-' && *place != '\0') ||
- oli == NULL) {
- /*
- * If the user specified "-" and '-' isn't listed in
- * options, return -1 (non-option) as per POSIX.
- * Otherwise, it is an unknown option character (or ':').
- */
- if (optchar == (int)'-' && *place == '\0')
- return (-1);
- if (!*place)
- ++optind;
- if (PRINT_ERROR)
- warnx(illoptchar, optchar);
- optopt = optchar;
- return BADCH;
- }
- if (long_options != NULL && optchar == 'W' && oli[1] == ';') {
- /* -W long-option */
- if (*place)
- ;
- else if (++optind >= nargc) { /* no arg */
- place = EMSG;
- if (PRINT_ERROR)
- warnx(recargchar, optchar);
- optopt = optchar;
- return BADARG;
- } else /* white space */
- place = nargv[optind];
- optchar = parse_long_options(nargv, options, long_options,
- idx, 0);
- place = EMSG;
- return optchar;
- }
- if (*++oli != ':') { /* doesn't take argument */
- if (!*place)
- ++optind;
- } else { /* takes (optional) argument */
- optarg = NULL;
- if (*place) /* no white space */
- optarg = place;
- else if (oli[1] != ':') { /* arg not optional */
- if (++optind >= nargc) { /* no arg */
- place = EMSG;
- if (PRINT_ERROR)
- warnx(recargchar, optchar);
- optopt = optchar;
- return BADARG;
- }
- optarg = nargv[optind];
- }
- place = EMSG;
- ++optind;
- }
- /* dump back option letter */
- return optchar;
-}
-
-/*
- * getopt --
- * Parse argc/argv argument vector.
- */
-int
-getopt(int nargc, char *nargv[], const char *options)
-{
- return getopt_internal(nargc, nargv, options, NULL, NULL,
- FLAG_PERMUTE);
-}
-
-/*
- * getopt_long --
- * Parse argc/argv argument vector.
- */
-int
-getopt_long(int nargc, char *nargv[], const char *options,
- const struct option *long_options, int *idx)
-{
-
- return (getopt_internal(nargc, nargv, options, long_options, idx,
- FLAG_PERMUTE));
-}
-
-/*
- * getopt_long_only --
- * Parse argc/argv argument vector.
- */
-int
-getopt_long_only(int nargc, char *nargv[], const char *options,
- const struct option *long_options, int *idx)
-{
-
- return (getopt_internal(nargc, nargv, options, long_options, idx,
- FLAG_PERMUTE|FLAG_LONGONLY));
-}
-
-#endif /* NEED_USUAL_GETOPT */
+++ /dev/null
-/* SPDX-License-Identifier: MIT
- * Dirent interface for Microsoft Visual Studio
- * Version 1.21
- * Copyright (C) 2006-2012 Toni Ronkko
- * https://github.com/tronkko/dirent
- */
-
-#ifndef DIRENT_H
-#define DIRENT_H
-
-/*
- * Include windows.h without Windows Sockets 1.1 to prevent conflicts with
- * Windows Sockets 2.0.
- */
-#ifndef WIN32_LEAN_AND_MEAN
-# define WIN32_LEAN_AND_MEAN
-#endif
-
-#include <windows.h>
-
-#include <stdio.h>
-#include <stdarg.h>
-#include <wchar.h>
-#include <string.h>
-#include <stdlib.h>
-#include <malloc.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <errno.h>
-
-/* Maximum length of file name */
-#if !defined(PATH_MAX)
-# define PATH_MAX MAX_PATH
-#endif
-
-/* File type flags for d_type */
-#define DT_UNKNOWN 0
-#define DT_REG S_IFREG
-#define DT_DIR S_IFDIR
-#define DT_CHR S_IFCHR
-
-/*
- * File type macros. Note that block devices, sockets and links cannot be
- * distinguished on Windows and the macros S_ISBLK, S_ISSOCK and S_ISLNK are
- * only defined for compatibility. These macros should always return false
- * on Windows.
- */
-#if !defined(S_ISDIR)
-# define S_ISDIR(mode) (((mode) & S_IFMT) == S_IFDIR)
-#endif
-#if !defined(S_ISREG)
-# define S_ISREG(mode) (((mode) & S_IFMT) == S_IFREG)
-#endif
-
-/* Wide-character version */
-struct _wdirent {
- /* Always zero */
- long d_ino;
-
- /* Structure size */
- unsigned short d_reclen;
-
- /* Length of name without \0 */
- size_t d_namlen;
-
- /* File type */
- int d_type;
-
- /* File name */
- wchar_t d_name[PATH_MAX];
-};
-typedef struct _wdirent _wdirent;
-
-struct _WDIR {
- /* Current directory entry */
- struct _wdirent ent;
-
- /* Private file data */
- WIN32_FIND_DATAW data;
-
- /* True if data is valid */
- int cached;
-
- /* Win32 search handle */
- HANDLE handle;
-
- /* Initial directory name */
- wchar_t *patt;
-};
-typedef struct _WDIR _WDIR;
-
-static _WDIR *_wopendir(const wchar_t *dirname);
-static int _wclosedir(_WDIR *dirp);
-
-/* For compatibility with Symbian */
-#define wdirent _wdirent
-#define WDIR _WDIR
-#define wopendir _wopendir
-#define wclosedir _wclosedir
-
-/* Multi-byte character versions */
-struct dirent {
- /* Always zero */
- long d_ino;
-
- /* Structure size */
- unsigned short d_reclen;
-
- /* Length of name without \0 */
- size_t d_namlen;
-
- /* File type */
- int d_type;
-
- /* File name */
- char d_name[PATH_MAX];
-};
-typedef struct dirent dirent;
-
-struct DIR {
- struct dirent ent;
- struct _WDIR *wdirp;
-};
-typedef struct DIR DIR;
-
-static DIR *opendir(const char *dirname);
-static struct dirent *readdir(DIR *dirp);
-static int closedir(DIR *dirp);
-
-/* Internal utility functions */
-static WIN32_FIND_DATAW *dirent_first(_WDIR *dirp);
-static WIN32_FIND_DATAW *dirent_next(_WDIR *dirp);
-
-static int dirent_mbstowcs_s(
- size_t *pReturnValue,
- wchar_t *wcstr,
- size_t sizeInWords,
- const char *mbstr,
- size_t count);
-
-static int dirent_wcstombs_s(
- size_t *pReturnValue,
- char *mbstr,
- size_t sizeInBytes,
- const wchar_t *wcstr,
- size_t count);
-
-static void dirent_set_errno(int error);
-
-/*
- * Open directory stream DIRNAME for read and return a pointer to the
- * internal working area that is used to retrieve individual directory
- * entries.
- */
-static _WDIR*
-_wopendir(const wchar_t *dirname)
-{
- _WDIR *dirp = NULL;
- int error;
-
- /* Must have directory name */
- if (dirname == NULL || dirname[0] == '\0') {
- dirent_set_errno(ENOENT);
- return NULL;
- }
-
- /* Allocate new _WDIR structure */
- dirp = (_WDIR *)malloc(sizeof(struct _WDIR));
- if (dirp != NULL) {
- DWORD n;
-
- /* Reset _WDIR structure */
- dirp->handle = INVALID_HANDLE_VALUE;
- dirp->patt = NULL;
- dirp->cached = 0;
-
- /* Compute the length of full path plus zero terminator
- *
- * Note that on WinRT there's no way to convert relative paths
- * into absolute paths, so just assume it's an absolute path.
- */
- #if defined(WINAPI_FAMILY) && (WINAPI_FAMILY == WINAPI_FAMILY_PHONE_APP)
- n = wcslen(dirname);
- #else
- n = GetFullPathNameW(dirname, 0, NULL, NULL);
- #endif
-
- /* Allocate room for absolute directory name and search
- * pattern
- */
- dirp->patt = (wchar_t *)malloc(sizeof(wchar_t) * n + 16);
- if (dirp->patt) {
- /* Convert relative directory name to an
- * absolute one. This allows rewinddir() to
- * function correctly even when current working
- * directory is changed between opendir()
- * and rewinddir().
- *
- * Note that on WinRT there's no way to convert
- * relative paths into absolute paths, so just
- * assume it's an absolute path.
- */
- #if defined(WINAPI_FAMILY) && \
- (WINAPI_FAMILY == WINAPI_FAMILY_PHONE_APP)
- wcsncpy_s(dirp->patt, n + 1, dirname, n);
- #else
- n = GetFullPathNameW(dirname, n, dirp->patt, NULL);
- #endif
- if (n > 0) {
- wchar_t *p;
-
- /* Append search pattern \* to the directory
- * name
- */
- p = dirp->patt + n;
- if (dirp->patt < p) {
- switch (p[-1]) {
- case '\\':
- case '/':
- case ':':
- /* Directory ends in path separator,
- * e.g. c:\temp\
- */
- /*NOP*/;
- break;
-
- default:
- /* Directory name doesn't end in path
- * separator
- */
- *p++ = '\\';
- }
- }
- *p++ = '*';
- *p = '\0';
-
- /* Open directory stream and retrieve the first
- * entry
- */
- if (dirent_first(dirp)) {
- /* Directory stream opened successfully */
- error = 0;
- } else {
- /* Cannot retrieve first entry */
- error = 1;
- dirent_set_errno(ENOENT);
- }
-
- } else {
- /* Cannot retrieve full path name */
- dirent_set_errno(ENOENT);
- error = 1;
- }
-
- } else {
- /* Cannot allocate memory for search pattern */
- error = 1;
- }
-
- } else {
- /* Cannot allocate _WDIR structure */
- error = 1;
- }
-
- /* Clean up in case of error */
- if (error && dirp) {
- _wclosedir(dirp);
- dirp = NULL;
- }
-
- return dirp;
-}
-
-/*
- * Close directory stream opened by opendir() function.
- * This invalidates the DIR structure as well as any directory
- * entry read previously by _wreaddir().
- */
-static int
-_wclosedir(_WDIR *dirp)
-{
- int ok;
- if (dirp) {
-
- /* Release search handle */
- if (dirp->handle != INVALID_HANDLE_VALUE) {
- FindClose(dirp->handle);
- dirp->handle = INVALID_HANDLE_VALUE;
- }
-
- /* Release search pattern */
- if (dirp->patt) {
- free(dirp->patt);
- dirp->patt = NULL;
- }
-
- /* Release directory structure */
- free(dirp);
- ok = /*success*/0;
-
- } else {
- /* Invalid directory stream */
- dirent_set_errno(EBADF);
- ok = /*failure*/-1;
- }
- return ok;
-}
-
-/* Get first directory entry (internal) */
-static WIN32_FIND_DATAW*
-dirent_first(_WDIR *dirp)
-{
- WIN32_FIND_DATAW *datap;
-
- /* Open directory and retrieve the first entry */
- dirp->handle = FindFirstFileExW(
- dirp->patt, FindExInfoStandard, &dirp->data,
- FindExSearchNameMatch, NULL, 0);
- if (dirp->handle != INVALID_HANDLE_VALUE) {
-
- /* a directory entry is now waiting in memory */
- datap = &dirp->data;
- dirp->cached = 1;
-
- } else {
-
- /* Failed to re-open directory: no directory entry in memory */
- dirp->cached = 0;
- datap = NULL;
-
- }
- return datap;
-}
-
-/* Get next directory entry (internal) */
-static WIN32_FIND_DATAW*
-dirent_next(_WDIR *dirp)
-{
- WIN32_FIND_DATAW *p;
-
- /* Get next directory entry */
- if (dirp->cached != 0) {
-
- /* A valid directory entry already in memory */
- p = &dirp->data;
- dirp->cached = 0;
-
- } else if (dirp->handle != INVALID_HANDLE_VALUE) {
-
- /* Get the next directory entry from stream */
- if (FindNextFileW(dirp->handle, &dirp->data) != FALSE) {
- /* Got a file */
- p = &dirp->data;
- } else {
- /* The very last entry has been processed
- * or an error occurred
- */
- FindClose(dirp->handle);
- dirp->handle = INVALID_HANDLE_VALUE;
- p = NULL;
- }
-
- } else {
-
- /* End of directory stream reached */
- p = NULL;
-
- }
-
- return p;
-}
-
-/*
- * Open directory stream using plain old C-string.
- */
-static DIR*
-opendir(const char *dirname)
-{
- struct DIR *dirp;
- int error;
-
- /* Must have directory name */
- if (dirname == NULL || dirname[0] == '\0') {
- dirent_set_errno(ENOENT);
- return NULL;
- }
-
- /* Allocate memory for DIR structure */
- dirp = (DIR *)malloc(sizeof(struct DIR));
- if (dirp) {
- wchar_t wname[PATH_MAX];
- size_t n;
-
- /* Convert directory name to wide-character string */
- error = dirent_mbstowcs_s(&n, wname, PATH_MAX,
- dirname, PATH_MAX);
- if (!error) {
-
- /* Open directory stream using wide-character name */
- dirp->wdirp = _wopendir(wname);
- if (dirp->wdirp) {
- /* Directory stream opened */
- error = 0;
- } else {
- /* Failed to open directory stream */
- error = 1;
- }
-
- } else {
- /*
- * Cannot convert file name to wide-character string.
- * This occurs if the string contains invalid multi-byte
- * sequences or the output buffer is too small to
- * contain the resulting string.
- */
- error = 1;
- }
-
- } else {
- /* Cannot allocate DIR structure */
- error = 1;
- }
-
- /* Clean up in case of error */
- if (error && dirp) {
- free(dirp);
- dirp = NULL;
- }
-
- return dirp;
-}
-
-/*
- * Read next directory entry.
- *
- * When working with text consoles, please note that file names
- * returned by readdir() are represented in the default ANSI code
- * page while any output to console is typically formatted on another
- * code page. Thus, non-ASCII characters in file names will not usually
- * display correctly on console. The problem can be fixed in two ways:
- * (1) change the character set of console to 1252 using chcp utility
- * and use Lucida Console font, or (2) use _cprintf function when
- * writing to console. The _cprintf() will re-encode ANSI strings to the
- * console code page so many non-ASCII characters will display correctly.
- */
-static struct dirent*
-readdir(DIR *dirp)
-{
- WIN32_FIND_DATAW *datap;
- struct dirent *entp;
-
- /* Read next directory entry */
- datap = dirent_next(dirp->wdirp);
- if (datap) {
- size_t n;
- int error;
-
- /* Attempt to convert file name to multi-byte string */
- error = dirent_wcstombs_s(&n, dirp->ent.d_name,
- PATH_MAX, datap->cFileName, PATH_MAX);
-
- /*
- * If the file name cannot be represented by a multi-byte
- * string, then attempt to use old 8+3 file name.
- * This allows traditional Unix code to access some file
- * names despite unicode characters, although file names
- * may seem unfamiliar to the user.
- *
- * Be aware that the code below cannot come up with a short
- * file name unless the file system provides one. At least
- * VirtualBox shared folders fail to do this.
- */
- if (error && datap->cAlternateFileName[0] != '\0') {
- error = dirent_wcstombs_s(
- &n, dirp->ent.d_name, PATH_MAX,
- datap->cAlternateFileName, PATH_MAX);
- }
-
- if (!error) {
- DWORD attr;
-
- /* Initialize directory entry for return */
- entp = &dirp->ent;
-
- /* Length of file name excluding zero terminator */
- entp->d_namlen = n - 1;
-
- /* File attributes */
- attr = datap->dwFileAttributes;
- if ((attr & FILE_ATTRIBUTE_DEVICE) != 0)
- entp->d_type = DT_CHR;
- else if ((attr & FILE_ATTRIBUTE_DIRECTORY) != 0)
- entp->d_type = DT_DIR;
- else
- entp->d_type = DT_REG;
-
- /* Reset dummy fields */
- entp->d_ino = 0;
- entp->d_reclen = sizeof(struct dirent);
-
- } else {
- /*
- * Cannot convert file name to multi-byte string so
- * construct an erroneous directory entry and return
- * that. Note that we cannot return NULL as that would
- * stop the processing of directory entries completely.
- */
- entp = &dirp->ent;
- entp->d_name[0] = '?';
- entp->d_name[1] = '\0';
- entp->d_namlen = 1;
- entp->d_type = DT_UNKNOWN;
- entp->d_ino = 0;
- entp->d_reclen = 0;
- }
-
- } else {
- /* No more directory entries */
- entp = NULL;
- }
-
- return entp;
-}
-
-/*
- * Close directory stream.
- */
-static int
-closedir(DIR *dirp)
-{
- int ok;
- if (dirp) {
-
- /* Close wide-character directory stream */
- ok = _wclosedir(dirp->wdirp);
- dirp->wdirp = NULL;
-
- /* Release multi-byte character version */
- free(dirp);
-
- } else {
-
- /* Invalid directory stream */
- dirent_set_errno(EBADF);
- ok = /*failure*/-1;
-
- }
- return ok;
-}
-
-/* Convert multi-byte string to wide character string */
-static int
-dirent_mbstowcs_s(
- size_t *pReturnValue,
- wchar_t *wcstr,
- size_t sizeInWords,
- const char *mbstr,
- size_t count)
-{
- int error;
-
- #if defined(_MSC_VER) && _MSC_VER >= 1400
- /* Microsoft Visual Studio 2005 or later */
- error = mbstowcs_s(pReturnValue, wcstr,
- sizeInWords, mbstr, count);
- #else
-
- /* Older Visual Studio or non-Microsoft compiler */
- size_t n;
-
- /* Convert to wide-character string (or count characters) */
- n = mbstowcs(wcstr, mbstr, sizeInWords);
- if (!wcstr || n < count) {
-
- /* Zero-terminate output buffer */
- if (wcstr && sizeInWords) {
- if (n >= sizeInWords)
- n = sizeInWords - 1;
- wcstr[n] = 0;
- }
-
- /* Length of resulting wide-character string WITH zero
- * terminator
- */
- if (pReturnValue)
- *pReturnValue = n + 1;
-
- /* Success */
- error = 0;
-
- } else {
-
- /* Could not convert string */
- error = 1;
-
- }
- #endif
-
- return error;
-}
-
-/* Convert wide-character string to multi-byte string */
-static int
-dirent_wcstombs_s(
- size_t *pReturnValue,
- char *mbstr,
- size_t sizeInBytes, /* max size of mbstr */
- const wchar_t *wcstr,
- size_t count)
-{
- int error;
-
- #if defined(_MSC_VER) && _MSC_VER >= 1400
- /* Microsoft Visual Studio 2005 or later */
- error = wcstombs_s(pReturnValue, mbstr, sizeInBytes, wcstr, count);
- #else
- /* Older Visual Studio or non-Microsoft compiler */
- size_t n;
-
- /* Convert to multi-byte string
- * (or count the number of bytes needed)
- */
- n = wcstombs(mbstr, wcstr, sizeInBytes);
- if (!mbstr || n < count) {
- /* Zero-terminate output buffer */
- if (mbstr && sizeInBytes) {
- if (n >= sizeInBytes)
- n = sizeInBytes - 1;
- mbstr[n] = '\0';
- }
- /* Length of resulting multi-byte string WITH
- * zero terminator
- */
- if (pReturnValue)
- *pReturnValue = n + 1;
- /* Success */
- error = 0;
- } else {
- /* Cannot convert string */
- error = 1;
- }
- #endif
-
- return error;
-}
-
-/* Set errno variable */
-static void
-dirent_set_errno(int error)
-{
-#if defined(_MSC_VER) && _MSC_VER >= 1400
- /* Microsoft Visual Studio 2005 and later */
- _set_errno(error);
-#else
-
- /* Non-Microsoft compiler or older Microsoft compiler */
- errno = error;
-#endif
-}
-
-#ifdef __cplusplus
-}
-#endif
-#endif /*DIRENT_H*/
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2019 Intel Corporation
- */
-
-#ifndef _FNMATCH_H_
-#define _FNMATCH_H_
-
-/**
- * This file is required to support the common code in eal_common_log.c
- * as Microsoft libc does not contain fnmatch.h. This may be removed in
- * future releases.
- */
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define FNM_NOMATCH 1
-
-/**
- * This function is used for searching a given string source
- * with the given pattern.
- *
- * @param pattern
- * pattern notation describing what to match
- *
- * @param string
- * source string to search for the pattern
- *
- * @param flags
- * flags containing information about the pattern
- *
- * @return
- * 0 if the pattern is found, FNM_NOMATCH otherwise
- */
-static inline int fnmatch(__rte_unused const char *pattern,
- __rte_unused const char *string,
- __rte_unused int flags)
-{
- /* TODO */
- /* This is a stub, not the expected result */
- return FNM_NOMATCH;
-}
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _FNMATCH_H_ */
+++ /dev/null
-/* SPDX-License-Identifier: BSD-2-Clause
- * Copyright (c) 2000 The NetBSD Foundation, Inc.
- * All rights reserved.
- *
- * This code is derived from software contributed to The NetBSD Foundation
- * by Dieter Baron and Thomas Klausner.
- */
-
-/**
- * @file
- * getopt compat.
- *
- * This module provides getopt() and getopt_long().
- */
-
-#ifndef _USUAL_GETOPT_H_
-#define _USUAL_GETOPT_H_
-
-#ifndef NEED_USUAL_GETOPT
-#if !defined(HAVE_GETOPT_H) || !defined(HAVE_GETOPT) || \
- !defined(HAVE_GETOPT_LONG)
-#define NEED_USUAL_GETOPT
-#endif
-#endif
-
-#ifndef NEED_USUAL_GETOPT
-
-/* Use system getopt */
-#ifdef RTE_TOOLCHAIN_GCC
-#include_next <getopt.h>
-#else
-#include <getopt.h>
-#endif
-
-#else /* NEED_USUAL_GETOPT */
-
-/* avoid name collision */
-#define optarg usual_optarg
-#define opterr usual_opterr
-#define optind usual_optind
-#define optopt usual_optopt
-#define getopt(a, b, c) usual_getopt(a, b, c)
-#define getopt_long(a, b, c, d, e) usual_getopt_long(a, b, c, d, e)
-
-
-/** argument to current option, or NULL if it has none */
-extern const char *optarg;
-/** Current position in arg string. Starts from 1.
- * Setting to 0 resets state.
- */
-extern int optind;
-/** whether getopt() should print error messages on problems. Default: 1. */
-extern int opterr;
-/** Option char which caused error */
-extern int optopt;
-
-/** long option takes no argument */
-#define no_argument 0
-/** long option requires argument */
-#define required_argument 1
-/** long option has optional argument */
-#define optional_argument 2
-
-/** Long option description */
-struct option {
- /** name of long option */
- const char *name;
-
- /**
- * whether option takes an argument.
- * One of no_argument, required_argument, and optional_argument.
- */
- int has_arg;
-
- /** if not NULL, set *flag to val when option found */
- int *flag;
-
- /** if flag not NULL, value to set *flag to; else return value */
- int val;
-};
-
-/** Compat: getopt */
-int getopt(int argc, char *argv[], const char *options);
-
-/** Compat: getopt_long */
-int getopt_long(int argc, char *argv[], const char *options,
- const struct option *longopts, int *longindex);
-
-/** Compat: getopt_long_only */
-int getopt_long_only(int nargc, char *argv[], const char *options,
- const struct option *long_options, int *idx);
-
-
-#endif /* NEED_USUAL_GETOPT */
-
-#endif /* !_USUAL_GETOPT_H_ */
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2019 Intel Corporation
- */
-
-#ifndef _PTHREAD_H_
-#define _PTHREAD_H_
-
-/**
- * This file is required to support the common code in eal_common_proc.c,
- * eal_common_thread.c and common\include\rte_per_lcore.h as Microsoft libc
- * does not contain pthread.h. This may be removed in future releases.
- */
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <windows.h>
-
-#define PTHREAD_BARRIER_SERIAL_THREAD TRUE
-
-/* define the pthread_t type on Windows since there is none in Microsoft libc */
-typedef uintptr_t pthread_t;
-
-/* define the pthread_attr_t type on Windows since there is none in Microsoft libc */
-typedef void *pthread_attr_t;
-
-typedef SYNCHRONIZATION_BARRIER pthread_barrier_t;
-
-#define pthread_barrier_init(barrier, attr, count) \
- InitializeSynchronizationBarrier(barrier, count, -1)
-#define pthread_barrier_wait(barrier) EnterSynchronizationBarrier(barrier, \
- SYNCHRONIZATION_BARRIER_FLAGS_BLOCK_ONLY)
-#define pthread_barrier_destroy(barrier) \
- DeleteSynchronizationBarrier(barrier)
-#define pthread_cancel(thread) TerminateThread((HANDLE) thread, 0)
-
-/* pthread function overrides */
-#define pthread_self() \
- ((pthread_t)GetCurrentThreadId())
-#define pthread_setaffinity_np(thread, size, cpuset) \
- eal_set_thread_affinity_mask(thread, (unsigned long *) cpuset)
-#define pthread_getaffinity_np(thread, size, cpuset) \
- eal_get_thread_affinity_mask(thread, (unsigned long *) cpuset)
-#define pthread_create(threadid, threadattr, threadfunc, args) \
- eal_create_thread(threadid, threadfunc, args)
-
-static inline int
-eal_set_thread_affinity_mask(pthread_t threadid, unsigned long *cpuset)
-{
- SetThreadAffinityMask((HANDLE) threadid, *cpuset);
- return 0;
-}
-
-static inline int
-eal_get_thread_affinity_mask(pthread_t threadid, unsigned long *cpuset)
-{
- /* Workaround for the lack of a GetThreadAffinityMask()
- * API in Windows
- */
- /* obtain previous mask by setting dummy mask */
- DWORD dwprevaffinitymask =
- SetThreadAffinityMask((HANDLE) threadid, 0x1);
- /* set it back! */
- SetThreadAffinityMask((HANDLE) threadid, dwprevaffinitymask);
- *cpuset = dwprevaffinitymask;
- return 0;
-}
-
-static inline int
-eal_create_thread(void *threadid, void *threadfunc, void *args)
-{
- HANDLE hThread;
- hThread = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)threadfunc,
- args, 0, (LPDWORD)threadid);
- if (hThread) {
- SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS);
- SetThreadPriority(hThread, THREAD_PRIORITY_TIME_CRITICAL);
- }
- return ((hThread != NULL) ? 0 : E_FAIL);
-}
-
-static inline int
-pthread_join(pthread_t thread __attribute__((__unused__)),
- void **value_ptr __attribute__((__unused__)))
-{
- return 0;
-}
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _PTHREAD_H_ */
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2019 Intel Corporation
- */
-
-#ifndef _REGEX_H_
-#define _REGEX_H_
-
-/**
- * This file is required to support the common code in eal_common_log.c
- * as Microsoft libc does not contain regex.h. This may be removed in
- * future releases.
- */
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define REG_NOMATCH 1
-#define REG_ESPACE 12
-
-#include <rte_common.h>
-
-/* defining regex_t for Windows */
-typedef void *regex_t;
-/* defining regmatch_t for Windows */
-typedef void *regmatch_t;
-
-/**
- * The regcomp() function will compile the regular expression
- * contained in the string pointed to by the pattern argument
- * and place the results in the structure pointed to by preg.
- * The cflags argument is the bitwise inclusive OR of zero or
- * more of the flags
- */
-static inline int regcomp(__rte_unused regex_t *preg,
- __rte_unused const char *regex, __rte_unused int cflags)
-{
- /* TODO */
- /* This is a stub, not the expected result */
- return REG_ESPACE;
-}
-
-/**
- * The regexec() function compares the null-terminated string
- * specified by string with the compiled regular expression
- * preg initialised by a previous call to regcomp(). If it finds
- * a match, regexec() returns 0; otherwise it returns non-zero
- * indicating either no match or an error. The eflags argument
- * is the bitwise inclusive OR of zero or more of the flags.
- */
-static inline int regexec(__rte_unused const regex_t *preg,
- __rte_unused const char *string, __rte_unused size_t nmatch,
- __rte_unused regmatch_t pmatch[], __rte_unused int eflags)
-{
- /* TODO */
- /* This is a stub, not the expected result */
- return REG_NOMATCH;
-}
-
-/**
- * The regerror() function provides a mapping from error codes
- * returned by regcomp() and regexec() to unspecified printable strings.
- */
-static inline size_t regerror(__rte_unused int errcode,
- __rte_unused const regex_t *preg, char *errbuf,
- __rte_unused size_t errbuf_size)
-{
- /* TODO */
- /* This is a stub, not the expected result */
- if (errbuf) {
- *errbuf = '\0';
- return 1;
- }
- return 0;
-}
-
-/**
- * The regfree() function frees any memory allocated by regcomp()
- * associated with preg.
- */
-static inline void regfree(__rte_unused regex_t *preg)
-{
- /* TODO */
- /* This is a stub, not the expected result */
-}
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _REGEX_H_ */
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2019 Intel Corporation
- */
-
-#ifndef _RTE_OS_H_
-#define _RTE_OS_H_
-
-/**
- * This header should contain any function/macro definitions
- * that are not supported natively, or are named differently,
- * in Windows. Functions will be added in future releases.
- */
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <windows.h>
-#include <basetsd.h>
-#include <pthread.h>
-#include <stdio.h>
-
-/* limits.h replacement */
-#include <stdlib.h>
-#ifndef PATH_MAX
-#define PATH_MAX _MAX_PATH
-#endif
-
-#define strerror_r(a, b, c) strerror_s(b, c, a)
-
-/* strdup is deprecated in Microsoft libc and _strdup is preferred */
-#define strdup(str) _strdup(str)
-
-typedef SSIZE_T ssize_t;
-
-#define strtok_r(str, delim, saveptr) strtok_s(str, delim, saveptr)
-
-#define index(a, b) strchr(a, b)
-#define rindex(a, b) strrchr(a, b)
-
-#define strncasecmp(s1, s2, count) _strnicmp(s1, s2, count)
-
-/**
- * Create a thread.
- * This function is private to EAL.
- *
- * @param thread
- * The location to store the thread id if successful.
- * @return
- * 0 for success, -1 if the thread is not created.
- */
-int eal_thread_create(pthread_t *thread);
-
-/**
- * Create a map of processors and cores on the system.
- * This function is private to EAL.
- */
-void eal_create_cpu_map(void);
-
-#ifndef RTE_TOOLCHAIN_GCC
-static inline int
-asprintf(char **buffer, const char *format, ...)
-{
- int size, ret;
- va_list arg;
-
- va_start(arg, format);
- size = vsnprintf(NULL, 0, format, arg);
- va_end(arg);
- if (size < 0)
- return -1;
- size++;
-
- *buffer = malloc(size);
- if (*buffer == NULL)
- return -1;
-
- va_start(arg, format);
- ret = vsnprintf(*buffer, size, format, arg);
- va_end(arg);
- if (ret != size - 1) {
- free(*buffer);
- return -1;
- }
- return ret;
-}
-#endif /* RTE_TOOLCHAIN_GCC */
-
-/* cpu_set macros implementation */
-#define RTE_CPU_AND(dst, src1, src2) CPU_AND(dst, src1, src2)
-#define RTE_CPU_OR(dst, src1, src2) CPU_OR(dst, src1, src2)
-#define RTE_CPU_FILL(set) CPU_FILL(set)
-#define RTE_CPU_NOT(dst, src) CPU_NOT(dst, src)
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _RTE_OS_H_ */
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2019 Intel Corporation
- */
-
-#ifndef _SCHED_H_
-#define _SCHED_H_
-
-/**
- * This file is added to support the common code in eal_common_thread.c
- * as Microsoft libc does not contain sched.h. This may be removed
- * in future releases.
- */
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef CPU_SETSIZE
-#define CPU_SETSIZE RTE_MAX_LCORE
-#endif
-
-#define _BITS_PER_SET (sizeof(long long) * 8)
-#define _BIT_SET_MASK (_BITS_PER_SET - 1)
-
-#define _NUM_SETS(b) (((b) + _BIT_SET_MASK) / _BITS_PER_SET)
-#define _WHICH_SET(b) ((b) / _BITS_PER_SET)
-#define _WHICH_BIT(b) ((b) & (_BITS_PER_SET - 1))
-
-typedef struct _rte_cpuset_s {
- long long _bits[_NUM_SETS(CPU_SETSIZE)];
-} rte_cpuset_t;
-
-#define CPU_SET(b, s) ((s)->_bits[_WHICH_SET(b)] |= (1LL << _WHICH_BIT(b)))
-
-#define CPU_ZERO(s) \
- do { \
- unsigned int _i; \
- \
- for (_i = 0; _i < _NUM_SETS(CPU_SETSIZE); _i++) \
- (s)->_bits[_i] = 0LL; \
- } while (0)
-
-#define CPU_ISSET(b, s) (((s)->_bits[_WHICH_SET(b)] & \
- (1LL << _WHICH_BIT(b))) != 0LL)
-
-static inline int
-count_cpu(rte_cpuset_t *s)
-{
- unsigned int _i;
- int count = 0;
-
- /* iterate over all bits, not over the number of sets */
- for (_i = 0; _i < CPU_SETSIZE; _i++)
- if (CPU_ISSET(_i, s) != 0LL)
- count++;
- return count;
-}
-#define CPU_COUNT(s) count_cpu(s)
-
-#define CPU_AND(dst, src1, src2) \
-do { \
- unsigned int _i; \
- \
- for (_i = 0; _i < _NUM_SETS(CPU_SETSIZE); _i++) \
- (dst)->_bits[_i] = (src1)->_bits[_i] & (src2)->_bits[_i]; \
-} while (0)
-
-#define CPU_OR(dst, src1, src2) \
-do { \
- unsigned int _i; \
- \
- for (_i = 0; _i < _NUM_SETS(CPU_SETSIZE); _i++) \
- (dst)->_bits[_i] = (src1)->_bits[_i] | (src2)->_bits[_i]; \
-} while (0)
-
-#define CPU_FILL(s) \
-do { \
- unsigned int _i; \
- for (_i = 0; _i < _NUM_SETS(CPU_SETSIZE); _i++) \
- (s)->_bits[_i] = -1LL; \
-} while (0)
-
-#define CPU_NOT(dst, src) \
-do { \
- unsigned int _i; \
- for (_i = 0; _i < _NUM_SETS(CPU_SETSIZE); _i++) \
- (dst)->_bits[_i] = (src)->_bits[_i] ^ -1LL; \
-} while (0)
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SCHED_H_ */
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- *
- * Copyright (c) 1991, 1993
- * The Regents of the University of California. All rights reserved.
- */
-
-#ifndef _SYS_QUEUE_H_
-#define _SYS_QUEUE_H_
-
-/*
- * This file defines tail queues.
- *
- * A tail queue is headed by a pair of pointers, one to the head of the
- * list and the other to the tail of the list. The elements are doubly
- * linked so that an arbitrary element can be removed without a need to
- * traverse the list. New elements can be added to the list before or
- * after an existing element, at the head of the list, or at the end of
- * the list. A tail queue may be traversed in either direction.
- *
- * Below is a summary of implemented functions where:
- * + means the macro is available
- * - means the macro is not available
- * s means the macro is available but is slow (runs in O(n) time)
- *
- * TAILQ
- * _HEAD +
- * _CLASS_HEAD +
- * _HEAD_INITIALIZER +
- * _ENTRY +
- * _CLASS_ENTRY +
- * _INIT +
- * _EMPTY +
- * _FIRST +
- * _NEXT +
- * _PREV +
- * _LAST +
- * _LAST_FAST +
- * _FOREACH +
- * _FOREACH_FROM +
- * _FOREACH_SAFE +
- * _FOREACH_FROM_SAFE +
- * _FOREACH_REVERSE +
- * _FOREACH_REVERSE_FROM +
- * _FOREACH_REVERSE_SAFE +
- * _FOREACH_REVERSE_FROM_SAFE +
- * _INSERT_HEAD +
- * _INSERT_BEFORE +
- * _INSERT_AFTER +
- * _INSERT_TAIL +
- * _CONCAT +
- * _REMOVE_AFTER -
- * _REMOVE_HEAD -
- * _REMOVE +
- * _SWAP +
- *
- */
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * List definitions.
- */
-#define LIST_HEAD(name, type) \
-struct name { \
- struct type *lh_first; /* first element */ \
-}
-
-#define QMD_TRACE_ELEM(elem)
-#define QMD_TRACE_HEAD(head)
-#define TRACEBUF
-#define TRACEBUF_INITIALIZER
-
-#define TRASHIT(x)
-#define QMD_IS_TRASHED(x) 0
-
-#define QMD_SAVELINK(name, link)
-
-#ifdef __cplusplus
-/*
- * In C++ there can be structure lists and class lists:
- */
-#define QUEUE_TYPEOF(type) type
-#else
-#define QUEUE_TYPEOF(type) struct type
-#endif
-
-/*
- * Tail queue declarations.
- */
-#define TAILQ_HEAD(name, type) \
-struct name { \
- struct type *tqh_first; /* first element */ \
- struct type **tqh_last; /* addr of last next element */ \
- TRACEBUF \
-}
-
-#define TAILQ_CLASS_HEAD(name, type) \
-struct name { \
- class type *tqh_first; /* first element */ \
- class type **tqh_last; /* addr of last next element */ \
- TRACEBUF \
-}
-
-#define TAILQ_HEAD_INITIALIZER(head) \
- { NULL, &(head).tqh_first, TRACEBUF_INITIALIZER }
-
-#define TAILQ_ENTRY(type) \
-struct { \
- struct type *tqe_next; /* next element */ \
- struct type **tqe_prev; /* address of previous next element */ \
- TRACEBUF \
-}
-
-#define TAILQ_CLASS_ENTRY(type) \
-struct { \
- class type *tqe_next; /* next element */ \
- class type **tqe_prev; /* address of previous next element */ \
- TRACEBUF \
-}
-
-/*
- * Tail queue functions.
- */
-#define QMD_TAILQ_CHECK_HEAD(head, field)
-#define QMD_TAILQ_CHECK_TAIL(head, headname)
-#define QMD_TAILQ_CHECK_NEXT(elm, field)
-#define QMD_TAILQ_CHECK_PREV(elm, field)
-
-#define TAILQ_CONCAT(head1, head2, field) do { \
- if (!TAILQ_EMPTY(head2)) { \
- *(head1)->tqh_last = (head2)->tqh_first; \
- (head2)->tqh_first->field.tqe_prev = (head1)->tqh_last; \
- (head1)->tqh_last = (head2)->tqh_last; \
- TAILQ_INIT((head2)); \
- QMD_TRACE_HEAD(head1); \
- QMD_TRACE_HEAD(head2); \
- } \
-} while (0)
-
-#define TAILQ_EMPTY(head) ((head)->tqh_first == NULL)
-
-#define TAILQ_FIRST(head) ((head)->tqh_first)
-
-#define TAILQ_FOREACH(var, head, field) \
- for ((var) = TAILQ_FIRST((head)); \
- (var); \
- (var) = TAILQ_NEXT((var), field))
-
-#define TAILQ_FOREACH_FROM(var, head, field) \
- for ((var) = ((var) ? (var) : TAILQ_FIRST((head))); \
- (var); \
- (var) = TAILQ_NEXT((var), field))
-
-#define TAILQ_FOREACH_SAFE(var, head, field, tvar) \
- for ((var) = TAILQ_FIRST((head)); \
- (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \
- (var) = (tvar))
-
-#define TAILQ_FOREACH_FROM_SAFE(var, head, field, tvar) \
- for ((var) = ((var) ? (var) : TAILQ_FIRST((head))); \
- (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \
- (var) = (tvar))
-
-#define TAILQ_FOREACH_REVERSE(var, head, headname, field) \
- for ((var) = TAILQ_LAST((head), headname); \
- (var); \
- (var) = TAILQ_PREV((var), headname, field))
-
-#define TAILQ_FOREACH_REVERSE_FROM(var, head, headname, field) \
- for ((var) = ((var) ? (var) : TAILQ_LAST((head), headname)); \
- (var); \
- (var) = TAILQ_PREV((var), headname, field))
-
-#define TAILQ_FOREACH_REVERSE_SAFE(var, head, headname, field, tvar) \
- for ((var) = TAILQ_LAST((head), headname); \
- (var) && ((tvar) = TAILQ_PREV((var), headname, field), 1); \
- (var) = (tvar))
-
-#define TAILQ_FOREACH_REVERSE_FROM_SAFE(var, head, headname, field, tvar) \
- for ((var) = ((var) ? (var) : TAILQ_LAST((head), headname)); \
- (var) && ((tvar) = TAILQ_PREV((var), headname, field), 1); \
- (var) = (tvar))
-
-#define TAILQ_INIT(head) do { \
- TAILQ_FIRST((head)) = NULL; \
- (head)->tqh_last = &TAILQ_FIRST((head)); \
- QMD_TRACE_HEAD(head); \
-} while (0)
-
-#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \
- QMD_TAILQ_CHECK_NEXT(listelm, field); \
- TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field); \
- if (TAILQ_NEXT((listelm), field) != NULL) \
- TAILQ_NEXT((elm), field)->field.tqe_prev = \
- &TAILQ_NEXT((elm), field); \
- else { \
- (head)->tqh_last = &TAILQ_NEXT((elm), field); \
- QMD_TRACE_HEAD(head); \
- } \
- TAILQ_NEXT((listelm), field) = (elm); \
- (elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field); \
- QMD_TRACE_ELEM(&(elm)->field); \
- QMD_TRACE_ELEM(&(listelm)->field); \
-} while (0)
-
-#define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \
- QMD_TAILQ_CHECK_PREV(listelm, field); \
- (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \
- TAILQ_NEXT((elm), field) = (listelm); \
- *(listelm)->field.tqe_prev = (elm); \
- (listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field); \
- QMD_TRACE_ELEM(&(elm)->field); \
- QMD_TRACE_ELEM(&(listelm)->field); \
-} while (0)
-
-#define TAILQ_INSERT_HEAD(head, elm, field) do { \
- QMD_TAILQ_CHECK_HEAD(head, field); \
- TAILQ_NEXT((elm), field) = TAILQ_FIRST((head)); \
- if (TAILQ_FIRST((head)) != NULL) \
- TAILQ_FIRST((head))->field.tqe_prev = \
- &TAILQ_NEXT((elm), field); \
- else \
- (head)->tqh_last = &TAILQ_NEXT((elm), field); \
- TAILQ_FIRST((head)) = (elm); \
- (elm)->field.tqe_prev = &TAILQ_FIRST((head)); \
- QMD_TRACE_HEAD(head); \
- QMD_TRACE_ELEM(&(elm)->field); \
-} while (0)
-
-#define TAILQ_INSERT_TAIL(head, elm, field) do { \
- QMD_TAILQ_CHECK_TAIL(head, field); \
- TAILQ_NEXT((elm), field) = NULL; \
- (elm)->field.tqe_prev = (head)->tqh_last; \
- *(head)->tqh_last = (elm); \
- (head)->tqh_last = &TAILQ_NEXT((elm), field); \
- QMD_TRACE_HEAD(head); \
- QMD_TRACE_ELEM(&(elm)->field); \
-} while (0)
-
-#define TAILQ_LAST(head, headname) \
- (*(((struct headname *)((head)->tqh_last))->tqh_last))
-
-/*
- * The FAST function is fast in that it causes no data access other
- * than the access to the head. The standard LAST function above
- * will cause a data access of both the element you want and
- * the previous element. FAST is very useful for instances when
- * you may want to prefetch the last data element.
- */
-#define TAILQ_LAST_FAST(head, type, field) \
- (TAILQ_EMPTY(head) ? NULL : __containerof((head)->tqh_last, \
- QUEUE_TYPEOF(type), field.tqe_next))
-
-#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next)
-
-#define TAILQ_PREV(elm, headname, field) \
- (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last))
-
-#define TAILQ_REMOVE(head, elm, field) do { \
- QMD_SAVELINK(oldnext, (elm)->field.tqe_next); \
- QMD_SAVELINK(oldprev, (elm)->field.tqe_prev); \
- QMD_TAILQ_CHECK_NEXT(elm, field); \
- QMD_TAILQ_CHECK_PREV(elm, field); \
- if ((TAILQ_NEXT((elm), field)) != NULL) \
- TAILQ_NEXT((elm), field)->field.tqe_prev = \
- (elm)->field.tqe_prev; \
- else { \
- (head)->tqh_last = (elm)->field.tqe_prev; \
- QMD_TRACE_HEAD(head); \
- } \
- *(elm)->field.tqe_prev = TAILQ_NEXT((elm), field); \
- TRASHIT(*oldnext); \
- TRASHIT(*oldprev); \
- QMD_TRACE_ELEM(&(elm)->field); \
-} while (0)
-
-#define TAILQ_SWAP(head1, head2, type, field) do { \
- QUEUE_TYPEOF(type) * swap_first = (head1)->tqh_first; \
- QUEUE_TYPEOF(type) * *swap_last = (head1)->tqh_last; \
- (head1)->tqh_first = (head2)->tqh_first; \
- (head1)->tqh_last = (head2)->tqh_last; \
- (head2)->tqh_first = swap_first; \
- (head2)->tqh_last = swap_last; \
- swap_first = (head1)->tqh_first; \
- if (swap_first != NULL) \
- swap_first->field.tqe_prev = &(head1)->tqh_first; \
- else \
- (head1)->tqh_last = &(head1)->tqh_first; \
- swap_first = (head2)->tqh_first; \
- if (swap_first != NULL) \
- swap_first->field.tqe_prev = &(head2)->tqh_first; \
- else \
- (head2)->tqh_last = &(head2)->tqh_first; \
-} while (0)
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _SYS_QUEUE_H_ */
+++ /dev/null
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2019 Intel Corporation
- */
-
-#ifndef _UNISTD_H_
-#define _UNISTD_H_
-/**
- * This file is added to support common code in eal_common_lcore.c
- * as Microsoft libc does not contain unistd.h. This may be removed
- * in future releases.
- */
-#endif /* _UNISTD_H_ */
+++ /dev/null
-# SPDX-License-Identifier: BSD-3-Clause
-# Copyright(c) 2019 Intel Corporation
-
-eal_inc += include_directories('include')
-
-env_objs = []
-env_headers = files(
- 'include/rte_os.h',
-)
-env_sources = files('eal.c',
- 'eal_debug.c',
- 'eal_lcore.c',
- 'eal_thread.c',
- 'getopt.c',
-)
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Intel Corporation
+ */
+
+#include <stdarg.h>
+#include <rte_log.h>
+#include <rte_debug.h>
+
+/* call abort(); it will generate a coredump if enabled */
+void
+__rte_panic(const char *funcname, const char *format, ...)
+{
+ va_list ap;
+
+ rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, "PANIC in %s():\n", funcname);
+ va_start(ap, format);
+ rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap);
+ va_end(ap);
+ abort();
+}
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Intel Corporation
+ */
+
+#include <stdint.h>
+
+#include <rte_common.h>
+
+#include "eal_private.h"
+#include "eal_thread.h"
+
+/* global data structure that contains the CPU map */
+static struct _wcpu_map {
+ unsigned int total_procs;
+ unsigned int proc_sockets;
+ unsigned int proc_cores;
+ unsigned int reserved;
+ struct _win_lcore_map {
+ uint8_t socket_id;
+ uint8_t core_id;
+ } wlcore_map[RTE_MAX_LCORE];
+} wcpu_map = { 0 };
+
+/*
+ * Create a map of all processors and associated cores on the system
+ */
+void
+eal_create_cpu_map(void)
+{
+ wcpu_map.total_procs =
+ GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
+
+ LOGICAL_PROCESSOR_RELATIONSHIP lprocRel;
+ DWORD lprocInfoSize = 0;
+ BOOL ht_enabled = FALSE;
+
+ /* First get the processor package information */
+ lprocRel = RelationProcessorPackage;
+ /* Determine the size of buffer we need (pass NULL) */
+ GetLogicalProcessorInformationEx(lprocRel, NULL, &lprocInfoSize);
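+	/* The size query above only reports the required buffer size;
+	 * the count below assumes each returned
+	 * SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX record occupies 48 bytes.
+	 */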
+ wcpu_map.proc_sockets = lprocInfoSize / 48;
+
+ lprocInfoSize = 0;
+ /* Next get the processor core information */
+ lprocRel = RelationProcessorCore;
+ GetLogicalProcessorInformationEx(lprocRel, NULL, &lprocInfoSize);
+ wcpu_map.proc_cores = lprocInfoSize / 48;
+
+ if (wcpu_map.total_procs > wcpu_map.proc_cores)
+ ht_enabled = TRUE;
+
+ /* Distribute the socket and core ids appropriately
+ * across the logical cores. For now, split the cores
+ * equally across the sockets.
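+	 * When hyper-threading is detected, two consecutive logical
+	 * cores are assumed to be siblings on the same physical core.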
+ */
+ unsigned int lcore = 0;
+ for (unsigned int socket = 0; socket <
+ wcpu_map.proc_sockets; ++socket) {
+ for (unsigned int core = 0;
+ core < (wcpu_map.proc_cores / wcpu_map.proc_sockets);
+ ++core) {
+ wcpu_map.wlcore_map[lcore]
+ .socket_id = socket;
+ wcpu_map.wlcore_map[lcore]
+ .core_id = core;
+ lcore++;
+ if (ht_enabled) {
+ wcpu_map.wlcore_map[lcore]
+ .socket_id = socket;
+ wcpu_map.wlcore_map[lcore]
+ .core_id = core;
+ lcore++;
+ }
+ }
+ }
+}
+
+/*
+ * Check if a cpu is present by the presence of the cpu information for it
+ */
+int
+eal_cpu_detected(unsigned int lcore_id)
+{
+ return (lcore_id < wcpu_map.total_procs);
+}
+
+/*
+ * Get CPU socket id for a logical core
+ */
+unsigned
+eal_cpu_socket_id(unsigned int lcore_id)
+{
+ return wcpu_map.wlcore_map[lcore_id].socket_id;
+}
+
+/*
+ * Get CPU socket id (NUMA node) for a logical core
+ */
+unsigned
+eal_cpu_core_id(unsigned int lcore_id)
+{
+ return wcpu_map.wlcore_map[lcore_id].core_id;
+}
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Intel Corporation
+ */
+
+#include <io.h>
+
+#include <rte_atomic.h>
+#include <rte_debug.h>
+#include <rte_launch.h>
+#include <rte_lcore.h>
+#include <rte_per_lcore.h>
+#include <rte_common.h>
+#include <rte_memory.h>
+#include <eal_thread.h>
+
+#include "eal_private.h"
+
+RTE_DEFINE_PER_LCORE(unsigned int, _lcore_id) = LCORE_ID_ANY;
+RTE_DEFINE_PER_LCORE(unsigned int, _socket_id) = (unsigned int)SOCKET_ID_ANY;
+RTE_DEFINE_PER_LCORE(rte_cpuset_t, _cpuset);
+
+/*
+ * Send a message to a slave lcore identified by slave_id to call a
+ * function f with argument arg. Once the execution is done, the
+ * remote lcore switches to the FINISHED state.
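+ *
+ * The request is a single byte written on the master-to-slave pipe;
+ * the slave acknowledges on the slave-to-master pipe before calling f.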
+ */
+int
+rte_eal_remote_launch(lcore_function_t *f, void *arg, unsigned int slave_id)
+{
+ int n;
+ char c = 0;
+ int m2s = lcore_config[slave_id].pipe_master2slave[1];
+ int s2m = lcore_config[slave_id].pipe_slave2master[0];
+
+ if (lcore_config[slave_id].state != WAIT)
+ return -EBUSY;
+
+ lcore_config[slave_id].f = f;
+ lcore_config[slave_id].arg = arg;
+
+ /* send message */
+ n = 0;
+ while (n == 0 || (n < 0 && errno == EINTR))
+ n = _write(m2s, &c, 1);
+ if (n < 0)
+ rte_panic("cannot write on configuration pipe\n");
+
+ /* wait ack */
+ do {
+ n = _read(s2m, &c, 1);
+ } while (n < 0 && errno == EINTR);
+
+ if (n <= 0)
+ rte_panic("cannot read on configuration pipe\n");
+
+ return 0;
+}
+
+void
+eal_thread_init_master(unsigned int lcore_id)
+{
+ /* set the lcore ID in per-lcore memory area */
+ RTE_PER_LCORE(_lcore_id) = lcore_id;
+}
+
+static inline pthread_t
+eal_thread_self(void)
+{
+ return GetCurrentThreadId();
+}
+
+/* main loop of threads */
+void *
+eal_thread_loop(void *arg __rte_unused)
+{
+ char c;
+ int n, ret;
+ unsigned int lcore_id;
+ pthread_t thread_id;
+ int m2s, s2m;
+	/* affinity string is not populated on Windows yet; keep it empty */
+	char cpuset[RTE_CPU_AFFINITY_STR_LEN] = "";
+
+ thread_id = eal_thread_self();
+
+ /* retrieve our lcore_id from the configuration structure */
+ RTE_LCORE_FOREACH_SLAVE(lcore_id) {
+ if (thread_id == lcore_config[lcore_id].thread_id)
+ break;
+ }
+ if (lcore_id == RTE_MAX_LCORE)
+ rte_panic("cannot retrieve lcore id\n");
+
+ m2s = lcore_config[lcore_id].pipe_master2slave[0];
+ s2m = lcore_config[lcore_id].pipe_slave2master[1];
+
+ /* set the lcore ID in per-lcore memory area */
+ RTE_PER_LCORE(_lcore_id) = lcore_id;
+
+ RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%zx;cpuset=[%s])\n",
+ lcore_id, (uintptr_t)thread_id, cpuset);
+
+ /* read on our pipe to get commands */
+ while (1) {
+ void *fct_arg;
+
+ /* wait command */
+ do {
+ n = _read(m2s, &c, 1);
+ } while (n < 0 && errno == EINTR);
+
+ if (n <= 0)
+ rte_panic("cannot read on configuration pipe\n");
+
+ lcore_config[lcore_id].state = RUNNING;
+
+ /* send ack */
+ n = 0;
+ while (n == 0 || (n < 0 && errno == EINTR))
+ n = _write(s2m, &c, 1);
+ if (n < 0)
+ rte_panic("cannot write on configuration pipe\n");
+
+ if (lcore_config[lcore_id].f == NULL)
+ rte_panic("NULL function pointer\n");
+
+ /* call the function and store the return value */
+ fct_arg = lcore_config[lcore_id].arg;
+ ret = lcore_config[lcore_id].f(fct_arg);
+ lcore_config[lcore_id].ret = ret;
+ rte_wmb();
+
+ /* when a service core returns, it should go directly to WAIT
+ * state, because the application will not lcore_wait() for it.
+ */
+ if (lcore_config[lcore_id].core_role == ROLE_SERVICE)
+ lcore_config[lcore_id].state = WAIT;
+ else
+ lcore_config[lcore_id].state = FINISHED;
+ }
+}
+
+/* function to create threads */
+int
+eal_thread_create(pthread_t *thread)
+{
+ HANDLE th;
+
+ th = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)eal_thread_loop,
+ NULL, 0, (LPDWORD)thread);
+ if (!th)
+ return -1;
+
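+	/* run EAL threads at the highest Windows scheduling priority */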
+ SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS);
+ SetThreadPriority(th, THREAD_PRIORITY_TIME_CRITICAL);
+
+ return 0;
+}
+
+int
+rte_thread_setname(__rte_unused pthread_t id, __rte_unused const char *name)
+{
+ /* TODO */
+ /* This is a stub, not the expected result */
+ return 0;
+}
--- /dev/null
+/* SPDX-License-Identifier: ISC AND BSD-2-Clause
+ * Copyright (c) 2002 Todd C. Miller <Todd.Miller@courtesan.com>
+ *
+ * Sponsored in part by the Defense Advanced Research Projects
+ * Agency (DARPA) and Air Force Research Laboratory, Air Force
+ * Materiel Command, USAF, under agreement number F39502-99-1-0512.
+ */
+/*
+ * Copyright (c) 2000 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Dieter Baron and Thomas Klausner.
+ */
+
+#include <getopt.h>
+
+#ifdef NEED_USUAL_GETOPT
+
+#include <string.h>
+#include <stdlib.h>
+
+const char *optarg; /* argument associated with option */
+int opterr = 1; /* if error message should be printed */
+int optind = 1; /* index into parent argv vector */
+int optopt = '?'; /* character checked for validity */
+
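+/* Microsoft libc has no err.h, so warnx() is stubbed out and the
+ * diagnostic calls below compile to no-ops.
+ */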
+static void pass(void) {}
+#define warnx(a, ...) pass()
+
+#define PRINT_ERROR ((opterr) && (*options != ':'))
+
+#define FLAG_PERMUTE 0x01 /* permute non-options to the end of argv */
+#define FLAG_ALLARGS 0x02 /* treat non-options as args to option "-1" */
+#define FLAG_LONGONLY 0x04 /* operate as getopt_long_only */
+
+/* return values */
+#define BADCH ((int)'?')
+#define BADARG ((*options == ':') ? (int)':' : (int)'?')
+#define INORDER 1
+
+#define EMSG ""
+
+static const char *place = EMSG; /* option letter processing */
+
+/* XXX: set optreset to 1 rather than these two */
+static int nonopt_start = -1; /* first non option argument (for permute) */
+static int nonopt_end = -1; /* first option after non options (for permute) */
+
+/* Error messages */
+static const char recargchar[] = "option requires an argument -- %c";
+static const char recargstring[] = "option requires an argument -- %s";
+static const char ambig[] = "ambiguous option -- %.*s";
+static const char noarg[] = "option doesn't take an argument -- %.*s";
+static const char illoptchar[] = "unknown option -- %c";
+static const char illoptstring[] = "unknown option -- %s";
+
+/*
+ * Compute the greatest common divisor of a and b.
+ */
+static int
+gcd(int a, int b)
+{
+ int c;
+
+ c = a % b;
+ while (c != 0) {
+ a = b;
+ b = c;
+ c = a % b;
+ }
+
+ return (b);
+}
+
+/*
+ * Exchange the block from nonopt_start to nonopt_end with the block
+ * from nonopt_end to opt_end (keeping the same order of arguments
+ * in each block).
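+ *
+ * For example (hypothetical values): with panonopt_start=1,
+ * panonopt_end=3 and opt_end=4, {"prog", "file1", "file2", "-b"}
+ * becomes {"prog", "-b", "file1", "file2"}.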
+ */
+static void
+permute_args(int panonopt_start, int panonopt_end, int opt_end,
+ char **nargv)
+{
+ int cstart, cyclelen, i, j, ncycle, nnonopts, nopts, pos;
+ char *swap;
+
+ /*
+ * compute lengths of blocks and number and size of cycles
+ */
+ nnonopts = panonopt_end - panonopt_start;
+ nopts = opt_end - panonopt_end;
+ ncycle = gcd(nnonopts, nopts);
+ cyclelen = (opt_end - panonopt_start) / ncycle;
+
+ for (i = 0; i < ncycle; i++) {
+ cstart = panonopt_end+i;
+ pos = cstart;
+ for (j = 0; j < cyclelen; j++) {
+ if (pos >= panonopt_end)
+ pos -= nnonopts;
+ else
+ pos += nopts;
+ swap = nargv[pos];
+ /* LINTED const cast */
+ ((char **) nargv)[pos] = nargv[cstart];
+ /* LINTED const cast */
+ ((char **)nargv)[cstart] = swap;
+ }
+ }
+}
+
+/*
+ * parse_long_options --
+ * Parse long options in argc/argv argument vector.
+ * Returns -1 if short_too is set and the option does not match long_options.
+ */
+static int
+parse_long_options(char **nargv, const char *options,
+ const struct option *long_options, int *idx, int short_too)
+{
+ const char *current_argv;
+ char *has_equal;
+ size_t current_argv_len;
+ int i, match;
+
+ current_argv = place;
+ match = -1;
+
+ optind++;
+
+ has_equal = strchr(current_argv, '=');
+ if (has_equal != NULL) {
+ /* argument found (--option=arg) */
+ current_argv_len = has_equal - current_argv;
+ has_equal++;
+ } else
+ current_argv_len = strlen(current_argv);
+
+ for (i = 0; long_options[i].name; i++) {
+ /* find matching long option */
+ if (strncmp(current_argv, long_options[i].name,
+ current_argv_len))
+ continue;
+
+ if (strlen(long_options[i].name) == current_argv_len) {
+ /* exact match */
+ match = i;
+ break;
+ }
+ /*
+ * If this is a known short option, don't allow
+ * a partial match of a single character.
+ */
+ if (short_too && current_argv_len == 1)
+ continue;
+
+ if (match == -1) /* partial match */
+ match = i;
+ else {
+ /* ambiguous abbreviation */
+ if (PRINT_ERROR)
+ warnx(ambig, (int)current_argv_len,
+ current_argv);
+ optopt = 0;
+ return BADCH;
+ }
+ }
+ if (match != -1) { /* option found */
+ if (long_options[match].has_arg == no_argument
+ && has_equal) {
+ if (PRINT_ERROR)
+ warnx(noarg, (int)current_argv_len,
+ current_argv);
+ /*
+ * XXX: GNU sets optopt to val regardless of flag
+ */
+ if (long_options[match].flag == NULL)
+ optopt = long_options[match].val;
+ else
+ optopt = 0;
+ return BADARG;
+ }
+ if (long_options[match].has_arg == required_argument ||
+ long_options[match].has_arg == optional_argument) {
+ if (has_equal)
+ optarg = has_equal;
+ else if (long_options[match].has_arg ==
+ required_argument) {
+ /*
+ * optional argument doesn't use next nargv
+ */
+ optarg = nargv[optind++];
+ }
+ }
+ if ((long_options[match].has_arg == required_argument)
+ && (optarg == NULL)) {
+ /*
+ * Missing argument; leading ':' indicates no error
+ * should be generated.
+ */
+ if (PRINT_ERROR)
+ warnx(recargstring,
+ current_argv);
+ /*
+ * XXX: GNU sets optopt to val regardless of flag
+ */
+ if (long_options[match].flag == NULL)
+ optopt = long_options[match].val;
+ else
+ optopt = 0;
+ --optind;
+ return BADARG;
+ }
+ } else { /* unknown option */
+ if (short_too) {
+ --optind;
+ return (-1);
+ }
+ if (PRINT_ERROR)
+ warnx(illoptstring, current_argv);
+ optopt = 0;
+ return BADCH;
+ }
+ if (idx)
+ *idx = match;
+ if (long_options[match].flag) {
+ *long_options[match].flag = long_options[match].val;
+ return 0;
+ } else
+ return (long_options[match].val);
+}
+
+/*
+ * getopt_internal --
+ * Parse argc/argv argument vector. Called by user level routines.
+ */
+static int
+getopt_internal(int nargc, char **nargv, const char *options,
+ const struct option *long_options, int *idx, int flags)
+{
+ char *oli; /* option letter list index */
+ int optchar, short_too;
+ static int posixly_correct = -1;
+ char *buf;
+ size_t len;
+ int optreset = 0;
+
+ if (options == NULL)
+ return (-1);
+
+ /*
+ * Disable GNU extensions if POSIXLY_CORRECT is set or options
+ * string begins with a '+'.
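+	 * (_dupenv_s() returns 0 on success; when the variable is not
+	 * set, buf is NULL and the return value is still 0.)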
+ */
+ if (posixly_correct == -1)
+ posixly_correct = _dupenv_s(&buf, &len, "POSIXLY_CORRECT");
+ if (!posixly_correct || *options == '+')
+ flags &= ~FLAG_PERMUTE;
+ else if (*options == '-')
+ flags |= FLAG_ALLARGS;
+ if (*options == '+' || *options == '-')
+ options++;
+ if (!posixly_correct)
+ free(buf);
+ /*
+ * reset if requested
+ */
+ if (optind == 0)
+ optind = optreset = 1;
+
+ optarg = NULL;
+ if (optreset)
+ nonopt_start = nonopt_end = -1;
+start:
+ if (optreset || !*place) { /* update scanning pointer */
+ optreset = 0;
+ if (optind >= nargc) { /* end of argument vector */
+ place = EMSG;
+ if (nonopt_end != -1) {
+ /* do permutation, if we have to */
+ permute_args(nonopt_start, nonopt_end,
+ optind, nargv);
+ optind -= nonopt_end - nonopt_start;
+ } else if (nonopt_start != -1) {
+ /*
+ * If we skipped non-options, set optind
+ * to the first of them.
+ */
+ optind = nonopt_start;
+ }
+ nonopt_start = nonopt_end = -1;
+ return (-1);
+ }
+ place = nargv[optind];
+ if (*place != '-' ||
+ (place[1] == '\0' && strchr(options, '-') == NULL)) {
+ place = EMSG; /* found non-option */
+ if (flags & FLAG_ALLARGS) {
+ /*
+ * GNU extension:
+ * return non-option as argument to option 1
+ */
+ optarg = nargv[optind++];
+ return INORDER;
+ }
+ if (!(flags & FLAG_PERMUTE)) {
+ /*
+ * If no permutation wanted, stop parsing
+ * at first non-option.
+ */
+ return (-1);
+ }
+ /* do permutation */
+ if (nonopt_start == -1)
+ nonopt_start = optind;
+ else if (nonopt_end != -1) {
+ permute_args(nonopt_start, nonopt_end,
+ optind, nargv);
+ nonopt_start = optind -
+ (nonopt_end - nonopt_start);
+ nonopt_end = -1;
+ }
+ optind++;
+ /* process next argument */
+ goto start;
+ }
+ if (nonopt_start != -1 && nonopt_end == -1)
+ nonopt_end = optind;
+
+ /*
+ * If we have "-" do nothing, if "--" we are done.
+ */
+ if (place[1] != '\0' && *++place == '-' && place[1] == '\0') {
+ optind++;
+ place = EMSG;
+ /*
+ * We found an option (--), so if we skipped
+ * non-options, we have to permute.
+ */
+ if (nonopt_end != -1) {
+ permute_args(nonopt_start, nonopt_end,
+ optind, nargv);
+ optind -= nonopt_end - nonopt_start;
+ }
+ nonopt_start = nonopt_end = -1;
+ return (-1);
+ }
+ }
+
+ /*
+ * Check long options if:
+ * 1) we were passed some
+ * 2) the arg is not just "-"
+	 *  3) either the arg starts with -- or we are getopt_long_only()
+ */
+ if (long_options != NULL && place != nargv[optind] &&
+ (*place == '-' || (flags & FLAG_LONGONLY))) {
+ short_too = 0;
+ if (*place == '-')
+ place++; /* --foo long option */
+ else if (*place != ':' && strchr(options, *place) != NULL)
+ short_too = 1; /* could be short option too */
+
+ optchar = parse_long_options(nargv, options, long_options,
+ idx, short_too);
+ if (optchar != -1) {
+ place = EMSG;
+ return optchar;
+ }
+ }
+
+ optchar = (int)*place++;
+ oli = strchr(options, optchar);
+ if (optchar == (int)':' ||
+ (optchar == (int)'-' && *place != '\0') ||
+ oli == NULL) {
+ /*
+ * If the user specified "-" and '-' isn't listed in
+ * options, return -1 (non-option) as per POSIX.
+ * Otherwise, it is an unknown option character (or ':').
+ */
+ if (optchar == (int)'-' && *place == '\0')
+ return (-1);
+ if (!*place)
+ ++optind;
+ if (PRINT_ERROR)
+ warnx(illoptchar, optchar);
+ optopt = optchar;
+ return BADCH;
+ }
+ if (long_options != NULL && optchar == 'W' && oli[1] == ';') {
+ /* -W long-option */
+ if (*place)
+ ;
+ else if (++optind >= nargc) { /* no arg */
+ place = EMSG;
+ if (PRINT_ERROR)
+ warnx(recargchar, optchar);
+ optopt = optchar;
+ return BADARG;
+ } /* white space */
+ place = nargv[optind];
+ optchar = parse_long_options(nargv, options, long_options,
+ idx, 0);
+ place = EMSG;
+ return optchar;
+ }
+ if (*++oli != ':') { /* doesn't take argument */
+ if (!*place)
+ ++optind;
+ } else { /* takes (optional) argument */
+ optarg = NULL;
+ if (*place) /* no white space */
+ optarg = place;
+ else if (oli[1] != ':') { /* arg not optional */
+ if (++optind >= nargc) { /* no arg */
+ place = EMSG;
+ if (PRINT_ERROR)
+ warnx(recargchar, optchar);
+ optopt = optchar;
+ return BADARG;
+ }
+ optarg = nargv[optind];
+ }
+ place = EMSG;
+ ++optind;
+ }
+ /* dump back option letter */
+ return optchar;
+}
+
+/*
+ * getopt --
+ * Parse argc/argv argument vector.
+ */
+int
+getopt(int nargc, char *nargv[], const char *options)
+{
+ return getopt_internal(nargc, nargv, options, NULL, NULL,
+ FLAG_PERMUTE);
+}
+
+/*
+ * getopt_long --
+ * Parse argc/argv argument vector.
+ */
+int
+getopt_long(int nargc, char *nargv[], const char *options,
+ const struct option *long_options, int *idx)
+{
+
+ return (getopt_internal(nargc, nargv, options, long_options, idx,
+ FLAG_PERMUTE));
+}
+
+/*
+ * getopt_long_only --
+ * Parse argc/argv argument vector.
+ */
+int
+getopt_long_only(int nargc, char *nargv[], const char *options,
+ const struct option *long_options, int *idx)
+{
+
+ return (getopt_internal(nargc, nargv, options, long_options, idx,
+ FLAG_PERMUTE|FLAG_LONGONLY));
+}
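+
+/*
+ * Usage sketch (illustrative only, not part of the original code; the
+ * option names below are hypothetical). A typical caller loops over
+ * getopt_long() until it returns -1:
+ *
+ *	static const struct option opts[] = {
+ *		{ "file",    required_argument, NULL, 'f' },
+ *		{ "verbose", no_argument,       NULL, 'v' },
+ *		{ NULL, 0, NULL, 0 }
+ *	};
+ *	int c;
+ *	while ((c = getopt_long(argc, argv, "f:v", opts, NULL)) != -1) {
+ *		switch (c) {
+ *		case 'f':	// optarg points at the argument
+ *			break;
+ *		case 'v':
+ *			break;
+ *		default:	// '?' on unknown option
+ *			break;
+ *		}
+ *	}
+ */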
+
+#endif /* NEED_USUAL_GETOPT */
--- /dev/null
+/* SPDX-License-Identifier: MIT
+ * Dirent interface for Microsoft Visual Studio
+ * Version 1.21
+ * Copyright (C) 2006-2012 Toni Ronkko
+ * https://github.com/tronkko/dirent
+ */
+
+#ifndef DIRENT_H
+#define DIRENT_H
+
+/*
+ * Include windows.h without Windows Sockets 1.1 to prevent conflicts with
+ * Windows Sockets 2.0.
+ */
+#ifndef WIN32_LEAN_AND_MEAN
+# define WIN32_LEAN_AND_MEAN
+#endif
+
+#include <windows.h>
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <wchar.h>
+#include <string.h>
+#include <stdlib.h>
+#include <malloc.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <errno.h>
+
+/* Maximum length of file name */
+#if !defined(PATH_MAX)
+# define PATH_MAX MAX_PATH
+#endif
+
+/* File type flags for d_type */
+#define DT_UNKNOWN 0
+#define DT_REG S_IFREG
+#define DT_DIR S_IFDIR
+#define DT_CHR S_IFCHR
+
+/*
+ * File type macros. Note that block devices, sockets and links cannot be
+ * distinguished on Windows. The macros S_ISBLK, S_ISSOCK and S_ISLNK,
+ * where provided, exist only for compatibility and always return false
+ * on Windows; only S_ISDIR and S_ISREG are defined here.
+ */
+#if !defined(S_ISDIR)
+# define S_ISDIR(mode) (((mode) & S_IFMT) == S_IFDIR)
+#endif
+#if !defined(S_ISREG)
+# define S_ISREG(mode) (((mode) & S_IFMT) == S_IFREG)
+#endif
+
+/* Wide-character version */
+struct _wdirent {
+ /* Always zero */
+ long d_ino;
+
+ /* Structure size */
+ unsigned short d_reclen;
+
+ /* Length of name without \0 */
+ size_t d_namlen;
+
+ /* File type */
+ int d_type;
+
+ /* File name */
+ wchar_t d_name[PATH_MAX];
+};
+typedef struct _wdirent _wdirent;
+
+struct _WDIR {
+ /* Current directory entry */
+ struct _wdirent ent;
+
+ /* Private file data */
+ WIN32_FIND_DATAW data;
+
+ /* True if data is valid */
+ int cached;
+
+ /* Win32 search handle */
+ HANDLE handle;
+
+ /* Initial directory name */
+ wchar_t *patt;
+};
+typedef struct _WDIR _WDIR;
+
+static _WDIR *_wopendir(const wchar_t *dirname);
+static int _wclosedir(_WDIR *dirp);
+
+/* For compatibility with Symbian */
+#define wdirent _wdirent
+#define WDIR _WDIR
+#define wopendir _wopendir
+#define wclosedir _wclosedir
+
+/* Multi-byte character versions */
+struct dirent {
+ /* Always zero */
+ long d_ino;
+
+ /* Structure size */
+ unsigned short d_reclen;
+
+ /* Length of name without \0 */
+ size_t d_namlen;
+
+ /* File type */
+ int d_type;
+
+ /* File name */
+ char d_name[PATH_MAX];
+};
+typedef struct dirent dirent;
+
+struct DIR {
+ struct dirent ent;
+ struct _WDIR *wdirp;
+};
+typedef struct DIR DIR;
+
+static DIR *opendir(const char *dirname);
+static struct dirent *readdir(DIR *dirp);
+static int closedir(DIR *dirp);
+
+/* Internal utility functions */
+static WIN32_FIND_DATAW *dirent_first(_WDIR *dirp);
+static WIN32_FIND_DATAW *dirent_next(_WDIR *dirp);
+
+static int dirent_mbstowcs_s(
+ size_t *pReturnValue,
+ wchar_t *wcstr,
+ size_t sizeInWords,
+ const char *mbstr,
+ size_t count);
+
+static int dirent_wcstombs_s(
+ size_t *pReturnValue,
+ char *mbstr,
+ size_t sizeInBytes,
+ const wchar_t *wcstr,
+ size_t count);
+
+static void dirent_set_errno(int error);
+
+/*
+ * Open directory stream DIRNAME for read and return a pointer to the
+ * internal working area that is used to retrieve individual directory
+ * entries.
+ */
+static _WDIR*
+_wopendir(const wchar_t *dirname)
+{
+ _WDIR *dirp = NULL;
+ int error;
+
+ /* Must have directory name */
+ if (dirname == NULL || dirname[0] == '\0') {
+ dirent_set_errno(ENOENT);
+ return NULL;
+ }
+
+ /* Allocate new _WDIR structure */
+ dirp = (_WDIR *)malloc(sizeof(struct _WDIR));
+ if (dirp != NULL) {
+ DWORD n;
+
+ /* Reset _WDIR structure */
+ dirp->handle = INVALID_HANDLE_VALUE;
+ dirp->patt = NULL;
+ dirp->cached = 0;
+
+ /* Compute the length of full path plus zero terminator
+ *
+ * Note that on WinRT there's no way to convert relative paths
+		 * into absolute paths, so just assume it's an absolute path.
+ */
+ #if defined(WINAPI_FAMILY) && (WINAPI_FAMILY == WINAPI_FAMILY_PHONE_APP)
+ n = wcslen(dirname);
+ #else
+ n = GetFullPathNameW(dirname, 0, NULL, NULL);
+ #endif
+
+ /* Allocate room for absolute directory name and search
+ * pattern
+ */
+ dirp->patt = (wchar_t *)malloc(sizeof(wchar_t) * n + 16);
+ if (dirp->patt) {
+ /* Convert relative directory name to an
+ * absolute one. This allows rewinddir() to
+ * function correctly even when current working
+ * directory is changed between opendir()
+ * and rewinddir().
+ *
+ * Note that on WinRT there's no way to convert
+ * relative paths into absolute paths, so just
+			 * assume it's an absolute path.
+ */
+ #if defined(WINAPI_FAMILY) && \
+ (WINAPI_FAMILY == WINAPI_FAMILY_PHONE_APP)
+ wcsncpy_s(dirp->patt, n + 1, dirname, n);
+ #else
+ n = GetFullPathNameW(dirname, n, dirp->patt, NULL);
+ #endif
+ if (n > 0) {
+ wchar_t *p;
+
+ /* Append search pattern \* to the directory
+ * name
+ */
+ p = dirp->patt + n;
+ if (dirp->patt < p) {
+ switch (p[-1]) {
+ case '\\':
+ case '/':
+ case ':':
+ /* Directory ends in path separator,
+					 * e.g. c:\temp\
+ */
+ /*NOP*/;
+ break;
+
+ default:
+ /* Directory name doesn't end in path
+ * separator
+ */
+ *p++ = '\\';
+ }
+ }
+ *p++ = '*';
+ *p = '\0';
+
+ /* Open directory stream and retrieve the first
+ * entry
+ */
+ if (dirent_first(dirp)) {
+ /* Directory stream opened successfully */
+ error = 0;
+ } else {
+ /* Cannot retrieve first entry */
+ error = 1;
+ dirent_set_errno(ENOENT);
+ }
+
+ } else {
+ /* Cannot retrieve full path name */
+ dirent_set_errno(ENOENT);
+ error = 1;
+ }
+
+ } else {
+ /* Cannot allocate memory for search pattern */
+ error = 1;
+ }
+
+ } else {
+ /* Cannot allocate _WDIR structure */
+ error = 1;
+ }
+
+ /* Clean up in case of error */
+ if (error && dirp) {
+ _wclosedir(dirp);
+ dirp = NULL;
+ }
+
+ return dirp;
+}
+
+/*
+ * Close directory stream opened by opendir() function.
+ * This invalidates the DIR structure as well as any directory
+ * entry read previously by _wreaddir().
+ */
+static int
+_wclosedir(_WDIR *dirp)
+{
+ int ok;
+ if (dirp) {
+
+ /* Release search handle */
+ if (dirp->handle != INVALID_HANDLE_VALUE) {
+ FindClose(dirp->handle);
+ dirp->handle = INVALID_HANDLE_VALUE;
+ }
+
+ /* Release search pattern */
+ if (dirp->patt) {
+ free(dirp->patt);
+ dirp->patt = NULL;
+ }
+
+ /* Release directory structure */
+ free(dirp);
+ ok = /*success*/0;
+
+ } else {
+ /* Invalid directory stream */
+ dirent_set_errno(EBADF);
+ ok = /*failure*/-1;
+ }
+ return ok;
+}
+
+/* Get first directory entry (internal) */
+static WIN32_FIND_DATAW*
+dirent_first(_WDIR *dirp)
+{
+ WIN32_FIND_DATAW *datap;
+
+ /* Open directory and retrieve the first entry */
+ dirp->handle = FindFirstFileExW(
+ dirp->patt, FindExInfoStandard, &dirp->data,
+ FindExSearchNameMatch, NULL, 0);
+ if (dirp->handle != INVALID_HANDLE_VALUE) {
+
+ /* a directory entry is now waiting in memory */
+ datap = &dirp->data;
+ dirp->cached = 1;
+
+ } else {
+
+ /* Failed to re-open directory: no directory entry in memory */
+ dirp->cached = 0;
+ datap = NULL;
+
+ }
+ return datap;
+}
+
+/* Get next directory entry (internal) */
+static WIN32_FIND_DATAW*
+dirent_next(_WDIR *dirp)
+{
+ WIN32_FIND_DATAW *p;
+
+ /* Get next directory entry */
+ if (dirp->cached != 0) {
+
+ /* A valid directory entry already in memory */
+ p = &dirp->data;
+ dirp->cached = 0;
+
+ } else if (dirp->handle != INVALID_HANDLE_VALUE) {
+
+ /* Get the next directory entry from stream */
+ if (FindNextFileW(dirp->handle, &dirp->data) != FALSE) {
+ /* Got a file */
+ p = &dirp->data;
+ } else {
+ /* The very last entry has been processed
+			 * or an error occurred
+ */
+ FindClose(dirp->handle);
+ dirp->handle = INVALID_HANDLE_VALUE;
+ p = NULL;
+ }
+
+ } else {
+
+ /* End of directory stream reached */
+ p = NULL;
+
+ }
+
+ return p;
+}
+
+/*
+ * Open directory stream using plain old C-string.
+ */
+static DIR*
+opendir(const char *dirname)
+{
+ struct DIR *dirp;
+ int error;
+
+ /* Must have directory name */
+ if (dirname == NULL || dirname[0] == '\0') {
+ dirent_set_errno(ENOENT);
+ return NULL;
+ }
+
+ /* Allocate memory for DIR structure */
+ dirp = (DIR *)malloc(sizeof(struct DIR));
+ if (dirp) {
+ wchar_t wname[PATH_MAX];
+ size_t n;
+
+ /* Convert directory name to wide-character string */
+ error = dirent_mbstowcs_s(&n, wname, PATH_MAX,
+ dirname, PATH_MAX);
+ if (!error) {
+
+ /* Open directory stream using wide-character name */
+ dirp->wdirp = _wopendir(wname);
+ if (dirp->wdirp) {
+ /* Directory stream opened */
+ error = 0;
+ } else {
+ /* Failed to open directory stream */
+ error = 1;
+ }
+
+ } else {
+ /*
+ * Cannot convert file name to wide-character string.
+ * This occurs if the string contains invalid multi-byte
+ * sequences or the output buffer is too small to
+ * contain the resulting string.
+ */
+ error = 1;
+ }
+
+ } else {
+ /* Cannot allocate DIR structure */
+ error = 1;
+ }
+
+ /* Clean up in case of error */
+ if (error && dirp) {
+ free(dirp);
+ dirp = NULL;
+ }
+
+ return dirp;
+}
+
+/*
+ * Read next directory entry.
+ *
+ * When working with text consoles, please note that file names
+ * returned by readdir() are represented in the default ANSI code
+ * page while any output to console is typically formatted on another
+ * code page. Thus, non-ASCII characters in file names will not usually
+ * display correctly on console. The problem can be fixed in two ways:
+ * (1) change the character set of console to 1252 using chcp utility
+ * and use Lucida Console font, or (2) use _cprintf function when
+ * writing to console. The _cprintf() will re-encode ANSI strings to the
+ * console code page so many non-ASCII characters will display correctly.
+ */
+static struct dirent*
+readdir(DIR *dirp)
+{
+ WIN32_FIND_DATAW *datap;
+ struct dirent *entp;
+
+ /* Read next directory entry */
+ datap = dirent_next(dirp->wdirp);
+ if (datap) {
+ size_t n;
+ int error;
+
+ /* Attempt to convert file name to multi-byte string */
+ error = dirent_wcstombs_s(&n, dirp->ent.d_name,
+ PATH_MAX, datap->cFileName, PATH_MAX);
+
+ /*
+ * If the file name cannot be represented by a multi-byte
+ * string, then attempt to use old 8+3 file name.
+		 * This allows traditional Unix code to access some file
+		 * names despite their Unicode characters, although the
+		 * file names may seem unfamiliar to the user.
+		 *
+		 * Beware that the code below cannot come up with a short
+ * file name unless the file system provides one. At least
+ * VirtualBox shared folders fail to do this.
+ */
+ if (error && datap->cAlternateFileName[0] != '\0') {
+ error = dirent_wcstombs_s(
+ &n, dirp->ent.d_name, PATH_MAX,
+ datap->cAlternateFileName, PATH_MAX);
+ }
+
+ if (!error) {
+ DWORD attr;
+
+ /* Initialize directory entry for return */
+ entp = &dirp->ent;
+
+ /* Length of file name excluding zero terminator */
+ entp->d_namlen = n - 1;
+
+ /* File attributes */
+ attr = datap->dwFileAttributes;
+ if ((attr & FILE_ATTRIBUTE_DEVICE) != 0)
+ entp->d_type = DT_CHR;
+ else if ((attr & FILE_ATTRIBUTE_DIRECTORY) != 0)
+ entp->d_type = DT_DIR;
+ else
+ entp->d_type = DT_REG;
+
+ /* Reset dummy fields */
+ entp->d_ino = 0;
+ entp->d_reclen = sizeof(struct dirent);
+
+ } else {
+ /*
+ * Cannot convert file name to multi-byte string so
+ * construct an erroneous directory entry and return
+ * that. Note that we cannot return NULL as that would
+ * stop the processing of directory entries completely.
+ */
+ entp = &dirp->ent;
+ entp->d_name[0] = '?';
+ entp->d_name[1] = '\0';
+ entp->d_namlen = 1;
+ entp->d_type = DT_UNKNOWN;
+ entp->d_ino = 0;
+ entp->d_reclen = 0;
+ }
+
+ } else {
+ /* No more directory entries */
+ entp = NULL;
+ }
+
+ return entp;
+}
+
+/*
+ * Close directory stream.
+ */
+static int
+closedir(DIR *dirp)
+{
+ int ok;
+ if (dirp) {
+
+ /* Close wide-character directory stream */
+ ok = _wclosedir(dirp->wdirp);
+ dirp->wdirp = NULL;
+
+ /* Release multi-byte character version */
+ free(dirp);
+
+ } else {
+
+ /* Invalid directory stream */
+ dirent_set_errno(EBADF);
+ ok = /*failure*/-1;
+
+ }
+ return ok;
+}
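+
+/*
+ * Usage sketch (illustrative only, not part of the original code): the
+ * opendir()/readdir()/closedir() trio above follows the POSIX pattern,
+ * so portable callers can enumerate a directory as:
+ *
+ *	DIR *d = opendir("C:\\temp");
+ *	if (d != NULL) {
+ *		struct dirent *e;
+ *		while ((e = readdir(d)) != NULL)
+ *			printf("%s\n", e->d_name);
+ *		closedir(d);
+ *	}
+ */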
+
+/* Convert multi-byte string to wide character string */
+static int
+dirent_mbstowcs_s(
+ size_t *pReturnValue,
+ wchar_t *wcstr,
+ size_t sizeInWords,
+ const char *mbstr,
+ size_t count)
+{
+ int error;
+
+ #if defined(_MSC_VER) && _MSC_VER >= 1400
+ /* Microsoft Visual Studio 2005 or later */
+ error = mbstowcs_s(pReturnValue, wcstr,
+ sizeInWords, mbstr, count);
+ #else
+
+ /* Older Visual Studio or non-Microsoft compiler */
+ size_t n;
+
+ /* Convert to wide-character string (or count characters) */
+ n = mbstowcs(wcstr, mbstr, sizeInWords);
+ if (!wcstr || n < count) {
+
+ /* Zero-terminate output buffer */
+ if (wcstr && sizeInWords) {
+ if (n >= sizeInWords)
+ n = sizeInWords - 1;
+ wcstr[n] = 0;
+ }
+
+		/* Length of resulting wide-character string WITH
+		 * zero terminator
+		 */
+ if (pReturnValue)
+ *pReturnValue = n + 1;
+
+ /* Success */
+ error = 0;
+
+ } else {
+
+ /* Could not convert string */
+ error = 1;
+
+ }
+ #endif
+
+ return error;
+}
+
+/* Convert wide-character string to multi-byte string */
+static int
+dirent_wcstombs_s(
+ size_t *pReturnValue,
+ char *mbstr,
+ size_t sizeInBytes, /* max size of mbstr */
+ const wchar_t *wcstr,
+ size_t count)
+{
+ int error;
+
+ #if defined(_MSC_VER) && _MSC_VER >= 1400
+ /* Microsoft Visual Studio 2005 or later */
+ error = wcstombs_s(pReturnValue, mbstr, sizeInBytes, wcstr, count);
+ #else
+ /* Older Visual Studio or non-Microsoft compiler */
+ size_t n;
+
+ /* Convert to multi-byte string
+ * (or count the number of bytes needed)
+ */
+ n = wcstombs(mbstr, wcstr, sizeInBytes);
+ if (!mbstr || n < count) {
+ /* Zero-terminate output buffer */
+ if (mbstr && sizeInBytes) {
+ if (n >= sizeInBytes)
+ n = sizeInBytes - 1;
+ mbstr[n] = '\0';
+ }
+		/* Length of resulting multi-byte string WITH
+		 * zero terminator
+		 */
+ if (pReturnValue)
+ *pReturnValue = n + 1;
+ /* Success */
+ error = 0;
+ } else {
+ /* Cannot convert string */
+ error = 1;
+ }
+ #endif
+
+ return error;
+}
+
+/* Set errno variable */
+static void
+dirent_set_errno(int error)
+{
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+ /* Microsoft Visual Studio 2005 and later */
+ _set_errno(error);
+#else
+
+ /* Non-Microsoft compiler or older Microsoft compiler */
+ errno = error;
+#endif
+}
+
+#ifdef __cplusplus
+}
+#endif
+#endif /*DIRENT_H*/
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Intel Corporation
+ */
+
+#ifndef _FNMATCH_H_
+#define _FNMATCH_H_
+
+/**
+ * This file is required to support the common code in eal_common_log.c
+ * as Microsoft libc does not contain fnmatch.h. This may be removed in
+ * future releases.
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define FNM_NOMATCH 1
+
+/**
+ * This function is used for searching a given source string
+ * for the given shell wildcard (glob) pattern.
+ *
+ * @param pattern
+ *	glob notation describing the pattern to match
+ *
+ * @param string
+ *	source string to search for the pattern
+ *
+ * @param flags
+ *	flags modifying how the pattern is matched
+ *
+ * @return
+ *	0 if the pattern is found, FNM_NOMATCH otherwise
+ */
+static inline int fnmatch(__rte_unused const char *pattern,
+ __rte_unused const char *string,
+ __rte_unused int flags)
+{
+ /* TODO */
+ /* This is a stub, not the expected result */
+ return FNM_NOMATCH;
+}
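+
+/*
+ * For reference (illustrative only, not part of the original header): a
+ * conforming implementation would return 0 for
+ * fnmatch("*.txt", "log.txt", 0) and FNM_NOMATCH for
+ * fnmatch("*.txt", "log.csv", 0); this stub reports FNM_NOMATCH for
+ * every input until a real implementation is added.
+ */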
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _FNMATCH_H_ */
--- /dev/null
+/* SPDX-License-Identifier: BSD-2-Clause
+ * Copyright (c) 2000 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Dieter Baron and Thomas Klausner.
+ */
+
+/**
+ * @file
+ * getopt compat.
+ *
+ * This module provides getopt(), getopt_long() and getopt_long_only().
+ */
+
+#ifndef _USUAL_GETOPT_H_
+#define _USUAL_GETOPT_H_
+
+#ifndef NEED_USUAL_GETOPT
+#if !defined(HAVE_GETOPT_H) || !defined(HAVE_GETOPT) || \
+ !defined(HAVE_GETOPT_LONG)
+#define NEED_USUAL_GETOPT
+#endif
+#endif
+
+#ifndef NEED_USUAL_GETOPT
+
+/* Use system getopt */
+#ifdef RTE_TOOLCHAIN_GCC
+#include_next <getopt.h>
+#else
+#include <getopt.h>
+#endif
+
+#else /* NEED_USUAL_GETOPT */
+
+/* avoid name collision */
+#define optarg usual_optarg
+#define opterr usual_opterr
+#define optind usual_optind
+#define optopt usual_optopt
+#define getopt(a, b, c) usual_getopt(a, b, c)
+#define getopt_long(a, b, c, d, e) usual_getopt_long(a, b, c, d, e)
+
+
+/** argument to current option, or NULL if it has none */
+extern const char *optarg;
+/** Current position in arg string. Starts from 1.
+ * Setting to 0 resets state.
+ */
+extern int optind;
+/** whether getopt() should print error messages on problems. Default: 1. */
+extern int opterr;
+/** Option char which caused error */
+extern int optopt;
+
+/** long option takes no argument */
+#define no_argument 0
+/** long option requires argument */
+#define required_argument 1
+/** long option has optional argument */
+#define optional_argument 2
+
+/** Long option description */
+struct option {
+ /** name of long option */
+ const char *name;
+
+ /**
+ * whether option takes an argument.
+ * One of no_argument, required_argument, and optional_argument.
+ */
+ int has_arg;
+
+ /** if not NULL, set *flag to val when option found */
+ int *flag;
+
+ /** if flag not NULL, value to set *flag to; else return value */
+ int val;
+};
+
+/** Compat: getopt */
+int getopt(int argc, char *argv[], const char *options);
+
+/** Compat: getopt_long */
+int getopt_long(int argc, char *argv[], const char *options,
+ const struct option *longopts, int *longindex);
+
+/** Compat: getopt_long_only */
+int getopt_long_only(int nargc, char *argv[], const char *options,
+ const struct option *long_options, int *idx);
+
+
+#endif /* NEED_USUAL_GETOPT */
+
+#endif /* !_USUAL_GETOPT_H_ */
--- /dev/null
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright 2020 Mellanox Technologies, Ltd
+
+includes += include_directories('.')
+
+headers += files(
+ 'rte_os.h',
+)
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Intel Corporation
+ */
+
+#ifndef _PTHREAD_H_
+#define _PTHREAD_H_
+
+/**
+ * This file is required to support the common code in eal_common_proc.c,
+ * eal_common_thread.c and common\include\rte_per_lcore.h as Microsoft libc
+ * does not contain pthread.h. This may be removed in future releases.
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <windows.h>
+
+#define PTHREAD_BARRIER_SERIAL_THREAD TRUE
+
+/* define pthread_t on Windows since there is none in Microsoft libc */
+typedef uintptr_t pthread_t;
+
+/* define pthread_attr_t on Windows since there is none in Microsoft libc */
+typedef void *pthread_attr_t;
+
+typedef SYNCHRONIZATION_BARRIER pthread_barrier_t;
+
+#define pthread_barrier_init(barrier, attr, count) \
+ InitializeSynchronizationBarrier(barrier, count, -1)
+#define pthread_barrier_wait(barrier) EnterSynchronizationBarrier(barrier, \
+ SYNCHRONIZATION_BARRIER_FLAGS_BLOCK_ONLY)
+#define pthread_barrier_destroy(barrier) \
+ DeleteSynchronizationBarrier(barrier)
+#define pthread_cancel(thread) TerminateThread((HANDLE) thread, 0)
+
+/* pthread function overrides */
+#define pthread_self() \
+ ((pthread_t)GetCurrentThreadId())
+#define pthread_setaffinity_np(thread, size, cpuset) \
+ eal_set_thread_affinity_mask(thread, (unsigned long *) cpuset)
+#define pthread_getaffinity_np(thread, size, cpuset) \
+ eal_get_thread_affinity_mask(thread, (unsigned long *) cpuset)
+#define pthread_create(threadid, threadattr, threadfunc, args) \
+ eal_create_thread(threadid, threadfunc, args)
+
+static inline int
+eal_set_thread_affinity_mask(pthread_t threadid, unsigned long *cpuset)
+{
+ SetThreadAffinityMask((HANDLE) threadid, *cpuset);
+ return 0;
+}
+
+static inline int
+eal_get_thread_affinity_mask(pthread_t threadid, unsigned long *cpuset)
+{
+ /* Workaround for the lack of a GetThreadAffinityMask()
+	 * API in Windows
+ */
+ /* obtain previous mask by setting dummy mask */
+ DWORD dwprevaffinitymask =
+ SetThreadAffinityMask((HANDLE) threadid, 0x1);
+ /* set it back! */
+ SetThreadAffinityMask((HANDLE) threadid, dwprevaffinitymask);
+ *cpuset = dwprevaffinitymask;
+ return 0;
+}
+
+static inline int
+eal_create_thread(void *threadid, void *threadfunc, void *args)
+{
+ HANDLE hThread;
+ hThread = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)threadfunc,
+ args, 0, (LPDWORD)threadid);
+ if (hThread) {
+ SetPriorityClass(GetCurrentProcess(), REALTIME_PRIORITY_CLASS);
+ SetThreadPriority(hThread, THREAD_PRIORITY_TIME_CRITICAL);
+ }
+ return ((hThread != NULL) ? 0 : E_FAIL);
+}
+
+static inline int
+pthread_join(pthread_t thread __attribute__((__unused__)),
+ void **value_ptr __attribute__((__unused__)))
+{
+ return 0;
+}
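+
+/*
+ * Usage sketch (illustrative only, not part of the original header; the
+ * worker function and the mask value are hypothetical). The macros above
+ * let common EAL code keep the pthread spelling:
+ *
+ *	pthread_t tid;
+ *	unsigned long cpuset = 1UL << 2;	// pin to logical core 2
+ *
+ *	if (pthread_create(&tid, NULL, worker, NULL) == 0)
+ *		pthread_setaffinity_np(tid, sizeof(cpuset), &cpuset);
+ *
+ * which expands to eal_create_thread() and
+ * eal_set_thread_affinity_mask() on Windows.
+ */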
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _PTHREAD_H_ */
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Intel Corporation
+ */
+
+#ifndef _REGEX_H_
+#define _REGEX_H_
+
+/**
+ * This file is required to support the common code in eal_common_log.c
+ * as Microsoft libc does not contain regex.h. This may be removed in
+ * future releases.
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define REG_NOMATCH 1
+#define REG_ESPACE 12
+
+#include <rte_common.h>
+
+/* defining regex_t for Windows */
+typedef void *regex_t;
+/* defining regmatch_t for Windows */
+typedef void *regmatch_t;
+
+/**
+ * The regcomp() function will compile the regular expression
+ * contained in the string pointed to by the pattern argument
+ * and place the results in the structure pointed to by preg.
+ * The cflags argument is the bitwise inclusive OR of zero or
+ * more of the flags
+ */
+static inline int regcomp(__rte_unused regex_t *preg,
+ __rte_unused const char *regex, __rte_unused int cflags)
+{
+ /* TODO */
+ /* This is a stub, not the expected result */
+ return REG_ESPACE;
+}
+
+/**
+ * The regexec() function compares the null-terminated string
+ * specified by string with the compiled regular expression
+ * preg initialised by a previous call to regcomp(). If it finds
+ * a match, regexec() returns 0; otherwise it returns non-zero
+ * indicating either no match or an error. The eflags argument
+ * is the bitwise inclusive OR of zero or more of the flags.
+ */
+static inline int regexec(__rte_unused const regex_t *preg,
+ __rte_unused const char *string, __rte_unused size_t nmatch,
+ __rte_unused regmatch_t pmatch[], __rte_unused int eflags)
+{
+ /* TODO */
+ /* This is a stub, not the expected result */
+ return REG_NOMATCH;
+}
+
+/**
+ * The regerror() function provides a mapping from error codes
+ * returned by regcomp() and regexec() to unspecified printable strings.
+ */
+static inline size_t regerror(__rte_unused int errcode,
+ __rte_unused const regex_t *preg, char *errbuf,
+ __rte_unused size_t errbuf_size)
+{
+ /* TODO */
+ /* This is a stub, not the expected result */
+ if (errbuf) {
+ *errbuf = '\0';
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * The regfree() function frees any memory allocated by regcomp()
+ * associated with preg.
+ */
+static inline void regfree(__rte_unused regex_t *preg)
+{
+ /* TODO */
+ /* This is a stub, not the expected result */
+}
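+
+/*
+ * For reference (illustrative only, not part of the original header):
+ * POSIX callers follow the sequence regcomp() -> regexec() -> regfree().
+ * With these stubs regcomp() always reports REG_ESPACE, so common code
+ * sees a compile failure and skips matching.
+ */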
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _REGEX_H_ */
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2019 Intel Corporation
+ */
+
+#ifndef _RTE_OS_H_
+#define _RTE_OS_H_
+
+/**
+ * This header should contain any function/macro definitions
+ * that are not supported natively or are named differently in
+ * the Windows OS. Functions will be added in future releases.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <windows.h>
+#include <basetsd.h>
+#include <pthread.h>
+#include <stdio.h>
+
+/* limits.h replacement */
+#include <stdlib.h>
+#ifndef PATH_MAX
+#define PATH_MAX _MAX_PATH
+#endif
+
+#define strerror_r(a, b, c) strerror_s(b, c, a)
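+
+/*
+ * Note on argument order (for reference): POSIX strerror_r(errnum, buf,
+ * buflen) maps onto Microsoft strerror_s(buf, buflen, errnum), so the
+ * macro above rotates the error number to the last position.
+ */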
+
+/* strdup is deprecated in Microsoft libc and _strdup is preferred */
+#define strdup(str) _strdup(str)
+
+typedef SSIZE_T ssize_t;
+
+#define strtok_r(str, delim, saveptr) strtok_s(str, delim, saveptr)
+
+#define index(a, b) strchr(a, b)
+#define rindex(a, b) strrchr(a, b)
+
+#define strncasecmp(s1, s2, count) _strnicmp(s1, s2, count)
+
+/**
+ * Create a thread.
+ * This function is private to EAL.
+ *
+ * @param thread
+ * The location to store the thread id if successful.
+ * @return
+ * 0 for success, -1 if the thread is not created.
+ */
+int eal_thread_create(pthread_t *thread);
+
+/**
+ * Create a map of processors and cores on the system.
+ * This function is private to EAL.
+ */
+void eal_create_cpu_map(void);
+
+#ifndef RTE_TOOLCHAIN_GCC
+static inline int
+asprintf(char **buffer, const char *format, ...)
+{
+ int size, ret;
+ va_list arg;
+
+ va_start(arg, format);
+ size = vsnprintf(NULL, 0, format, arg);
+ va_end(arg);
+ if (size < 0)
+ return -1;
+ size++;
+
+ *buffer = malloc(size);
+ if (*buffer == NULL)
+ return -1;
+
+ va_start(arg, format);
+ ret = vsnprintf(*buffer, size, format, arg);
+ va_end(arg);
+ if (ret != size - 1) {
+ free(*buffer);
+ return -1;
+ }
+ return ret;
+}
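+
+/*
+ * Usage sketch (illustrative only, not part of the original header):
+ * asprintf() sizes the buffer with a first vsnprintf(NULL, 0, ...) pass,
+ * allocates it, then formats into it; the caller owns the result and
+ * must free() it:
+ *
+ *	char *msg = NULL;
+ *	if (asprintf(&msg, "lcore %u ready", 2u) >= 0) {
+ *		puts(msg);
+ *		free(msg);
+ *	}
+ */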
+#endif /* RTE_TOOLCHAIN_GCC */
+
+/* cpu_set macros implementation */
+#define RTE_CPU_AND(dst, src1, src2) CPU_AND(dst, src1, src2)
+#define RTE_CPU_OR(dst, src1, src2) CPU_OR(dst, src1, src2)
+#define RTE_CPU_FILL(set) CPU_FILL(set)
+#define RTE_CPU_NOT(dst, src) CPU_NOT(dst, src)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_OS_H_ */
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Intel Corporation
+ */
+
+#ifndef _SCHED_H_
+#define _SCHED_H_
+
+/**
+ * This file is added to support the common code in eal_common_thread.c
+ * as Microsoft libc does not contain sched.h. This may be removed
+ * in future releases.
+ */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef CPU_SETSIZE
+#define CPU_SETSIZE RTE_MAX_LCORE
+#endif
+
+#define _BITS_PER_SET (sizeof(long long) * 8)
+#define _BIT_SET_MASK (_BITS_PER_SET - 1)
+
+#define _NUM_SETS(b) (((b) + _BIT_SET_MASK) / _BITS_PER_SET)
+#define _WHICH_SET(b) ((b) / _BITS_PER_SET)
+#define _WHICH_BIT(b) ((b) & (_BITS_PER_SET - 1))
+
+typedef struct _rte_cpuset_s {
+ long long _bits[_NUM_SETS(CPU_SETSIZE)];
+} rte_cpuset_t;
+
+#define CPU_SET(b, s) ((s)->_bits[_WHICH_SET(b)] |= (1LL << _WHICH_BIT(b)))
+
+#define CPU_ZERO(s) \
+ do { \
+ unsigned int _i; \
+ \
+ for (_i = 0; _i < _NUM_SETS(CPU_SETSIZE); _i++) \
+ (s)->_bits[_i] = 0LL; \
+ } while (0)
+
+#define CPU_ISSET(b, s) (((s)->_bits[_WHICH_SET(b)] & \
+ (1LL << _WHICH_BIT(b))) != 0LL)
+
+/* Count the CPUs present in the set */
+static inline int
+count_cpu(rte_cpuset_t *s)
+{
+	unsigned int _i;
+	int count = 0;
+
+	for (_i = 0; _i < CPU_SETSIZE; _i++)
+		if (CPU_ISSET(_i, s) != 0LL)
+			count++;
+	return count;
+}
+#define CPU_COUNT(s) count_cpu(s)
+
+#define CPU_AND(dst, src1, src2) \
+do { \
+ unsigned int _i; \
+ \
+ for (_i = 0; _i < _NUM_SETS(CPU_SETSIZE); _i++) \
+ (dst)->_bits[_i] = (src1)->_bits[_i] & (src2)->_bits[_i]; \
+} while (0)
+
+#define CPU_OR(dst, src1, src2) \
+do { \
+ unsigned int _i; \
+ \
+ for (_i = 0; _i < _NUM_SETS(CPU_SETSIZE); _i++) \
+ (dst)->_bits[_i] = (src1)->_bits[_i] | (src2)->_bits[_i]; \
+} while (0)
+
+#define CPU_FILL(s) \
+do { \
+ unsigned int _i; \
+ for (_i = 0; _i < _NUM_SETS(CPU_SETSIZE); _i++) \
+ (s)->_bits[_i] = -1LL; \
+} while (0)
+
+#define CPU_NOT(dst, src) \
+do { \
+ unsigned int _i; \
+ for (_i = 0; _i < _NUM_SETS(CPU_SETSIZE); _i++) \
+ (dst)->_bits[_i] = (src)->_bits[_i] ^ -1LL; \
+} while (0)
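+
+/*
+ * Usage sketch (illustrative only, not part of the original header):
+ * with _BITS_PER_SET == 64, bit b lives in _bits[b / 64] at position
+ * b % 64, so pinning logical core 2 touches only _bits[0]:
+ *
+ *	rte_cpuset_t set;
+ *	CPU_ZERO(&set);
+ *	CPU_SET(2, &set);
+ *	// CPU_ISSET(2, &set) is now non-zero and CPU_COUNT(&set) == 1
+ */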
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SCHED_H_ */
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ */
+
+#ifndef _SYS_QUEUE_H_
+#define _SYS_QUEUE_H_
+
+/*
+ * This file defines tail queues.
+ *
+ * A tail queue is headed by a pair of pointers, one to the head of the
+ * list and the other to the tail of the list. The elements are doubly
+ * linked so that an arbitrary element can be removed without a need to
+ * traverse the list. New elements can be added to the list before or
+ * after an existing element, at the head of the list, or at the end of
+ * the list. A tail queue may be traversed in either direction.
+ *
+ * Below is a summary of implemented functions where:
+ * + means the macro is available
+ * - means the macro is not available
+ * s means the macro is available but is slow (runs in O(n) time)
+ *
+ * TAILQ
+ * _HEAD +
+ * _CLASS_HEAD +
+ * _HEAD_INITIALIZER +
+ * _ENTRY +
+ * _CLASS_ENTRY +
+ * _INIT +
+ * _EMPTY +
+ * _FIRST +
+ * _NEXT +
+ * _PREV +
+ * _LAST +
+ * _LAST_FAST +
+ * _FOREACH +
+ * _FOREACH_FROM +
+ * _FOREACH_SAFE +
+ * _FOREACH_FROM_SAFE +
+ * _FOREACH_REVERSE +
+ * _FOREACH_REVERSE_FROM +
+ * _FOREACH_REVERSE_SAFE +
+ * _FOREACH_REVERSE_FROM_SAFE +
+ * _INSERT_HEAD +
+ * _INSERT_BEFORE +
+ * _INSERT_AFTER +
+ * _INSERT_TAIL +
+ * _CONCAT +
+ * _REMOVE_AFTER -
+ * _REMOVE_HEAD -
+ * _REMOVE +
+ * _SWAP +
+ *
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * List definitions.
+ */
+#define LIST_HEAD(name, type) \
+struct name { \
+ struct type *lh_first; /* first element */ \
+}
+
+#define QMD_TRACE_ELEM(elem)
+#define QMD_TRACE_HEAD(head)
+#define TRACEBUF
+#define TRACEBUF_INITIALIZER
+
+#define TRASHIT(x)
+#define QMD_IS_TRASHED(x) 0
+
+#define QMD_SAVELINK(name, link)
+
+#ifdef __cplusplus
+/*
+ * In C++ there can be structure lists and class lists:
+ */
+#define QUEUE_TYPEOF(type) type
+#else
+#define QUEUE_TYPEOF(type) struct type
+#endif
+
+/*
+ * Tail queue declarations.
+ */
+#define TAILQ_HEAD(name, type) \
+struct name { \
+ struct type *tqh_first; /* first element */ \
+ struct type **tqh_last; /* addr of last next element */ \
+ TRACEBUF \
+}
+
+#define TAILQ_CLASS_HEAD(name, type) \
+struct name { \
+ class type *tqh_first; /* first element */ \
+ class type **tqh_last; /* addr of last next element */ \
+ TRACEBUF \
+}
+
+#define TAILQ_HEAD_INITIALIZER(head) \
+ { NULL, &(head).tqh_first, TRACEBUF_INITIALIZER }
+
+#define TAILQ_ENTRY(type) \
+struct { \
+ struct type *tqe_next; /* next element */ \
+ struct type **tqe_prev; /* address of previous next element */ \
+ TRACEBUF \
+}
+
+#define TAILQ_CLASS_ENTRY(type) \
+struct { \
+ class type *tqe_next; /* next element */ \
+ class type **tqe_prev; /* address of previous next element */ \
+ TRACEBUF \
+}
+
+/*
+ * Tail queue functions.
+ */
+#define QMD_TAILQ_CHECK_HEAD(head, field)
+#define QMD_TAILQ_CHECK_TAIL(head, headname)
+#define QMD_TAILQ_CHECK_NEXT(elm, field)
+#define QMD_TAILQ_CHECK_PREV(elm, field)
+
+#define TAILQ_CONCAT(head1, head2, field) do { \
+ if (!TAILQ_EMPTY(head2)) { \
+ *(head1)->tqh_last = (head2)->tqh_first; \
+ (head2)->tqh_first->field.tqe_prev = (head1)->tqh_last; \
+ (head1)->tqh_last = (head2)->tqh_last; \
+ TAILQ_INIT((head2)); \
+ QMD_TRACE_HEAD(head1); \
+ QMD_TRACE_HEAD(head2); \
+ } \
+} while (0)
+
+#define TAILQ_EMPTY(head) ((head)->tqh_first == NULL)
+
+#define TAILQ_FIRST(head) ((head)->tqh_first)
+
+#define TAILQ_FOREACH(var, head, field) \
+ for ((var) = TAILQ_FIRST((head)); \
+ (var); \
+ (var) = TAILQ_NEXT((var), field))
+
+#define TAILQ_FOREACH_FROM(var, head, field) \
+ for ((var) = ((var) ? (var) : TAILQ_FIRST((head))); \
+ (var); \
+ (var) = TAILQ_NEXT((var), field))
+
+#define TAILQ_FOREACH_SAFE(var, head, field, tvar) \
+ for ((var) = TAILQ_FIRST((head)); \
+ (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \
+ (var) = (tvar))
+
+#define TAILQ_FOREACH_FROM_SAFE(var, head, field, tvar) \
+ for ((var) = ((var) ? (var) : TAILQ_FIRST((head))); \
+ (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \
+ (var) = (tvar))
+
+#define TAILQ_FOREACH_REVERSE(var, head, headname, field) \
+ for ((var) = TAILQ_LAST((head), headname); \
+ (var); \
+ (var) = TAILQ_PREV((var), headname, field))
+
+#define TAILQ_FOREACH_REVERSE_FROM(var, head, headname, field) \
+ for ((var) = ((var) ? (var) : TAILQ_LAST((head), headname)); \
+ (var); \
+ (var) = TAILQ_PREV((var), headname, field))
+
+#define TAILQ_FOREACH_REVERSE_SAFE(var, head, headname, field, tvar) \
+ for ((var) = TAILQ_LAST((head), headname); \
+ (var) && ((tvar) = TAILQ_PREV((var), headname, field), 1); \
+ (var) = (tvar))
+
+#define TAILQ_FOREACH_REVERSE_FROM_SAFE(var, head, headname, field, tvar) \
+ for ((var) = ((var) ? (var) : TAILQ_LAST((head), headname)); \
+ (var) && ((tvar) = TAILQ_PREV((var), headname, field), 1); \
+ (var) = (tvar))
+
+#define TAILQ_INIT(head) do { \
+ TAILQ_FIRST((head)) = NULL; \
+ (head)->tqh_last = &TAILQ_FIRST((head)); \
+ QMD_TRACE_HEAD(head); \
+} while (0)
+
+#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \
+ QMD_TAILQ_CHECK_NEXT(listelm, field); \
+ TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field); \
+ if (TAILQ_NEXT((listelm), field) != NULL) \
+ TAILQ_NEXT((elm), field)->field.tqe_prev = \
+ &TAILQ_NEXT((elm), field); \
+ else { \
+ (head)->tqh_last = &TAILQ_NEXT((elm), field); \
+ QMD_TRACE_HEAD(head); \
+ } \
+ TAILQ_NEXT((listelm), field) = (elm); \
+ (elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field); \
+ QMD_TRACE_ELEM(&(elm)->field); \
+ QMD_TRACE_ELEM(&(listelm)->field); \
+} while (0)
+
+#define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \
+ QMD_TAILQ_CHECK_PREV(listelm, field); \
+ (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \
+ TAILQ_NEXT((elm), field) = (listelm); \
+ *(listelm)->field.tqe_prev = (elm); \
+ (listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field); \
+ QMD_TRACE_ELEM(&(elm)->field); \
+ QMD_TRACE_ELEM(&(listelm)->field); \
+} while (0)
+
+#define TAILQ_INSERT_HEAD(head, elm, field) do { \
+ QMD_TAILQ_CHECK_HEAD(head, field); \
+ TAILQ_NEXT((elm), field) = TAILQ_FIRST((head)); \
+ if (TAILQ_FIRST((head)) != NULL) \
+ TAILQ_FIRST((head))->field.tqe_prev = \
+ &TAILQ_NEXT((elm), field); \
+ else \
+ (head)->tqh_last = &TAILQ_NEXT((elm), field); \
+ TAILQ_FIRST((head)) = (elm); \
+ (elm)->field.tqe_prev = &TAILQ_FIRST((head)); \
+ QMD_TRACE_HEAD(head); \
+ QMD_TRACE_ELEM(&(elm)->field); \
+} while (0)
+
+#define TAILQ_INSERT_TAIL(head, elm, field) do { \
+ QMD_TAILQ_CHECK_TAIL(head, field); \
+ TAILQ_NEXT((elm), field) = NULL; \
+ (elm)->field.tqe_prev = (head)->tqh_last; \
+ *(head)->tqh_last = (elm); \
+ (head)->tqh_last = &TAILQ_NEXT((elm), field); \
+ QMD_TRACE_HEAD(head); \
+ QMD_TRACE_ELEM(&(elm)->field); \
+} while (0)
+
+#define TAILQ_LAST(head, headname) \
+ (*(((struct headname *)((head)->tqh_last))->tqh_last))
+
+/*
+ * The FAST function is fast in that it causes no data access other
+ * than the access to the head. The standard LAST function above
+ * will cause a data access of both the element you want and
+ * the previous element. FAST is very useful for instances when
+ * you may want to prefetch the last data element.
+ */
+#define TAILQ_LAST_FAST(head, type, field) \
+ (TAILQ_EMPTY(head) ? NULL : __containerof((head)->tqh_last, \
+ QUEUE_TYPEOF(type), field.tqe_next))
+
+#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next)
+
+#define TAILQ_PREV(elm, headname, field) \
+ (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last))
+
+#define TAILQ_REMOVE(head, elm, field) do { \
+ QMD_SAVELINK(oldnext, (elm)->field.tqe_next); \
+ QMD_SAVELINK(oldprev, (elm)->field.tqe_prev); \
+ QMD_TAILQ_CHECK_NEXT(elm, field); \
+ QMD_TAILQ_CHECK_PREV(elm, field); \
+ if ((TAILQ_NEXT((elm), field)) != NULL) \
+ TAILQ_NEXT((elm), field)->field.tqe_prev = \
+ (elm)->field.tqe_prev; \
+ else { \
+ (head)->tqh_last = (elm)->field.tqe_prev; \
+ QMD_TRACE_HEAD(head); \
+ } \
+ *(elm)->field.tqe_prev = TAILQ_NEXT((elm), field); \
+ TRASHIT(*oldnext); \
+ TRASHIT(*oldprev); \
+ QMD_TRACE_ELEM(&(elm)->field); \
+} while (0)
+
+#define TAILQ_SWAP(head1, head2, type, field) do { \
+ QUEUE_TYPEOF(type) * swap_first = (head1)->tqh_first; \
+ QUEUE_TYPEOF(type) * *swap_last = (head1)->tqh_last; \
+ (head1)->tqh_first = (head2)->tqh_first; \
+ (head1)->tqh_last = (head2)->tqh_last; \
+ (head2)->tqh_first = swap_first; \
+ (head2)->tqh_last = swap_last; \
+ swap_first = (head1)->tqh_first; \
+ if (swap_first != NULL) \
+ swap_first->field.tqe_prev = &(head1)->tqh_first; \
+ else \
+ (head1)->tqh_last = &(head1)->tqh_first; \
+ swap_first = (head2)->tqh_first; \
+ if (swap_first != NULL) \
+ swap_first->field.tqe_prev = &(head2)->tqh_first; \
+ else \
+ (head2)->tqh_last = &(head2)->tqh_first; \
+} while (0)
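+
+/*
+ * Usage sketch (illustrative only, with a hypothetical struct entry and
+ * a hypothetical use() consumer):
+ *
+ *	struct entry {
+ *		int value;
+ *		TAILQ_ENTRY(entry) entries;
+ *	};
+ *	TAILQ_HEAD(entry_head, entry) head = TAILQ_HEAD_INITIALIZER(head);
+ *
+ *	struct entry *e = malloc(sizeof(*e));
+ *	TAILQ_INSERT_TAIL(&head, e, entries);
+ *	TAILQ_FOREACH(e, &head, entries)
+ *		use(e->value);
+ */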
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_QUEUE_H_ */
--- /dev/null
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2019 Intel Corporation
+ */
+
+#ifndef _UNISTD_H_
+#define _UNISTD_H_
+/**
+ * This file is added to support common code in eal_common_lcore.c
+ * as Microsoft libc does not contain unistd.h. This may be removed
+ * in future releases.
+ */
+#endif /* _UNISTD_H_ */
--- /dev/null
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2019 Intel Corporation
+
+subdir('include')
+
+sources += files(
+ 'eal.c',
+ 'eal_debug.c',
+ 'eal_lcore.c',
+ 'eal_thread.c',
+ 'getopt.c',
+)
---------------------------------------------------------------------------------------------------
SPDX Identifier TB Approval Date GB Approval Date File name
---------------------------------------------------------------------------------------------------
-1.MIT 10/23/2019 02/10/2020 lib/librte_eal/windows/eal/include/dirent.h
-2.BSD-2-Clause 10/23/2019 12/18/2019 lib/librte_eal/windows/eal/include/getopt.h
+1.MIT 10/23/2019 02/10/2020 lib/librte_eal/windows/include/dirent.h
+2.BSD-2-Clause 10/23/2019 12/18/2019 lib/librte_eal/windows/include/getopt.h
3.ISC AND
- BSD-2-Clause 10/23/2019 12/18/2019 lib/librte_eal/windows/eal/getopt.c
+ BSD-2-Clause 10/23/2019 12/18/2019 lib/librte_eal/windows/getopt.c
4.GPL-2.0 09/25/2019 12/18/2019 buildtools/pmdinfogen/pmdinfogen.*
---------------------------------------------------------------------------------------------------
# for passing to pmdinfogen scripts
global_inc = include_directories('.', 'config',
'lib/librte_eal/include',
- 'lib/librte_eal/@0@/eal/include'.format(host_machine.system()),
+ 'lib/librte_eal/@0@/include'.format(host_machine.system()),
)
subdir('config')
endif
# include in every library to build
-EXECENV_CFLAGS += -I$(RTE_SDK)/lib/librte_eal/freebsd/eal/include
+EXECENV_CFLAGS += -I$(RTE_SDK)/lib/librte_eal/freebsd/include
EXECENV_LDFLAGS =
EXECENV_LDLIBS = -lexecinfo
endif
# include in every library to build
-EXECENV_CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linux/eal/include
+EXECENV_CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linux/include
EXECENV_LDLIBS =
EXECENV_ASFLAGS =