eal/linux: rename linuxapp to linux
author Bruce Richardson <bruce.richardson@intel.com>
Wed, 6 Mar 2019 16:22:38 +0000 (16:22 +0000)
committer Thomas Monjalon <thomas@monjalon.net>
Tue, 12 Mar 2019 16:31:13 +0000 (17:31 +0100)
The term "linuxapp" is a legacy one, but just calling the subdirectory
"linux" is just clearer for all concerned.

Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
56 files changed:
MAINTAINERS
devtools/build-tags.sh
doc/guides/prog_guide/env_abstraction_layer.rst
drivers/bus/dpaa/Makefile
drivers/crypto/caam_jr/Makefile
drivers/crypto/dpaa2_sec/Makefile
drivers/crypto/dpaa_sec/Makefile
drivers/event/dpaa/Makefile
drivers/event/dpaa2/Makefile
drivers/mempool/dpaa2/Makefile
drivers/net/dpaa/Makefile
drivers/net/dpaa2/Makefile
drivers/raw/dpaa2_qdma/Makefile
kernel/linux/kni/meson.build
lib/librte_eal/Makefile
lib/librte_eal/linux/Makefile [new file with mode: 0644]
lib/librte_eal/linux/eal/Makefile [new file with mode: 0644]
lib/librte_eal/linux/eal/eal.c [new file with mode: 0644]
lib/librte_eal/linux/eal/eal_alarm.c [new file with mode: 0644]
lib/librte_eal/linux/eal/eal_cpuflags.c [new file with mode: 0644]
lib/librte_eal/linux/eal/eal_debug.c [new file with mode: 0644]
lib/librte_eal/linux/eal/eal_dev.c [new file with mode: 0644]
lib/librte_eal/linux/eal/eal_hugepage_info.c [new file with mode: 0644]
lib/librte_eal/linux/eal/eal_interrupts.c [new file with mode: 0644]
lib/librte_eal/linux/eal/eal_lcore.c [new file with mode: 0644]
lib/librte_eal/linux/eal/eal_log.c [new file with mode: 0644]
lib/librte_eal/linux/eal/eal_memalloc.c [new file with mode: 0644]
lib/librte_eal/linux/eal/eal_memory.c [new file with mode: 0644]
lib/librte_eal/linux/eal/eal_thread.c [new file with mode: 0644]
lib/librte_eal/linux/eal/eal_timer.c [new file with mode: 0644]
lib/librte_eal/linux/eal/eal_vfio.c [new file with mode: 0644]
lib/librte_eal/linux/eal/eal_vfio.h [new file with mode: 0644]
lib/librte_eal/linux/eal/eal_vfio_mp_sync.c [new file with mode: 0644]
lib/librte_eal/linux/eal/include/exec-env/rte_kni_common.h [new file with mode: 0644]
lib/librte_eal/linux/eal/meson.build [new file with mode: 0644]
lib/librte_eal/linuxapp/Makefile [deleted file]
lib/librte_eal/linuxapp/eal/Makefile [deleted file]
lib/librte_eal/linuxapp/eal/eal.c [deleted file]
lib/librte_eal/linuxapp/eal/eal_alarm.c [deleted file]
lib/librte_eal/linuxapp/eal/eal_cpuflags.c [deleted file]
lib/librte_eal/linuxapp/eal/eal_debug.c [deleted file]
lib/librte_eal/linuxapp/eal/eal_dev.c [deleted file]
lib/librte_eal/linuxapp/eal/eal_hugepage_info.c [deleted file]
lib/librte_eal/linuxapp/eal/eal_interrupts.c [deleted file]
lib/librte_eal/linuxapp/eal/eal_lcore.c [deleted file]
lib/librte_eal/linuxapp/eal/eal_log.c [deleted file]
lib/librte_eal/linuxapp/eal/eal_memalloc.c [deleted file]
lib/librte_eal/linuxapp/eal/eal_memory.c [deleted file]
lib/librte_eal/linuxapp/eal/eal_thread.c [deleted file]
lib/librte_eal/linuxapp/eal/eal_timer.c [deleted file]
lib/librte_eal/linuxapp/eal/eal_vfio.c [deleted file]
lib/librte_eal/linuxapp/eal/eal_vfio.h [deleted file]
lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c [deleted file]
lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h [deleted file]
lib/librte_eal/linuxapp/eal/meson.build [deleted file]
lib/librte_eal/meson.build

diff --git a/MAINTAINERS b/MAINTAINERS
index d1e624c..452b8eb 100644 (file)
@@ -173,7 +173,7 @@ F: lib/librte_eal/common/*malloc*
 F: lib/librte_eal/common/eal_common_fbarray.c
 F: lib/librte_eal/common/eal_common_mem*
 F: lib/librte_eal/common/eal_hugepages.h
-F: lib/librte_eal/linuxapp/eal/eal_mem*
+F: lib/librte_eal/linux/eal/eal_mem*
 F: lib/librte_eal/freebsd/eal/eal_mem*
 F: doc/guides/prog_guide/env_abstraction_layer.rst
 F: app/test/test_external_mem.c
@@ -244,8 +244,8 @@ F: lib/librte_eal/common/arch/x86/
 F: lib/librte_eal/common/include/arch/x86/
 
 Linux EAL (with overlaps)
-F: lib/librte_eal/linuxapp/Makefile
-F: lib/librte_eal/linuxapp/eal/
+F: lib/librte_eal/linux/Makefile
+F: lib/librte_eal/linux/eal/
 F: doc/guides/linux_gsg/
 
 Linux UIO
@@ -255,7 +255,7 @@ F: drivers/bus/pci/linux/*uio*
 
 Linux VFIO
 M: Anatoly Burakov <anatoly.burakov@intel.com>
-F: lib/librte_eal/linuxapp/eal/*vfio*
+F: lib/librte_eal/linux/eal/*vfio*
 F: drivers/bus/pci/linux/*vfio*
 
 FreeBSD EAL (with overlaps)
diff --git a/devtools/build-tags.sh b/devtools/build-tags.sh
index 3a98e9b..a10a38f 100755 (executable)
@@ -67,7 +67,7 @@ common_sources()
 
 linux_sources()
 {
-       find_sources "lib/librte_eal/linuxapp" '*.[chS]'
+       find_sources "lib/librte_eal/linux" '*.[chS]'
 }
 
 bsd_sources()
diff --git a/doc/guides/prog_guide/env_abstraction_layer.rst b/doc/guides/prog_guide/env_abstraction_layer.rst
index 73436b0..e1d80c0 100644 (file)
@@ -346,7 +346,7 @@ To ease the idle polling with tiny throughput, it's useful to pause the polling
 The RX interrupt is the first choice to be such kind of wake-up event, but probably won't be the only one.
 
 EAL provides the event APIs for this event-driven thread mode.
-Taking linuxapp as an example, the implementation relies on epoll. Each thread can monitor an epoll instance
+Taking Linux as an example, the implementation relies on epoll. Each thread can monitor an epoll instance
 in which all the wake-up events' file descriptors are added. The event file descriptors are created and mapped to
 the interrupt vectors according to the UIO/VFIO spec.
 From FreeBSD's perspective, kqueue is the alternative way, but not implemented yet.
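
The event-driven mode described in the paragraph above is exposed to applications
through the rte_epoll_*() and rte_eth_dev_rx_intr_*() calls declared in
rte_interrupts.h and rte_ethdev.h. A minimal sketch (not part of this patch; it
assumes the port was configured with intr_conf.rxq = 1 and omits error handling)
of a polling thread parking on one queue's Rx interrupt:

#include <rte_ethdev.h>
#include <rte_interrupts.h>

void
sleep_until_rx(uint16_t port_id, uint16_t queue_id)
{
	struct rte_epoll_event ev;

	/* map the queue's interrupt vector into this thread's epoll instance */
	rte_eth_dev_rx_intr_ctl_q(port_id, queue_id,
			RTE_EPOLL_PER_THREAD, RTE_INTR_EVENT_ADD, NULL);

	/* arm the device interrupt, then block until its event fd fires */
	rte_eth_dev_rx_intr_enable(port_id, queue_id);
	rte_epoll_wait(RTE_EPOLL_PER_THREAD, &ev, 1, -1);
	rte_eth_dev_rx_intr_disable(port_id, queue_id);

	/* resume busy polling with rte_eth_rx_burst() here */
}
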
diff --git a/drivers/bus/dpaa/Makefile b/drivers/bus/dpaa/Makefile
index 800e5cd..248c024 100644 (file)
@@ -17,7 +17,7 @@ CFLAGS += -Wno-cast-qual
 CFLAGS += -I$(RTE_BUS_DPAA)/
 CFLAGS += -I$(RTE_BUS_DPAA)/include
 CFLAGS += -I$(RTE_BUS_DPAA)/base/qbman
-CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linuxapp/eal
+CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linux/eal
 CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common/include
 
 # versioning export map
diff --git a/drivers/crypto/caam_jr/Makefile b/drivers/crypto/caam_jr/Makefile
index 88cdf74..b078453 100644 (file)
@@ -21,7 +21,7 @@ CFLAGS += -I$(RTE_SDK)/drivers/crypto/caam_jr
 #sharing the hw flib headers from dpaa2_sec pmd
 CFLAGS += -I$(RTE_SDK)/drivers/crypto/dpaa2_sec/
 CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common/include
-CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linuxapp/eal
+CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linux/eal
 
 # versioning export map
 EXPORT_MAP := rte_pmd_caam_jr_version.map
diff --git a/drivers/crypto/dpaa2_sec/Makefile b/drivers/crypto/dpaa2_sec/Makefile
index f537f76..63dbe85 100644 (file)
@@ -27,7 +27,7 @@ CFLAGS += -I$(RTE_SDK)/drivers/bus/fslmc/qbman/include
 CFLAGS += -I$(RTE_SDK)/drivers/bus/fslmc/mc
 CFLAGS += -I$(RTE_SDK)/drivers/bus/fslmc/portal
 CFLAGS += -I$(RTE_SDK)/drivers/mempool/dpaa2/
-CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linuxapp/eal
+CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linux/eal
 
 # versioning export map
 EXPORT_MAP := rte_pmd_dpaa2_sec_version.map
diff --git a/drivers/crypto/dpaa_sec/Makefile b/drivers/crypto/dpaa_sec/Makefile
index 5ce95c2..aa214c0 100644 (file)
@@ -20,7 +20,7 @@ CFLAGS += -I$(RTE_SDK)/drivers/crypto/dpaa_sec/
 #sharing the hw flib headers from dpaa2_sec pmd
 CFLAGS += -I$(RTE_SDK)/drivers/crypto/dpaa2_sec/
 CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common/include
-CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linuxapp/eal
+CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linux/eal
 LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
 LDLIBS += -lrte_cryptodev
 
diff --git a/drivers/event/dpaa/Makefile b/drivers/event/dpaa/Makefile
index 6f93e7f..9b3d6ba 100644 (file)
@@ -20,7 +20,7 @@ CFLAGS += -I$(RTE_SDK)/drivers/bus/dpaa
 CFLAGS += -I$(RTE_SDK)/drivers/bus/dpaa/include/
 CFLAGS += -I$(RTE_SDK)/drivers/mempool/dpaa
 CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common/include
-CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linuxapp/eal/include
+CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linux/eal/include
 
 EXPORT_MAP := rte_pmd_dpaa_event_version.map
 
diff --git a/drivers/event/dpaa2/Makefile b/drivers/event/dpaa2/Makefile
index e0134cc..e245682 100644 (file)
@@ -17,7 +17,7 @@ CFLAGS += -I$(RTE_SDK)/drivers/bus/fslmc/mc
 CFLAGS += -I$(RTE_SDK)/drivers/bus/fslmc/portal
 CFLAGS += -I$(RTE_SDK)/drivers/mempool/dpaa2
 CFLAGS += -I$(RTE_SDK)/drivers/event/dpaa2
-CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linuxapp/eal
+CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linux/eal
 LDLIBS += -lrte_eal -lrte_eventdev
 LDLIBS += -lrte_bus_fslmc -lrte_mempool_dpaa2 -lrte_pmd_dpaa2
 LDLIBS += -lrte_bus_vdev
diff --git a/drivers/mempool/dpaa2/Makefile b/drivers/mempool/dpaa2/Makefile
index 96c0f2b..5f3e4ea 100644 (file)
@@ -13,7 +13,7 @@ CFLAGS += -O3
 CFLAGS += $(WERROR_FLAGS)
 CFLAGS += -I$(RTE_SDK)/drivers/bus/fslmc
 CFLAGS += -I$(RTE_SDK)/drivers/bus/fslmc/qbman/include
-CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linuxapp/eal
+CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linux/eal
 
 # versioning export map
 EXPORT_MAP := rte_mempool_dpaa2_version.map
diff --git a/drivers/net/dpaa/Makefile b/drivers/net/dpaa/Makefile
index 1c4f7d9..5b8e7f8 100644 (file)
@@ -21,7 +21,7 @@ CFLAGS += -I$(RTE_SDK)/drivers/bus/dpaa/base/qbman
 CFLAGS += -I$(RTE_SDK)/drivers/mempool/dpaa
 CFLAGS += -I$(RTE_SDK)/drivers/event/dpaa
 CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common/include
-CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linuxapp/eal/include
+CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linux/eal/include
 
 EXPORT_MAP := rte_pmd_dpaa_version.map
 
diff --git a/drivers/net/dpaa2/Makefile b/drivers/net/dpaa2/Makefile
index 8bd269b..947fb98 100644 (file)
@@ -19,7 +19,7 @@ CFLAGS += -I$(RTE_SDK)/drivers/bus/fslmc/qbman/include
 CFLAGS += -I$(RTE_SDK)/drivers/bus/fslmc/mc
 CFLAGS += -I$(RTE_SDK)/drivers/bus/fslmc/portal
 CFLAGS += -I$(RTE_SDK)/drivers/mempool/dpaa2
-CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linuxapp/eal
+CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linux/eal
 
 # versioning export map
 EXPORT_MAP := rte_pmd_dpaa2_version.map
diff --git a/drivers/raw/dpaa2_qdma/Makefile b/drivers/raw/dpaa2_qdma/Makefile
index bdd99c9..5c75f5f 100644 (file)
@@ -12,7 +12,7 @@ CFLAGS += -DALLOW_EXPERIMENTAL_API
 CFLAGS += -O3
 CFLAGS += $(WERROR_FLAGS)
 
-CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linuxapp/eal
+CFLAGS += -I$(RTE_SDK)/lib/librte_eal/linux/eal
 CFLAGS += -I$(RTE_SDK)/drivers/bus/fslmc
 CFLAGS += -I$(RTE_SDK)/drivers/bus/fslmc/qbman/include
 
diff --git a/kernel/linux/kni/meson.build b/kernel/linux/kni/meson.build
index a09af5a..877ff5c 100644 (file)
@@ -21,7 +21,7 @@ custom_target('rte_kni',
                'src=' + meson.current_source_dir(),
                'MODULE_CFLAGS=-include ' + meson.source_root() + '/config/rte_config.h' +
                ' -I' + meson.source_root() + '/lib/librte_eal/common/include' +
-               ' -I' + meson.source_root() + '/lib/librte_eal/linuxapp/eal/include' +
+               ' -I' + meson.source_root() + '/lib/librte_eal/linux/eal/include' +
                ' -I' + meson.build_root() +
                ' -I' + meson.current_source_dir() +
                ' -I' + meson.current_source_dir() + '/ethtool/ixgbe' +
diff --git a/lib/librte_eal/Makefile b/lib/librte_eal/Makefile
index 39d64bb..c6bd39f 100644 (file)
@@ -4,8 +4,8 @@
 include $(RTE_SDK)/mk/rte.vars.mk
 
 DIRS-y += common
-DIRS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += linuxapp
-DEPDIRS-linuxapp := common
+DIRS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += linux
+DEPDIRS-linux := common
 DIRS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += freebsd
 DEPDIRS-freebsd := common
 
diff --git a/lib/librte_eal/linux/Makefile b/lib/librte_eal/linux/Makefile
new file mode 100644 (file)
index 0000000..a0fffa9
--- /dev/null
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2010-2014 Intel Corporation
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+DIRS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal
+DEPDIRS-kni := eal
+
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+
+include $(RTE_SDK)/mk/rte.subdir.mk
diff --git a/lib/librte_eal/linux/eal/Makefile b/lib/librte_eal/linux/eal/Makefile
new file mode 100644 (file)
index 0000000..51deb57
--- /dev/null
@@ -0,0 +1,101 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2010-2016 Intel Corporation
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+LIB = librte_eal.a
+
+ARCH_DIR ?= $(RTE_ARCH)
+
+EXPORT_MAP := ../../rte_eal_version.map
+VPATH += $(RTE_SDK)/lib/librte_eal/common/arch/$(ARCH_DIR)
+
+LIBABIVER := 9
+
+VPATH += $(RTE_SDK)/lib/librte_eal/common
+
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+CFLAGS += -I$(SRCDIR)/include
+CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common
+CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common/include
+CFLAGS += $(WERROR_FLAGS) -O3
+
+LDLIBS += -ldl
+LDLIBS += -lpthread
+LDLIBS += -lgcc_s
+LDLIBS += -lrt
+LDLIBS += -lrte_kvargs
+ifeq ($(CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES),y)
+LDLIBS += -lnuma
+endif
+
+# specific to linuxapp exec-env
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) := eal.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_cpuflags.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_hugepage_info.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_memory.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_thread.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_log.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_vfio.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_vfio_mp_sync.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_memalloc.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_debug.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_lcore.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_timer.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_interrupts.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_alarm.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_dev.c
+
+# from common dir
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_lcore.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_timer.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memzone.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_log.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_launch.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memalloc.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memory.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_tailqs.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_errno.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_cpuflags.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_hypervisor.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_string_fns.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_hexdump.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_devargs.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_class.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_bus.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_dev.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_options.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_thread.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_proc.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_fbarray.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_uuid.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_malloc.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += hotplug_mp.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_elem.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_heap.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_mp.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_keepalive.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_option.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_service.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_reciprocal.c
+
+# from arch dir
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_cpuflags.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_hypervisor.c
+SRCS-$(CONFIG_RTE_ARCH_X86) += rte_spinlock.c
+SRCS-y += rte_cycles.c
+
+CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST)
+
+# workaround for a gcc bug with noreturn attribute
+# http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603
+ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y)
+CFLAGS_eal_thread.o += -Wno-return-type
+endif
+
+INC := rte_kni_common.h
+
+SYMLINK-$(CONFIG_RTE_EXEC_ENV_LINUXAPP)-include/exec-env := \
+       $(addprefix include/exec-env/,$(INC))
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_eal/linux/eal/eal.c b/lib/librte_eal/linux/eal/eal.c
new file mode 100644 (file)
index 0000000..13f4016
--- /dev/null
@@ -0,0 +1,1336 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2018 Intel Corporation.
+ * Copyright(c) 2012-2014 6WIND S.A.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <syslog.h>
+#include <getopt.h>
+#include <sys/file.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <fnmatch.h>
+#include <stddef.h>
+#include <errno.h>
+#include <limits.h>
+#include <sys/mman.h>
+#include <sys/queue.h>
+#include <sys/stat.h>
+#if defined(RTE_ARCH_X86)
+#include <sys/io.h>
+#endif
+
+#include <rte_compat.h>
+#include <rte_common.h>
+#include <rte_debug.h>
+#include <rte_memory.h>
+#include <rte_launch.h>
+#include <rte_eal.h>
+#include <rte_eal_memconfig.h>
+#include <rte_errno.h>
+#include <rte_per_lcore.h>
+#include <rte_lcore.h>
+#include <rte_service_component.h>
+#include <rte_log.h>
+#include <rte_random.h>
+#include <rte_cycles.h>
+#include <rte_string_fns.h>
+#include <rte_cpuflags.h>
+#include <rte_interrupts.h>
+#include <rte_bus.h>
+#include <rte_dev.h>
+#include <rte_devargs.h>
+#include <rte_version.h>
+#include <rte_atomic.h>
+#include <malloc_heap.h>
+#include <rte_vfio.h>
+#include <rte_option.h>
+
+#include "eal_private.h"
+#include "eal_thread.h"
+#include "eal_internal_cfg.h"
+#include "eal_filesystem.h"
+#include "eal_hugepages.h"
+#include "eal_options.h"
+#include "eal_vfio.h"
+
+#define MEMSIZE_IF_NO_HUGE_PAGE (64ULL * 1024ULL * 1024ULL)
+
+#define SOCKET_MEM_STRLEN (RTE_MAX_NUMA_NODES * 10)
+
+/* Allow the application to print its usage message too if set */
+static rte_usage_hook_t        rte_application_usage_hook = NULL;
+
+/* early configuration structure, when memory config is not mmapped */
+static struct rte_mem_config early_mem_config;
+
+/* define fd variable here, because file needs to be kept open for the
+ * duration of the program, as we hold a write lock on it in the primary proc */
+static int mem_cfg_fd = -1;
+
+static struct flock wr_lock = {
+               .l_type = F_WRLCK,
+               .l_whence = SEEK_SET,
+               .l_start = offsetof(struct rte_mem_config, memsegs),
+               .l_len = sizeof(early_mem_config.memsegs),
+};
+
+/* Address of global and public configuration */
+static struct rte_config rte_config = {
+               .mem_config = &early_mem_config,
+};
+
+/* internal configuration (per-core) */
+struct lcore_config lcore_config[RTE_MAX_LCORE];
+
+/* internal configuration */
+struct internal_config internal_config;
+
+/* used by rte_rdtsc() */
+int rte_cycles_vmware_tsc_map;
+
+/* platform-specific runtime dir */
+static char runtime_dir[PATH_MAX];
+
+static const char *default_runtime_dir = "/var/run";
+
+int
+eal_create_runtime_dir(void)
+{
+       const char *directory = default_runtime_dir;
+       const char *xdg_runtime_dir = getenv("XDG_RUNTIME_DIR");
+       const char *fallback = "/tmp";
+       char tmp[PATH_MAX];
+       int ret;
+
+       if (getuid() != 0) {
+               /* try XDG path first, fall back to /tmp */
+               if (xdg_runtime_dir != NULL)
+                       directory = xdg_runtime_dir;
+               else
+                       directory = fallback;
+       }
+       /* create DPDK subdirectory under runtime dir */
+       ret = snprintf(tmp, sizeof(tmp), "%s/dpdk", directory);
+       if (ret < 0 || ret == sizeof(tmp)) {
+               RTE_LOG(ERR, EAL, "Error creating DPDK runtime path name\n");
+               return -1;
+       }
+
+       /* create prefix-specific subdirectory under DPDK runtime dir */
+       ret = snprintf(runtime_dir, sizeof(runtime_dir), "%s/%s",
+                       tmp, eal_get_hugefile_prefix());
+       if (ret < 0 || ret == sizeof(runtime_dir)) {
+               RTE_LOG(ERR, EAL, "Error creating prefix-specific runtime path name\n");
+               return -1;
+       }
+
+       /* create the path if it doesn't exist. no "mkdir -p" here, so do it
+        * step by step.
+        */
+       ret = mkdir(tmp, 0700);
+       if (ret < 0 && errno != EEXIST) {
+               RTE_LOG(ERR, EAL, "Error creating '%s': %s\n",
+                       tmp, strerror(errno));
+               return -1;
+       }
+
+       ret = mkdir(runtime_dir, 0700);
+       if (ret < 0 && errno != EEXIST) {
+               RTE_LOG(ERR, EAL, "Error creating '%s': %s\n",
+                       runtime_dir, strerror(errno));
+               return -1;
+       }
+
+       return 0;
+}
+
+int
+eal_clean_runtime_dir(void)
+{
+       DIR *dir;
+       struct dirent *dirent;
+       int dir_fd, fd, lck_result;
+       static const char * const filters[] = {
+               "fbarray_*",
+               "mp_socket_*"
+       };
+
+       /* open directory */
+       dir = opendir(runtime_dir);
+       if (!dir) {
+               RTE_LOG(ERR, EAL, "Unable to open runtime directory %s\n",
+                               runtime_dir);
+               goto error;
+       }
+       dir_fd = dirfd(dir);
+
+       /* lock the directory before doing anything, to avoid races */
+       if (flock(dir_fd, LOCK_EX) < 0) {
+               RTE_LOG(ERR, EAL, "Unable to lock runtime directory %s\n",
+                       runtime_dir);
+               goto error;
+       }
+
+       dirent = readdir(dir);
+       if (!dirent) {
+               RTE_LOG(ERR, EAL, "Unable to read runtime directory %s\n",
+                               runtime_dir);
+               goto error;
+       }
+
+       while (dirent != NULL) {
+               unsigned int f_idx;
+               bool skip = true;
+
+               /* skip files that don't match the patterns */
+               for (f_idx = 0; f_idx < RTE_DIM(filters); f_idx++) {
+                       const char *filter = filters[f_idx];
+
+                       if (fnmatch(filter, dirent->d_name, 0) == 0) {
+                               skip = false;
+                               break;
+                       }
+               }
+               if (skip) {
+                       dirent = readdir(dir);
+                       continue;
+               }
+
+               /* try and lock the file */
+               fd = openat(dir_fd, dirent->d_name, O_RDONLY);
+
+               /* skip to next file */
+               if (fd == -1) {
+                       dirent = readdir(dir);
+                       continue;
+               }
+
+               /* non-blocking lock */
+               lck_result = flock(fd, LOCK_EX | LOCK_NB);
+
+               /* if lock succeeds, remove the file */
+               if (lck_result != -1)
+                       unlinkat(dir_fd, dirent->d_name, 0);
+               close(fd);
+               dirent = readdir(dir);
+       }
+
+       /* closedir closes dir_fd and drops the lock */
+       closedir(dir);
+       return 0;
+
+error:
+       if (dir)
+               closedir(dir);
+
+       RTE_LOG(ERR, EAL, "Error while clearing runtime dir: %s\n",
+               strerror(errno));
+
+       return -1;
+}
+
+const char *
+rte_eal_get_runtime_dir(void)
+{
+       return runtime_dir;
+}
+
+/* Return user provided mbuf pool ops name */
+const char *
+rte_eal_mbuf_user_pool_ops(void)
+{
+       return internal_config.user_mbuf_pool_ops_name;
+}
+
+/* Return a pointer to the configuration structure */
+struct rte_config *
+rte_eal_get_configuration(void)
+{
+       return &rte_config;
+}
+
+enum rte_iova_mode
+rte_eal_iova_mode(void)
+{
+       return rte_eal_get_configuration()->iova_mode;
+}
+
+/* parse a sysfs (or other) file containing one integer value */
+int
+eal_parse_sysfs_value(const char *filename, unsigned long *val)
+{
+       FILE *f;
+       char buf[BUFSIZ];
+       char *end = NULL;
+
+       if ((f = fopen(filename, "r")) == NULL) {
+               RTE_LOG(ERR, EAL, "%s(): cannot open sysfs value %s\n",
+                       __func__, filename);
+               return -1;
+       }
+
+       if (fgets(buf, sizeof(buf), f) == NULL) {
+               RTE_LOG(ERR, EAL, "%s(): cannot read sysfs value %s\n",
+                       __func__, filename);
+               fclose(f);
+               return -1;
+       }
+       *val = strtoul(buf, &end, 0);
+       if ((buf[0] == '\0') || (end == NULL) || (*end != '\n')) {
+               RTE_LOG(ERR, EAL, "%s(): cannot parse sysfs value %s\n",
+                               __func__, filename);
+               fclose(f);
+               return -1;
+       }
+       fclose(f);
+       return 0;
+}
+
+
+/* create memory configuration in shared/mmap memory. Take out
+ * a write lock on the memsegs, so we can auto-detect primary/secondary.
+ * This means we never close the file while running (auto-close on exit).
+ * We also don't lock the whole file, so that in future we can use read-locks
+ * on other parts, e.g. memzones, to detect if there are running secondary
+ * processes. */
+static void
+rte_eal_config_create(void)
+{
+       void *rte_mem_cfg_addr;
+       int retval;
+
+       const char *pathname = eal_runtime_config_path();
+
+       if (internal_config.no_shconf)
+               return;
+
+       /* map the config before hugepage address so that we don't waste a page */
+       if (internal_config.base_virtaddr != 0)
+               rte_mem_cfg_addr = (void *)
+                       RTE_ALIGN_FLOOR(internal_config.base_virtaddr -
+                       sizeof(struct rte_mem_config), sysconf(_SC_PAGE_SIZE));
+       else
+               rte_mem_cfg_addr = NULL;
+
+       if (mem_cfg_fd < 0){
+               mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0660);
+               if (mem_cfg_fd < 0)
+                       rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
+       }
+
+       retval = ftruncate(mem_cfg_fd, sizeof(*rte_config.mem_config));
+       if (retval < 0){
+               close(mem_cfg_fd);
+               rte_panic("Cannot resize '%s' for rte_mem_config\n", pathname);
+       }
+
+       retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock);
+       if (retval < 0){
+               close(mem_cfg_fd);
+               rte_exit(EXIT_FAILURE, "Cannot create lock on '%s'. Is another primary "
+                               "process running?\n", pathname);
+       }
+
+       rte_mem_cfg_addr = mmap(rte_mem_cfg_addr, sizeof(*rte_config.mem_config),
+                               PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0);
+
+       if (rte_mem_cfg_addr == MAP_FAILED){
+               rte_panic("Cannot mmap memory for rte_config\n");
+       }
+       memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config));
+       rte_config.mem_config = rte_mem_cfg_addr;
+
+       /* store address of the config in the config itself so that secondary
+        * processes could later map the config into this exact location */
+       rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr;
+
+       rte_config.mem_config->dma_maskbits = 0;
+
+}
+
+/* attach to an existing shared memory config */
+static void
+rte_eal_config_attach(void)
+{
+       struct rte_mem_config *mem_config;
+
+       const char *pathname = eal_runtime_config_path();
+
+       if (internal_config.no_shconf)
+               return;
+
+       if (mem_cfg_fd < 0){
+               mem_cfg_fd = open(pathname, O_RDWR);
+               if (mem_cfg_fd < 0)
+                       rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
+       }
+
+       /* map it as read-only first */
+       mem_config = (struct rte_mem_config *) mmap(NULL, sizeof(*mem_config),
+                       PROT_READ, MAP_SHARED, mem_cfg_fd, 0);
+       if (mem_config == MAP_FAILED)
+               rte_panic("Cannot mmap memory for rte_config! error %i (%s)\n",
+                         errno, strerror(errno));
+
+       rte_config.mem_config = mem_config;
+}
+
+/* reattach the shared config at exact memory location primary process has it */
+static void
+rte_eal_config_reattach(void)
+{
+       struct rte_mem_config *mem_config;
+       void *rte_mem_cfg_addr;
+
+       if (internal_config.no_shconf)
+               return;
+
+       /* save the address primary process has mapped shared config to */
+       rte_mem_cfg_addr = (void *) (uintptr_t) rte_config.mem_config->mem_cfg_addr;
+
+       /* unmap original config */
+       munmap(rte_config.mem_config, sizeof(struct rte_mem_config));
+
+       /* remap the config at proper address */
+       mem_config = (struct rte_mem_config *) mmap(rte_mem_cfg_addr,
+                       sizeof(*mem_config), PROT_READ | PROT_WRITE, MAP_SHARED,
+                       mem_cfg_fd, 0);
+       if (mem_config == MAP_FAILED || mem_config != rte_mem_cfg_addr) {
+               if (mem_config != MAP_FAILED)
+                       /* errno is stale, don't use */
+                       rte_panic("Cannot mmap memory for rte_config at [%p], got [%p]"
+                                 " - please use '--base-virtaddr' option\n",
+                                 rte_mem_cfg_addr, mem_config);
+               else
+                       rte_panic("Cannot mmap memory for rte_config! error %i (%s)\n",
+                                 errno, strerror(errno));
+       }
+       close(mem_cfg_fd);
+
+       rte_config.mem_config = mem_config;
+}
+
+/* Detect if we are a primary or a secondary process */
+enum rte_proc_type_t
+eal_proc_type_detect(void)
+{
+       enum rte_proc_type_t ptype = RTE_PROC_PRIMARY;
+       const char *pathname = eal_runtime_config_path();
+
+       /* if there no shared config, there can be no secondary processes */
+       if (!internal_config.no_shconf) {
+               /* if we can open the file but not get a write-lock we are a
+                * secondary process. NOTE: if we get a file handle back, we
+                * keep that open and don't close it to prevent a race condition
+                * between multiple opens.
+                */
+               if (((mem_cfg_fd = open(pathname, O_RDWR)) >= 0) &&
+                               (fcntl(mem_cfg_fd, F_SETLK, &wr_lock) < 0))
+                       ptype = RTE_PROC_SECONDARY;
+       }
+
+       RTE_LOG(INFO, EAL, "Auto-detected process type: %s\n",
+                       ptype == RTE_PROC_PRIMARY ? "PRIMARY" : "SECONDARY");
+
+       return ptype;
+}
+
+/* copies data from internal config to shared config */
+static void
+eal_update_mem_config(void)
+{
+       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+       mcfg->legacy_mem = internal_config.legacy_mem;
+       mcfg->single_file_segments = internal_config.single_file_segments;
+}
+
+/* copies data from shared config to internal config */
+static void
+eal_update_internal_config(void)
+{
+       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+       internal_config.legacy_mem = mcfg->legacy_mem;
+       internal_config.single_file_segments = mcfg->single_file_segments;
+}
+
+/* Sets up rte_config structure with the pointer to shared memory config.*/
+static void
+rte_config_init(void)
+{
+       rte_config.process_type = internal_config.process_type;
+
+       switch (rte_config.process_type){
+       case RTE_PROC_PRIMARY:
+               rte_eal_config_create();
+               eal_update_mem_config();
+               break;
+       case RTE_PROC_SECONDARY:
+               rte_eal_config_attach();
+               rte_eal_mcfg_wait_complete(rte_config.mem_config);
+               rte_eal_config_reattach();
+               eal_update_internal_config();
+               break;
+       case RTE_PROC_AUTO:
+       case RTE_PROC_INVALID:
+               rte_panic("Invalid process type\n");
+       }
+}
+
+/* Unlocks hugepage directories that were locked by eal_hugepage_info_init */
+static void
+eal_hugedirs_unlock(void)
+{
+       int i;
+
+       for (i = 0; i < MAX_HUGEPAGE_SIZES; i++)
+       {
+               /* skip uninitialized */
+               if (internal_config.hugepage_info[i].lock_descriptor < 0)
+                       continue;
+               /* unlock hugepage file */
+               flock(internal_config.hugepage_info[i].lock_descriptor, LOCK_UN);
+               close(internal_config.hugepage_info[i].lock_descriptor);
+               /* reset the field */
+               internal_config.hugepage_info[i].lock_descriptor = -1;
+       }
+}
+
+/* display usage */
+static void
+eal_usage(const char *prgname)
+{
+       printf("\nUsage: %s ", prgname);
+       eal_common_usage();
+       printf("EAL Linux options:\n"
+              "  --"OPT_SOCKET_MEM"        Memory to allocate on sockets (comma separated values)\n"
+              "  --"OPT_SOCKET_LIMIT"      Limit memory allocation on sockets (comma separated values)\n"
+              "  --"OPT_HUGE_DIR"          Directory where hugetlbfs is mounted\n"
+              "  --"OPT_FILE_PREFIX"       Prefix for hugepage filenames\n"
+              "  --"OPT_BASE_VIRTADDR"     Base virtual address\n"
+              "  --"OPT_CREATE_UIO_DEV"    Create /dev/uioX (usually done by hotplug)\n"
+              "  --"OPT_VFIO_INTR"         Interrupt mode for VFIO (legacy|msi|msix)\n"
+              "  --"OPT_LEGACY_MEM"        Legacy memory mode (no dynamic allocation, contiguous segments)\n"
+              "  --"OPT_SINGLE_FILE_SEGMENTS" Put all hugepage memory in single files\n"
+              "  --"OPT_MATCH_ALLOCATIONS" Free hugepages exactly as allocated\n"
+              "\n");
+       /* Allow the application to print its usage message too if hook is set */
+       if ( rte_application_usage_hook ) {
+               printf("===== Application Usage =====\n\n");
+               rte_application_usage_hook(prgname);
+       }
+}
+
+/* Set a per-application usage message */
+rte_usage_hook_t
+rte_set_application_usage_hook( rte_usage_hook_t usage_func )
+{
+       rte_usage_hook_t        old_func;
+
+       /* Will be NULL on the first call to denote the last usage routine. */
+       old_func                                        = rte_application_usage_hook;
+       rte_application_usage_hook      = usage_func;
+
+       return old_func;
+}
+
+static int
+eal_parse_socket_arg(char *strval, volatile uint64_t *socket_arg)
+{
+       char * arg[RTE_MAX_NUMA_NODES];
+       char *end;
+       int arg_num, i, len;
+       uint64_t total_mem = 0;
+
+       len = strnlen(strval, SOCKET_MEM_STRLEN);
+       if (len == SOCKET_MEM_STRLEN) {
+               RTE_LOG(ERR, EAL, "--socket-mem is too long\n");
+               return -1;
+       }
+
+       /* all other error cases will be caught later */
+       if (!isdigit(strval[len-1]))
+               return -1;
+
+       /* split the optarg into separate socket values */
+       arg_num = rte_strsplit(strval, len,
+                       arg, RTE_MAX_NUMA_NODES, ',');
+
+       /* if split failed, or 0 arguments */
+       if (arg_num <= 0)
+               return -1;
+
+       /* parse each defined socket option */
+       errno = 0;
+       for (i = 0; i < arg_num; i++) {
+               uint64_t val;
+               end = NULL;
+               val = strtoull(arg[i], &end, 10);
+
+               /* check for invalid input */
+               if ((errno != 0)  ||
+                               (arg[i][0] == '\0') || (end == NULL) || (*end != '\0'))
+                       return -1;
+               val <<= 20;
+               total_mem += val;
+               socket_arg[i] = val;
+       }
+
+       return 0;
+}
+
+static int
+eal_parse_base_virtaddr(const char *arg)
+{
+       char *end;
+       uint64_t addr;
+
+       errno = 0;
+       addr = strtoull(arg, &end, 16);
+
+       /* check for errors */
+       if ((errno != 0) || (arg[0] == '\0') || end == NULL || (*end != '\0'))
+               return -1;
+
+       /* make sure we don't exceed 32-bit boundary on 32-bit target */
+#ifndef RTE_ARCH_64
+       if (addr >= UINTPTR_MAX)
+               return -1;
+#endif
+
+       /* align the addr on 16M boundary, 16MB is the minimum huge page
+        * size on IBM Power architecture. If the addr is aligned to 16MB,
+        * it can align to 2MB for x86. So this alignment can also be used
+        * on x86 */
+       internal_config.base_virtaddr =
+               RTE_PTR_ALIGN_CEIL((uintptr_t)addr, (size_t)RTE_PGSIZE_16M);
+
+       return 0;
+}
+
+static int
+eal_parse_vfio_intr(const char *mode)
+{
+       unsigned i;
+       static struct {
+               const char *name;
+               enum rte_intr_mode value;
+       } map[] = {
+               { "legacy", RTE_INTR_MODE_LEGACY },
+               { "msi", RTE_INTR_MODE_MSI },
+               { "msix", RTE_INTR_MODE_MSIX },
+       };
+
+       for (i = 0; i < RTE_DIM(map); i++) {
+               if (!strcmp(mode, map[i].name)) {
+                       internal_config.vfio_intr_mode = map[i].value;
+                       return 0;
+               }
+       }
+       return -1;
+}
+
+/* Parse the arguments for --log-level only */
+static void
+eal_log_level_parse(int argc, char **argv)
+{
+       int opt;
+       char **argvopt;
+       int option_index;
+       const int old_optind = optind;
+       const int old_optopt = optopt;
+       char * const old_optarg = optarg;
+
+       argvopt = argv;
+       optind = 1;
+
+       while ((opt = getopt_long(argc, argvopt, eal_short_options,
+                                 eal_long_options, &option_index)) != EOF) {
+
+               int ret;
+
+               /* getopt is not happy, stop right now */
+               if (opt == '?')
+                       break;
+
+               ret = (opt == OPT_LOG_LEVEL_NUM) ?
+                       eal_parse_common_option(opt, optarg, &internal_config) : 0;
+
+               /* common parser is not happy */
+               if (ret < 0)
+                       break;
+       }
+
+       /* restore getopt lib */
+       optind = old_optind;
+       optopt = old_optopt;
+       optarg = old_optarg;
+}
+
+/* Parse the argument given in the command line of the application */
+static int
+eal_parse_args(int argc, char **argv)
+{
+       int opt, ret;
+       char **argvopt;
+       int option_index;
+       char *prgname = argv[0];
+       const int old_optind = optind;
+       const int old_optopt = optopt;
+       char * const old_optarg = optarg;
+
+       argvopt = argv;
+       optind = 1;
+       opterr = 0;
+
+       while ((opt = getopt_long(argc, argvopt, eal_short_options,
+                                 eal_long_options, &option_index)) != EOF) {
+
+               /*
+                * getopt didn't recognise the option, lets parse the
+                * registered options to see if the flag is valid
+                */
+               if (opt == '?') {
+                       ret = rte_option_parse(argv[optind-1]);
+                       if (ret == 0)
+                               continue;
+
+                       eal_usage(prgname);
+                       ret = -1;
+                       goto out;
+               }
+
+               ret = eal_parse_common_option(opt, optarg, &internal_config);
+               /* common parser is not happy */
+               if (ret < 0) {
+                       eal_usage(prgname);
+                       ret = -1;
+                       goto out;
+               }
+               /* common parser handled this option */
+               if (ret == 0)
+                       continue;
+
+               switch (opt) {
+               case 'h':
+                       eal_usage(prgname);
+                       exit(EXIT_SUCCESS);
+
+               case OPT_HUGE_DIR_NUM:
+               {
+                       char *hdir = strdup(optarg);
+                       if (hdir == NULL)
+                               RTE_LOG(ERR, EAL, "Could not store hugepage directory\n");
+                       else {
+                               /* free old hugepage dir */
+                               if (internal_config.hugepage_dir != NULL)
+                                       free(internal_config.hugepage_dir);
+                               internal_config.hugepage_dir = hdir;
+                       }
+                       break;
+               }
+               case OPT_FILE_PREFIX_NUM:
+               {
+                       char *prefix = strdup(optarg);
+                       if (prefix == NULL)
+                               RTE_LOG(ERR, EAL, "Could not store file prefix\n");
+                       else {
+                               /* free old prefix */
+                               if (internal_config.hugefile_prefix != NULL)
+                                       free(internal_config.hugefile_prefix);
+                               internal_config.hugefile_prefix = prefix;
+                       }
+                       break;
+               }
+               case OPT_SOCKET_MEM_NUM:
+                       if (eal_parse_socket_arg(optarg,
+                                       internal_config.socket_mem) < 0) {
+                               RTE_LOG(ERR, EAL, "invalid parameters for --"
+                                               OPT_SOCKET_MEM "\n");
+                               eal_usage(prgname);
+                               ret = -1;
+                               goto out;
+                       }
+                       internal_config.force_sockets = 1;
+                       break;
+
+               case OPT_SOCKET_LIMIT_NUM:
+                       if (eal_parse_socket_arg(optarg,
+                                       internal_config.socket_limit) < 0) {
+                               RTE_LOG(ERR, EAL, "invalid parameters for --"
+                                               OPT_SOCKET_LIMIT "\n");
+                               eal_usage(prgname);
+                               ret = -1;
+                               goto out;
+                       }
+                       internal_config.force_socket_limits = 1;
+                       break;
+
+               case OPT_BASE_VIRTADDR_NUM:
+                       if (eal_parse_base_virtaddr(optarg) < 0) {
+                               RTE_LOG(ERR, EAL, "invalid parameter for --"
+                                               OPT_BASE_VIRTADDR "\n");
+                               eal_usage(prgname);
+                               ret = -1;
+                               goto out;
+                       }
+                       break;
+
+               case OPT_VFIO_INTR_NUM:
+                       if (eal_parse_vfio_intr(optarg) < 0) {
+                               RTE_LOG(ERR, EAL, "invalid parameters for --"
+                                               OPT_VFIO_INTR "\n");
+                               eal_usage(prgname);
+                               ret = -1;
+                               goto out;
+                       }
+                       break;
+
+               case OPT_CREATE_UIO_DEV_NUM:
+                       internal_config.create_uio_dev = 1;
+                       break;
+
+               case OPT_MBUF_POOL_OPS_NAME_NUM:
+               {
+                       char *ops_name = strdup(optarg);
+                       if (ops_name == NULL)
+                               RTE_LOG(ERR, EAL, "Could not store mbuf pool ops name\n");
+                       else {
+                               /* free old ops name */
+                               if (internal_config.user_mbuf_pool_ops_name !=
+                                               NULL)
+                                       free(internal_config.user_mbuf_pool_ops_name);
+
+                               internal_config.user_mbuf_pool_ops_name =
+                                               ops_name;
+                       }
+                       break;
+               }
+               case OPT_MATCH_ALLOCATIONS_NUM:
+                       internal_config.match_allocations = 1;
+                       break;
+
+               default:
+                       if (opt < OPT_LONG_MIN_NUM && isprint(opt)) {
+                               RTE_LOG(ERR, EAL, "Option %c is not supported "
+                                       "on Linux\n", opt);
+                       } else if (opt >= OPT_LONG_MIN_NUM &&
+                                  opt < OPT_LONG_MAX_NUM) {
+                               RTE_LOG(ERR, EAL, "Option %s is not supported "
+                                       "on Linux\n",
+                                       eal_long_options[option_index].name);
+                       } else {
+                               RTE_LOG(ERR, EAL, "Option %d is not supported "
+                                       "on Linux\n", opt);
+                       }
+                       eal_usage(prgname);
+                       ret = -1;
+                       goto out;
+               }
+       }
+
+       /* create runtime data directory */
+       if (internal_config.no_shconf == 0 &&
+                       eal_create_runtime_dir() < 0) {
+               RTE_LOG(ERR, EAL, "Cannot create runtime directory\n");
+               ret = -1;
+               goto out;
+       }
+
+       if (eal_adjust_config(&internal_config) != 0) {
+               ret = -1;
+               goto out;
+       }
+
+       /* sanity checks */
+       if (eal_check_common_options(&internal_config) != 0) {
+               eal_usage(prgname);
+               ret = -1;
+               goto out;
+       }
+
+       if (optind >= 0)
+               argv[optind-1] = prgname;
+       ret = optind-1;
+
+out:
+       /* restore getopt lib */
+       optind = old_optind;
+       optopt = old_optopt;
+       optarg = old_optarg;
+
+       return ret;
+}
+
+static int
+check_socket(const struct rte_memseg_list *msl, void *arg)
+{
+       int *socket_id = arg;
+
+       if (msl->external)
+               return 0;
+
+       return *socket_id == msl->socket_id;
+}
+
+static void
+eal_check_mem_on_local_socket(void)
+{
+       int socket_id;
+
+       socket_id = rte_lcore_to_socket_id(rte_config.master_lcore);
+
+       if (rte_memseg_list_walk(check_socket, &socket_id) == 0)
+               RTE_LOG(WARNING, EAL, "WARNING: Master core has no memory on local socket!\n");
+}
+
+static int
+sync_func(__attribute__((unused)) void *arg)
+{
+       return 0;
+}
+
+inline static void
+rte_eal_mcfg_complete(void)
+{
+       /* ALL shared mem_config related INIT DONE */
+       if (rte_config.process_type == RTE_PROC_PRIMARY)
+               rte_config.mem_config->magic = RTE_MAGIC;
+
+       internal_config.init_complete = 1;
+}
+
+/*
+ * Request iopl privilege for all RPL, returns 0 on success
+ * iopl() call is mostly for the i386 architecture. For other architectures,
+ * return -1 to indicate IO privilege can't be changed in this way.
+ */
+int
+rte_eal_iopl_init(void)
+{
+#if defined(RTE_ARCH_X86)
+       if (iopl(3) != 0)
+               return -1;
+#endif
+       return 0;
+}
+
+#ifdef VFIO_PRESENT
+static int rte_eal_vfio_setup(void)
+{
+       if (rte_vfio_enable("vfio"))
+               return -1;
+
+       return 0;
+}
+#endif
+
+static void rte_eal_init_alert(const char *msg)
+{
+       fprintf(stderr, "EAL: FATAL: %s\n", msg);
+       RTE_LOG(ERR, EAL, "%s\n", msg);
+}
+
+/* Launch threads, called at application init(). */
+int
+rte_eal_init(int argc, char **argv)
+{
+       int i, fctret, ret;
+       pthread_t thread_id;
+       static rte_atomic32_t run_once = RTE_ATOMIC32_INIT(0);
+       const char *p;
+       static char logid[PATH_MAX];
+       char cpuset[RTE_CPU_AFFINITY_STR_LEN];
+       char thread_name[RTE_MAX_THREAD_NAME_LEN];
+
+       /* checks if the machine is adequate */
+       if (!rte_cpu_is_supported()) {
+               rte_eal_init_alert("unsupported cpu type.");
+               rte_errno = ENOTSUP;
+               return -1;
+       }
+
+       if (!rte_atomic32_test_and_set(&run_once)) {
+               rte_eal_init_alert("already called initialization.");
+               rte_errno = EALREADY;
+               return -1;
+       }
+
+       p = strrchr(argv[0], '/');
+       strlcpy(logid, p ? p + 1 : argv[0], sizeof(logid));
+       thread_id = pthread_self();
+
+       eal_reset_internal_config(&internal_config);
+
+       /* set log level as early as possible */
+       eal_log_level_parse(argc, argv);
+
+       if (rte_eal_cpu_init() < 0) {
+               rte_eal_init_alert("Cannot detect lcores.");
+               rte_errno = ENOTSUP;
+               return -1;
+       }
+
+       fctret = eal_parse_args(argc, argv);
+       if (fctret < 0) {
+               rte_eal_init_alert("Invalid 'command line' arguments.");
+               rte_errno = EINVAL;
+               rte_atomic32_clear(&run_once);
+               return -1;
+       }
+
+       if (eal_plugins_init() < 0) {
+               rte_eal_init_alert("Cannot init plugins");
+               rte_errno = EINVAL;
+               rte_atomic32_clear(&run_once);
+               return -1;
+       }
+
+       if (eal_option_device_parse()) {
+               rte_errno = ENODEV;
+               rte_atomic32_clear(&run_once);
+               return -1;
+       }
+
+       rte_config_init();
+
+       if (rte_eal_intr_init() < 0) {
+               rte_eal_init_alert("Cannot init interrupt-handling thread");
+               return -1;
+       }
+
+       /* Put mp channel init before bus scan so that we can init the vdev
+        * bus through mp channel in the secondary process before the bus scan.
+        */
+       if (rte_mp_channel_init() < 0) {
+               rte_eal_init_alert("failed to init mp channel");
+               if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+                       rte_errno = EFAULT;
+                       return -1;
+               }
+       }
+
+       /* register multi-process action callbacks for hotplug */
+       if (rte_mp_dev_hotplug_init() < 0) {
+               rte_eal_init_alert("failed to register mp callback for hotplug");
+               return -1;
+       }
+
+       if (rte_bus_scan()) {
+               rte_eal_init_alert("Cannot scan the buses for devices");
+               rte_errno = ENODEV;
+               rte_atomic32_clear(&run_once);
+               return -1;
+       }
+
+       /* if no EAL option "--iova-mode=<pa|va>", use bus IOVA scheme */
+       if (internal_config.iova_mode == RTE_IOVA_DC) {
+               /* autodetect the IOVA mapping mode (default is RTE_IOVA_PA) */
+               rte_eal_get_configuration()->iova_mode =
+                       rte_bus_get_iommu_class();
+
+               /* Workaround for KNI which requires physical address to work */
+               if (rte_eal_get_configuration()->iova_mode == RTE_IOVA_VA &&
+                               rte_eal_check_module("rte_kni") == 1) {
+                       rte_eal_get_configuration()->iova_mode = RTE_IOVA_PA;
+                       RTE_LOG(WARNING, EAL,
+                               "Some devices want IOVA as VA but PA will be used because.. "
+                               "KNI module inserted\n");
+               }
+       } else {
+               rte_eal_get_configuration()->iova_mode =
+                       internal_config.iova_mode;
+       }
+
+       if (internal_config.no_hugetlbfs == 0) {
+               /* rte_config isn't initialized yet */
+               ret = internal_config.process_type == RTE_PROC_PRIMARY ?
+                               eal_hugepage_info_init() :
+                               eal_hugepage_info_read();
+               if (ret < 0) {
+                       rte_eal_init_alert("Cannot get hugepage information.");
+                       rte_errno = EACCES;
+                       rte_atomic32_clear(&run_once);
+                       return -1;
+               }
+       }
+
+       if (internal_config.memory == 0 && internal_config.force_sockets == 0) {
+               if (internal_config.no_hugetlbfs)
+                       internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE;
+       }
+
+       if (internal_config.vmware_tsc_map == 1) {
+#ifdef RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT
+               rte_cycles_vmware_tsc_map = 1;
+               RTE_LOG (DEBUG, EAL, "Using VMWARE TSC MAP, "
+                               "you must have monitor_control.pseudo_perfctr = TRUE\n");
+#else
+               RTE_LOG (WARNING, EAL, "Ignoring --vmware-tsc-map because "
+                               "RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT is not set\n");
+#endif
+       }
+
+       rte_srand(rte_rdtsc());
+
+       if (rte_eal_log_init(logid, internal_config.syslog_facility) < 0) {
+               rte_eal_init_alert("Cannot init logging.");
+               rte_errno = ENOMEM;
+               rte_atomic32_clear(&run_once);
+               return -1;
+       }
+
+#ifdef VFIO_PRESENT
+       if (rte_eal_vfio_setup() < 0) {
+               rte_eal_init_alert("Cannot init VFIO");
+               rte_errno = EAGAIN;
+               rte_atomic32_clear(&run_once);
+               return -1;
+       }
+#endif
+       /* in secondary processes, memory init may allocate additional fbarrays
+        * not present in primary processes, so to avoid any potential issues,
+        * initialize memzones first.
+        */
+       if (rte_eal_memzone_init() < 0) {
+               rte_eal_init_alert("Cannot init memzone");
+               rte_errno = ENODEV;
+               return -1;
+       }
+
+       if (rte_eal_memory_init() < 0) {
+               rte_eal_init_alert("Cannot init memory");
+               rte_errno = ENOMEM;
+               return -1;
+       }
+
+       /* the directories are locked during eal_hugepage_info_init */
+       eal_hugedirs_unlock();
+
+       if (rte_eal_malloc_heap_init() < 0) {
+               rte_eal_init_alert("Cannot init malloc heap");
+               rte_errno = ENODEV;
+               return -1;
+       }
+
+       if (rte_eal_tailqs_init() < 0) {
+               rte_eal_init_alert("Cannot init tail queues for objects");
+               rte_errno = EFAULT;
+               return -1;
+       }
+
+       if (rte_eal_alarm_init() < 0) {
+               rte_eal_init_alert("Cannot init interrupt-handling thread");
+               /* rte_eal_alarm_init sets rte_errno on failure. */
+               return -1;
+       }
+
+       if (rte_eal_timer_init() < 0) {
+               rte_eal_init_alert("Cannot init HPET or TSC timers");
+               rte_errno = ENOTSUP;
+               return -1;
+       }
+
+       eal_check_mem_on_local_socket();
+
+       eal_thread_init_master(rte_config.master_lcore);
+
+       ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset));
+
+       RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%zx;cpuset=[%s%s])\n",
+               rte_config.master_lcore, (uintptr_t)thread_id, cpuset,
+               ret == 0 ? "" : "...");
+
+       RTE_LCORE_FOREACH_SLAVE(i) {
+
+               /*
+                * create communication pipes between master thread
+                * and children
+                */
+               if (pipe(lcore_config[i].pipe_master2slave) < 0)
+                       rte_panic("Cannot create pipe\n");
+               if (pipe(lcore_config[i].pipe_slave2master) < 0)
+                       rte_panic("Cannot create pipe\n");
+
+               lcore_config[i].state = WAIT;
+
+               /* create a thread for each lcore */
+               ret = pthread_create(&lcore_config[i].thread_id, NULL,
+                                    eal_thread_loop, NULL);
+               if (ret != 0)
+                       rte_panic("Cannot create thread\n");
+
+               /* Set thread_name for aid in debugging. */
+               snprintf(thread_name, sizeof(thread_name),
+                       "lcore-slave-%d", i);
+               ret = rte_thread_setname(lcore_config[i].thread_id,
+                                               thread_name);
+               if (ret != 0)
+                       RTE_LOG(DEBUG, EAL,
+                               "Cannot set name for lcore thread\n");
+       }
+
+       /*
+        * Launch a dummy function on all slave lcores, so that master lcore
+        * knows they are all ready when this function returns.
+        */
+       rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER);
+       rte_eal_mp_wait_lcore();
+
+       /* initialize services so vdevs register service during bus_probe. */
+       ret = rte_service_init();
+       if (ret) {
+               rte_eal_init_alert("rte_service_init() failed");
+               rte_errno = ENOEXEC;
+               return -1;
+       }
+
+       /* Probe all the buses and devices/drivers on them */
+       if (rte_bus_probe()) {
+               rte_eal_init_alert("Cannot probe devices");
+               rte_errno = ENOTSUP;
+               return -1;
+       }
+
+#ifdef VFIO_PRESENT
+       /* Register mp action after probe() so that we got enough info */
+       if (rte_vfio_is_enabled("vfio") && vfio_mp_sync_setup() < 0)
+               return -1;
+#endif
+
+       /* initialize default service/lcore mappings and start running. Ignore
+        * -ENOTSUP, as it indicates no service coremask passed to EAL.
+        */
+       ret = rte_service_start_with_defaults();
+       if (ret < 0 && ret != -ENOTSUP) {
+               rte_errno = ENOEXEC;
+               return -1;
+       }
+
+       /*
+        * Clean up unused files in runtime directory. We do this at the end of
+        * init and not at the beginning because we want to clean stuff up
+        * whether we are primary or secondary process, but we cannot remove
+        * primary process' files because secondary should be able to run even
+        * if primary process is dead.
+        *
+        * In no_shconf mode, no runtime directory is created in the first
+        * place, so no cleanup needed.
+        */
+       if (!internal_config.no_shconf && eal_clean_runtime_dir() < 0) {
+               rte_eal_init_alert("Cannot clear runtime directory");
+               return -1;
+       }
+
+       rte_eal_mcfg_complete();
+
+       /* Call each registered callback, if enabled */
+       rte_option_init();
+
+       return fctret;
+}
+
+static int
+mark_freeable(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
+               void *arg __rte_unused)
+{
+       /* ms is const, so find this memseg */
+       struct rte_memseg *found;
+
+       if (msl->external)
+               return 0;
+
+       found = rte_mem_virt2memseg(ms->addr, msl);
+
+       found->flags &= ~RTE_MEMSEG_FLAG_DO_NOT_FREE;
+
+       return 0;
+}
+
+int __rte_experimental
+rte_eal_cleanup(void)
+{
+       /* if we're in a primary process, we need to mark hugepages as freeable
+        * so that finalization can release them back to the system.
+        */
+       if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+               rte_memseg_walk(mark_freeable, NULL);
+       rte_service_finalize();
+       rte_mp_channel_cleanup();
+       eal_cleanup_config(&internal_config);
+       return 0;
+}
+
+/* get core role */
+enum rte_lcore_role_t
+rte_eal_lcore_role(unsigned lcore_id)
+{
+       return rte_config.lcore_role[lcore_id];
+}
+
+enum rte_proc_type_t
+rte_eal_process_type(void)
+{
+       return rte_config.process_type;
+}
+
+int rte_eal_has_hugepages(void)
+{
+       return !internal_config.no_hugetlbfs;
+}
+
+int rte_eal_has_pci(void)
+{
+       return !internal_config.no_pci;
+}
+
+int rte_eal_create_uio_dev(void)
+{
+       return internal_config.create_uio_dev;
+}
+
+enum rte_intr_mode
+rte_eal_vfio_intr_mode(void)
+{
+       return internal_config.vfio_intr_mode;
+}
+
+int
+rte_eal_check_module(const char *module_name)
+{
+       char sysfs_mod_name[PATH_MAX];
+       struct stat st;
+       int n;
+
+       if (NULL == module_name)
+               return -1;
+
+       /* Check if there is sysfs mounted */
+       if (stat("/sys/module", &st) != 0) {
+               RTE_LOG(DEBUG, EAL, "sysfs is not mounted! error %i (%s)\n",
+                       errno, strerror(errno));
+               return -1;
+       }
+
+       /* A module might be built-in, therefore try sysfs */
+       n = snprintf(sysfs_mod_name, PATH_MAX, "/sys/module/%s", module_name);
+       if (n < 0 || n >= PATH_MAX) {
+               RTE_LOG(DEBUG, EAL, "Could not format module path\n");
+               return -1;
+       }
+
+       if (stat(sysfs_mod_name, &st) != 0) {
+               RTE_LOG(DEBUG, EAL, "Module %s not found! error %i (%s)\n",
+                       sysfs_mod_name, errno, strerror(errno));
+               return 0;
+       }
+
+       /* Module has been found */
+       return 1;
+}
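
For reference, a minimal sketch of how rte_eal_check_module() above might be consumed; the module name "vfio_pci" and the wrapper function are illustrative only and not part of this patch (assumes <stdio.h> and the EAL headers):

    /* Illustrative caller, not part of the patch. */
    static int
    check_vfio_pci_loaded(void)
    {
        int ret = rte_eal_check_module("vfio_pci");

        if (ret < 0)        /* sysfs unavailable or bad argument */
            return -1;
        if (ret == 0)       /* module not present */
            printf("vfio_pci is not loaded\n");
        else                /* ret == 1: loaded or built in */
            printf("vfio_pci is available\n");
        return ret;
    }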
diff --git a/lib/librte_eal/linux/eal/eal_alarm.c b/lib/librte_eal/linux/eal/eal_alarm.c
new file mode 100644 (file)
index 0000000..840ede7
--- /dev/null
@@ -0,0 +1,243 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+#include <stdio.h>
+#include <stdint.h>
+#include <signal.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/queue.h>
+#include <sys/time.h>
+#include <sys/timerfd.h>
+
+#include <rte_memory.h>
+#include <rte_interrupts.h>
+#include <rte_alarm.h>
+#include <rte_common.h>
+#include <rte_per_lcore.h>
+#include <rte_eal.h>
+#include <rte_launch.h>
+#include <rte_lcore.h>
+#include <rte_errno.h>
+#include <rte_spinlock.h>
+#include <eal_private.h>
+
+#ifndef        TFD_NONBLOCK
+#include <fcntl.h>
+#define        TFD_NONBLOCK    O_NONBLOCK
+#endif
+
+#define NS_PER_US 1000
+#define US_PER_MS 1000
+#define MS_PER_S 1000
+#ifndef US_PER_S
+#define US_PER_S (US_PER_MS * MS_PER_S)
+#endif
+
+#ifdef CLOCK_MONOTONIC_RAW /* Defined in glibc bits/time.h */
+#define CLOCK_TYPE_ID CLOCK_MONOTONIC_RAW
+#else
+#define CLOCK_TYPE_ID CLOCK_MONOTONIC
+#endif
+
+struct alarm_entry {
+       LIST_ENTRY(alarm_entry) next;
+       struct timeval time;
+       rte_eal_alarm_callback cb_fn;
+       void *cb_arg;
+       volatile uint8_t executing;
+       volatile pthread_t executing_id;
+};
+
+static LIST_HEAD(alarm_list, alarm_entry) alarm_list = LIST_HEAD_INITIALIZER();
+static rte_spinlock_t alarm_list_lk = RTE_SPINLOCK_INITIALIZER;
+
+static struct rte_intr_handle intr_handle = {.fd = -1 };
+static int handler_registered = 0;
+static void eal_alarm_callback(void *arg);
+
+int
+rte_eal_alarm_init(void)
+{
+       intr_handle.type = RTE_INTR_HANDLE_ALARM;
+       /* create a timerfd file descriptor */
+       intr_handle.fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);
+       if (intr_handle.fd == -1)
+               goto error;
+
+       return 0;
+
+error:
+       rte_errno = errno;
+       return -1;
+}
+
+static void
+eal_alarm_callback(void *arg __rte_unused)
+{
+       struct timespec now;
+       struct alarm_entry *ap;
+
+       rte_spinlock_lock(&alarm_list_lk);
+       while ((ap = LIST_FIRST(&alarm_list)) != NULL &&
+                       clock_gettime(CLOCK_TYPE_ID, &now) == 0 &&
+                       (ap->time.tv_sec < now.tv_sec || (ap->time.tv_sec == now.tv_sec &&
+                                               (ap->time.tv_usec * NS_PER_US) <= now.tv_nsec))) {
+               ap->executing = 1;
+               ap->executing_id = pthread_self();
+               rte_spinlock_unlock(&alarm_list_lk);
+
+               ap->cb_fn(ap->cb_arg);
+
+               rte_spinlock_lock(&alarm_list_lk);
+
+               LIST_REMOVE(ap, next);
+               free(ap);
+       }
+
+       if (!LIST_EMPTY(&alarm_list)) {
+               struct itimerspec atime = { .it_interval = { 0, 0 } };
+
+               ap = LIST_FIRST(&alarm_list);
+               atime.it_value.tv_sec = ap->time.tv_sec;
+               atime.it_value.tv_nsec = ap->time.tv_usec * NS_PER_US;
+               /* perform borrow for subtraction if necessary */
+               if (now.tv_nsec > (ap->time.tv_usec * NS_PER_US))
+                       atime.it_value.tv_sec--, atime.it_value.tv_nsec += US_PER_S * NS_PER_US;
+
+               atime.it_value.tv_sec -= now.tv_sec;
+               atime.it_value.tv_nsec -= now.tv_nsec;
+               timerfd_settime(intr_handle.fd, 0, &atime, NULL);
+       }
+       rte_spinlock_unlock(&alarm_list_lk);
+}
+
+int
+rte_eal_alarm_set(uint64_t us, rte_eal_alarm_callback cb_fn, void *cb_arg)
+{
+       struct timespec now;
+       int ret = 0;
+       struct alarm_entry *ap, *new_alarm;
+
+       /* Check parameters, including that us won't cause a uint64_t overflow */
+       if (us < 1 || us > (UINT64_MAX - US_PER_S) || cb_fn == NULL)
+               return -EINVAL;
+
+       new_alarm = calloc(1, sizeof(*new_alarm));
+       if (new_alarm == NULL)
+               return -ENOMEM;
+
+       /* use current time to calculate absolute time of alarm */
+       clock_gettime(CLOCK_TYPE_ID, &now);
+
+       new_alarm->cb_fn = cb_fn;
+       new_alarm->cb_arg = cb_arg;
+       new_alarm->time.tv_usec = ((now.tv_nsec / NS_PER_US) + us) % US_PER_S;
+       new_alarm->time.tv_sec = now.tv_sec + (((now.tv_nsec / NS_PER_US) + us) / US_PER_S);
+
+       rte_spinlock_lock(&alarm_list_lk);
+       if (!handler_registered) {
+               ret |= rte_intr_callback_register(&intr_handle,
+                               eal_alarm_callback, NULL);
+               handler_registered = (ret == 0) ? 1 : 0;
+       }
+
+       if (LIST_EMPTY(&alarm_list))
+               LIST_INSERT_HEAD(&alarm_list, new_alarm, next);
+       else {
+               LIST_FOREACH(ap, &alarm_list, next) {
+                       if (ap->time.tv_sec > new_alarm->time.tv_sec ||
+                                       (ap->time.tv_sec == new_alarm->time.tv_sec &&
+                                                       ap->time.tv_usec > new_alarm->time.tv_usec)){
+                               LIST_INSERT_BEFORE(ap, new_alarm, next);
+                               break;
+                       }
+                       if (LIST_NEXT(ap, next) == NULL) {
+                               LIST_INSERT_AFTER(ap, new_alarm, next);
+                               break;
+                       }
+               }
+       }
+
+       if (LIST_FIRST(&alarm_list) == new_alarm) {
+               struct itimerspec alarm_time = {
+                       .it_interval = {0, 0},
+                       .it_value = {
+                               .tv_sec = us / US_PER_S,
+                               .tv_nsec = (us % US_PER_S) * NS_PER_US,
+                       },
+               };
+               ret |= timerfd_settime(intr_handle.fd, 0, &alarm_time, NULL);
+       }
+       rte_spinlock_unlock(&alarm_list_lk);
+
+       return ret;
+}
+
+int
+rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn, void *cb_arg)
+{
+       struct alarm_entry *ap, *ap_prev;
+       int count = 0;
+       int err = 0;
+       int executing;
+
+       if (!cb_fn) {
+               rte_errno = EINVAL;
+               return -1;
+       }
+
+       do {
+               executing = 0;
+               rte_spinlock_lock(&alarm_list_lk);
+               /* remove any matches at the start of the list */
+               while ((ap = LIST_FIRST(&alarm_list)) != NULL &&
+                               cb_fn == ap->cb_fn &&
+                               (cb_arg == (void *)-1 || cb_arg == ap->cb_arg)) {
+
+                       if (ap->executing == 0) {
+                               LIST_REMOVE(ap, next);
+                               free(ap);
+                               count++;
+                       } else {
+                               /* If called from another context, note that the
+                                * alarm is executing so the loop can spin until
+                                * it finishes. Otherwise we are trying to cancel
+                                * ourselves - flag that with EINPROGRESS. */
+                               if (pthread_equal(ap->executing_id, pthread_self()) == 0)
+                                       executing++;
+                               else
+                                       err = EINPROGRESS;
+
+                               break;
+                       }
+               }
+               ap_prev = ap;
+
+               /* now go through list, removing entries not at start */
+               LIST_FOREACH(ap, &alarm_list, next) {
+                       /* this won't be true first time through */
+                       if (cb_fn == ap->cb_fn &&
+                                       (cb_arg == (void *)-1 || cb_arg == ap->cb_arg)) {
+
+                               if (ap->executing == 0) {
+                                       LIST_REMOVE(ap, next);
+                                       free(ap);
+                                       count++;
+                                       ap = ap_prev;
+                               } else if (pthread_equal(ap->executing_id, pthread_self()) == 0)
+                                       executing++;
+                               else
+                                       err = EINPROGRESS;
+                       }
+                       ap_prev = ap;
+               }
+               rte_spinlock_unlock(&alarm_list_lk);
+       } while (executing != 0);
+
+       if (count == 0 && err == 0)
+               rte_errno = ENOENT;
+       else if (err)
+               rte_errno = err;
+
+       return count;
+}
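
A brief usage sketch for the alarm API implemented above; the callback name and the 500 ms delay are invented for illustration (assumes <stdio.h> and <rte_alarm.h>, after a successful rte_eal_init()):

    /* Illustrative only: arm a one-shot alarm ~500 ms in the future. */
    static void
    example_alarm_cb(void *arg)        /* hypothetical callback */
    {
        printf("alarm fired, arg=%p\n", arg);
    }

    static void
    arm_and_cancel(void)
    {
        if (rte_eal_alarm_set(500 * 1000 /* us */, example_alarm_cb, NULL) != 0)
            printf("failed to arm alarm\n");

        /* (void *)-1 matches any cb_arg, as handled in rte_eal_alarm_cancel() */
        rte_eal_alarm_cancel(example_alarm_cb, (void *)-1);
    }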
diff --git a/lib/librte_eal/linux/eal/eal_cpuflags.c b/lib/librte_eal/linux/eal/eal_cpuflags.c
new file mode 100644 (file)
index 0000000..d38296e
--- /dev/null
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2018 Red Hat, Inc.
+ */
+
+#include <elf.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#if defined(__GLIBC__) && defined(__GLIBC_PREREQ)
+#if __GLIBC_PREREQ(2, 16)
+#include <sys/auxv.h>
+#define HAS_AUXV 1
+#endif
+#endif
+
+#include <rte_cpuflags.h>
+
+#ifndef HAS_AUXV
+static unsigned long
+getauxval(unsigned long type __rte_unused)
+{
+       errno = ENOTSUP;
+       return 0;
+}
+#endif
+
+#ifdef RTE_ARCH_64
+typedef Elf64_auxv_t Internal_Elfx_auxv_t;
+#else
+typedef Elf32_auxv_t Internal_Elfx_auxv_t;
+#endif
+
+/**
+ * Provides a method for retrieving values from the auxiliary vector and
+ * possibly running a string comparison.
+ *
+ * @return Always returns a result.  When the result is 0, check errno
+ * to see if an error occurred during processing.
+ */
+static unsigned long
+_rte_cpu_getauxval(unsigned long type, const char *str)
+{
+       unsigned long val;
+
+       errno = 0;
+       val = getauxval(type);
+
+       if (!val && (errno == ENOTSUP || errno == ENOENT)) {
+               int auxv_fd = open("/proc/self/auxv", O_RDONLY);
+               Internal_Elfx_auxv_t auxv;
+
+               if (auxv_fd == -1)
+                       return 0;
+
+               errno = ENOENT;
+               while (read(auxv_fd, &auxv, sizeof(auxv)) == sizeof(auxv)) {
+                       if (auxv.a_type == type) {
+                               errno = 0;
+                               val = auxv.a_un.a_val;
+                               if (str)
+                                       val = strcmp((const char *)val, str);
+                               break;
+                       }
+               }
+               close(auxv_fd);
+       }
+
+       return val;
+}
+
+unsigned long
+rte_cpu_getauxval(unsigned long type)
+{
+       return _rte_cpu_getauxval(type, NULL);
+}
+
+int
+rte_cpu_strcmp_auxval(unsigned long type, const char *str)
+{
+       return _rte_cpu_getauxval(type, str);
+}
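
A short sketch of the two helpers above; AT_HWCAP and AT_PLATFORM come from <elf.h>/<sys/auxv.h>, and the "x86_64" comparison string is only an example:

    /* Illustrative only: query the auxiliary vector through the EAL helpers. */
    #include <stdio.h>
    #include <elf.h>            /* AT_HWCAP, AT_PLATFORM */
    #include <rte_cpuflags.h>

    static void
    show_auxv(void)
    {
        unsigned long hwcap = rte_cpu_getauxval(AT_HWCAP);

        /* rte_cpu_strcmp_auxval() returns the strcmp() result, so 0 == match */
        if (rte_cpu_strcmp_auxval(AT_PLATFORM, "x86_64") == 0)
            printf("running on an x86_64 platform, hwcap=0x%lx\n", hwcap);
    }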
diff --git a/lib/librte_eal/linux/eal/eal_debug.c b/lib/librte_eal/linux/eal/eal_debug.c
new file mode 100644 (file)
index 0000000..5d92500
--- /dev/null
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#ifdef RTE_BACKTRACE
+#include <execinfo.h>
+#endif
+#include <stdarg.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+
+#include <rte_log.h>
+#include <rte_debug.h>
+#include <rte_common.h>
+#include <rte_eal.h>
+
+#define BACKTRACE_SIZE 256
+
+/* dump the stack of the calling core */
+void rte_dump_stack(void)
+{
+#ifdef RTE_BACKTRACE
+       void *func[BACKTRACE_SIZE];
+       char **symb = NULL;
+       int size;
+
+       size = backtrace(func, BACKTRACE_SIZE);
+       symb = backtrace_symbols(func, size);
+
+       if (symb == NULL)
+               return;
+
+       while (size > 0) {
+               rte_log(RTE_LOG_ERR, RTE_LOGTYPE_EAL,
+                       "%d: [%s]\n", size, symb[size - 1]);
+               size--;
+       }
+
+       free(symb);
+#endif /* RTE_BACKTRACE */
+}
+
+/* not implemented in this environment */
+void rte_dump_registers(void)
+{
+       return;
+}
+
+/* call abort(), it will generate a coredump if enabled */
+void __rte_panic(const char *funcname, const char *format, ...)
+{
+       va_list ap;
+
+       rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, "PANIC in %s():\n", funcname);
+       va_start(ap, format);
+       rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap);
+       va_end(ap);
+       rte_dump_stack();
+       rte_dump_registers();
+       abort();
+}
+
+/*
+ * Like rte_panic(), this terminates the application. However, no traceback is
+ * provided and no core-dump is generated.
+ */
+void
+rte_exit(int exit_code, const char *format, ...)
+{
+       va_list ap;
+
+       if (exit_code != 0)
+               RTE_LOG(CRIT, EAL, "Error - exiting with code: %d\n"
+                               "  Cause: ", exit_code);
+
+       va_start(ap, format);
+       rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap);
+       va_end(ap);
+
+#ifndef RTE_EAL_ALWAYS_PANIC_ON_ERROR
+       if (rte_eal_cleanup() != 0)
+               RTE_LOG(CRIT, EAL,
+                       "EAL could not release all resources\n");
+       exit(exit_code);
+#else
+       rte_dump_stack();
+       rte_dump_registers();
+       abort();
+#endif
+}
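
As a usage note for the helper above: rte_exit() takes a printf-style format string and, unless RTE_EAL_ALWAYS_PANIC_ON_ERROR is defined, releases EAL resources via rte_eal_cleanup() before exiting. A hypothetical caller (the port check is invented for illustration; assumes the usual EAL headers):

    /* Illustrative only. */
    static void
    check_port(unsigned int port_id, unsigned int nb_ports)
    {
        if (port_id >= nb_ports)
            rte_exit(EXIT_FAILURE, "invalid port id %u (only %u ports)\n",
                     port_id, nb_ports);
    }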
diff --git a/lib/librte_eal/linux/eal/eal_dev.c b/lib/librte_eal/linux/eal/eal_dev.c
new file mode 100644 (file)
index 0000000..2830c86
--- /dev/null
@@ -0,0 +1,396 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <sys/socket.h>
+#include <linux/netlink.h>
+
+#include <rte_string_fns.h>
+#include <rte_log.h>
+#include <rte_compat.h>
+#include <rte_dev.h>
+#include <rte_malloc.h>
+#include <rte_interrupts.h>
+#include <rte_alarm.h>
+#include <rte_bus.h>
+#include <rte_eal.h>
+#include <rte_spinlock.h>
+#include <rte_errno.h>
+
+#include "eal_private.h"
+
+static struct rte_intr_handle intr_handle = {.fd = -1 };
+static bool monitor_started;
+static bool hotplug_handle;
+
+#define EAL_UEV_MSG_LEN 4096
+#define EAL_UEV_MSG_ELEM_LEN 128
+
+/*
+ * Spinlock for device hot-unplug failure handling. Any code that accesses a
+ * bus or device during failure handling - e.g. handling SIGBUS on a bus or a
+ * memory failure on a device - must take this lock. It protects the bus and
+ * device structures from race conditions.
+ */
+static rte_spinlock_t failure_handle_lock = RTE_SPINLOCK_INITIALIZER;
+
+static struct sigaction sigbus_action_old;
+
+static int sigbus_need_recover;
+
+static void dev_uev_handler(__rte_unused void *param);
+
+/* identify the system layer which reports this event. */
+enum eal_dev_event_subsystem {
+       EAL_DEV_EVENT_SUBSYSTEM_PCI, /* PCI bus device event */
+       EAL_DEV_EVENT_SUBSYSTEM_UIO, /* UIO driver device event */
+       EAL_DEV_EVENT_SUBSYSTEM_VFIO, /* VFIO driver device event */
+       EAL_DEV_EVENT_SUBSYSTEM_MAX
+};
+
+static void
+sigbus_action_recover(void)
+{
+       if (sigbus_need_recover) {
+               sigaction(SIGBUS, &sigbus_action_old, NULL);
+               sigbus_need_recover = 0;
+       }
+}
+
+static void sigbus_handler(int signum, siginfo_t *info,
+                               void *ctx __rte_unused)
+{
+       int ret;
+
+       RTE_LOG(DEBUG, EAL, "Thread[%d] caught SIGBUS, fault address: %p\n",
+               (int)pthread_self(), info->si_addr);
+
+       rte_spinlock_lock(&failure_handle_lock);
+       ret = rte_bus_sigbus_handler(info->si_addr);
+       rte_spinlock_unlock(&failure_handle_lock);
+       if (ret == -1) {
+               rte_exit(EXIT_FAILURE,
+                        "Failed to handle SIGBUS for hot-unplug "
+                        "(rte_errno: %s)!", strerror(rte_errno));
+       } else if (ret == 1) {
+               if (sigbus_action_old.sa_flags == SA_SIGINFO
+                   && sigbus_action_old.sa_sigaction) {
+                       (*(sigbus_action_old.sa_sigaction))(signum,
+                                                           info, ctx);
+               } else if (sigbus_action_old.sa_flags != SA_SIGINFO
+                          && sigbus_action_old.sa_handler) {
+                       (*(sigbus_action_old.sa_handler))(signum);
+               } else {
+                       rte_exit(EXIT_FAILURE,
+                                "Failed to handle generic SIGBUS!");
+               }
+       }
+
+       RTE_LOG(DEBUG, EAL, "Successfully handled SIGBUS for hot-unplug!\n");
+}
+
+static int cmp_dev_name(const struct rte_device *dev,
+       const void *_name)
+{
+       const char *name = _name;
+
+       return strcmp(dev->name, name);
+}
+
+static int
+dev_uev_socket_fd_create(void)
+{
+       struct sockaddr_nl addr;
+       int ret;
+
+       intr_handle.fd = socket(PF_NETLINK, SOCK_RAW | SOCK_CLOEXEC |
+                       SOCK_NONBLOCK,
+                       NETLINK_KOBJECT_UEVENT);
+       if (intr_handle.fd < 0) {
+               RTE_LOG(ERR, EAL, "Failed to create uevent fd.\n");
+               return -1;
+       }
+
+       memset(&addr, 0, sizeof(addr));
+       addr.nl_family = AF_NETLINK;
+       addr.nl_pid = 0;
+       addr.nl_groups = 0xffffffff;
+
+       ret = bind(intr_handle.fd, (struct sockaddr *) &addr, sizeof(addr));
+       if (ret < 0) {
+               RTE_LOG(ERR, EAL, "Failed to bind uevent socket.\n");
+               goto err;
+       }
+
+       return 0;
+err:
+       close(intr_handle.fd);
+       intr_handle.fd = -1;
+       return ret;
+}
+
+static int
+dev_uev_parse(const char *buf, struct rte_dev_event *event, int length)
+{
+       char action[EAL_UEV_MSG_ELEM_LEN];
+       char subsystem[EAL_UEV_MSG_ELEM_LEN];
+       char pci_slot_name[EAL_UEV_MSG_ELEM_LEN];
+       int i = 0;
+
+       memset(action, 0, EAL_UEV_MSG_ELEM_LEN);
+       memset(subsystem, 0, EAL_UEV_MSG_ELEM_LEN);
+       memset(pci_slot_name, 0, EAL_UEV_MSG_ELEM_LEN);
+
+       while (i < length) {
+               for (; i < length; i++) {
+                       if (*buf)
+                               break;
+                       buf++;
+               }
+               /**
+                * check device uevent from kernel side, no need to check
+                * uevent from udev.
+                */
+               if (!strncmp(buf, "libudev", 7)) {
+                       buf += 7;
+                       i += 7;
+                       return -1;
+               }
+               if (!strncmp(buf, "ACTION=", 7)) {
+                       buf += 7;
+                       i += 7;
+                       strlcpy(action, buf, sizeof(action));
+               } else if (!strncmp(buf, "SUBSYSTEM=", 10)) {
+                       buf += 10;
+                       i += 10;
+                       strlcpy(subsystem, buf, sizeof(subsystem));
+               } else if (!strncmp(buf, "PCI_SLOT_NAME=", 14)) {
+                       buf += 14;
+                       i += 14;
+                       strlcpy(pci_slot_name, buf, sizeof(pci_slot_name));
+                       event->devname = strdup(pci_slot_name);
+               }
+               for (; i < length; i++) {
+                       if (*buf == '\0')
+                               break;
+                       buf++;
+               }
+       }
+
+       /* parse the subsystem layer */
+       if (!strncmp(subsystem, "uio", 3))
+               event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_UIO;
+       else if (!strncmp(subsystem, "pci", 3))
+               event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_PCI;
+       else if (!strncmp(subsystem, "vfio", 4))
+               event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_VFIO;
+       else
+               return -1;
+
+       /* parse the action type */
+       if (!strncmp(action, "add", 3))
+               event->type = RTE_DEV_EVENT_ADD;
+       else if (!strncmp(action, "remove", 6))
+               event->type = RTE_DEV_EVENT_REMOVE;
+       else
+               return -1;
+       return 0;
+}
+
+static void
+dev_delayed_unregister(void *param)
+{
+       rte_intr_callback_unregister(&intr_handle, dev_uev_handler, param);
+       close(intr_handle.fd);
+       intr_handle.fd = -1;
+}
+
+static void
+dev_uev_handler(__rte_unused void *param)
+{
+       struct rte_dev_event uevent;
+       int ret;
+       char buf[EAL_UEV_MSG_LEN];
+       struct rte_bus *bus;
+       struct rte_device *dev;
+       const char *busname = "";
+
+       memset(&uevent, 0, sizeof(struct rte_dev_event));
+       memset(buf, 0, EAL_UEV_MSG_LEN);
+
+       ret = recv(intr_handle.fd, buf, EAL_UEV_MSG_LEN, MSG_DONTWAIT);
+       if (ret < 0 && errno == EAGAIN)
+               return;
+       else if (ret <= 0) {
+               /* connection is closed or broken and cannot be brought up again. */
+               RTE_LOG(ERR, EAL, "uevent socket connection is broken.\n");
+               rte_eal_alarm_set(1, dev_delayed_unregister, NULL);
+               return;
+       }
+
+       ret = dev_uev_parse(buf, &uevent, EAL_UEV_MSG_LEN);
+       if (ret < 0) {
+               RTE_LOG(DEBUG, EAL, "Not a valid event "
+                       "that needs to be handled.\n");
+               return;
+       }
+
+       RTE_LOG(DEBUG, EAL, "receive uevent(name:%s, type:%d, subsystem:%d)\n",
+               uevent.devname, uevent.type, uevent.subsystem);
+
+       switch (uevent.subsystem) {
+       case EAL_DEV_EVENT_SUBSYSTEM_PCI:
+       case EAL_DEV_EVENT_SUBSYSTEM_UIO:
+               busname = "pci";
+               break;
+       default:
+               break;
+       }
+
+       if (uevent.devname) {
+               if (uevent.type == RTE_DEV_EVENT_REMOVE && hotplug_handle) {
+                       rte_spinlock_lock(&failure_handle_lock);
+                       bus = rte_bus_find_by_name(busname);
+                       if (bus == NULL) {
+                               RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n",
+                                       busname);
+                               goto failure_handle_err;
+                       }
+
+                       dev = bus->find_device(NULL, cmp_dev_name,
+                                              uevent.devname);
+                       if (dev == NULL) {
+                               RTE_LOG(ERR, EAL, "Cannot find device (%s) on "
+                                       "bus (%s)\n", uevent.devname, busname);
+                               goto failure_handle_err;
+                       }
+
+                       ret = bus->hot_unplug_handler(dev);
+                       if (ret) {
+                               RTE_LOG(ERR, EAL, "Cannot handle hot-unplug "
+                                       "for device (%s)\n", dev->name);
+                       }
+                       rte_spinlock_unlock(&failure_handle_lock);
+               }
+               rte_dev_event_callback_process(uevent.devname, uevent.type);
+       }
+
+       return;
+
+failure_handle_err:
+       rte_spinlock_unlock(&failure_handle_lock);
+}
+
+int __rte_experimental
+rte_dev_event_monitor_start(void)
+{
+       int ret;
+
+       if (monitor_started)
+               return 0;
+
+       ret = dev_uev_socket_fd_create();
+       if (ret) {
+               RTE_LOG(ERR, EAL, "Failed to create device event fd.\n");
+               return -1;
+       }
+
+       intr_handle.type = RTE_INTR_HANDLE_DEV_EVENT;
+       ret = rte_intr_callback_register(&intr_handle, dev_uev_handler, NULL);
+
+       if (ret) {
+               RTE_LOG(ERR, EAL, "Failed to register uevent callback.\n");
+               return -1;
+       }
+
+       monitor_started = true;
+
+       return 0;
+}
+
+int __rte_experimental
+rte_dev_event_monitor_stop(void)
+{
+       int ret;
+
+       if (!monitor_started)
+               return 0;
+
+       ret = rte_intr_callback_unregister(&intr_handle, dev_uev_handler,
+                                          (void *)-1);
+       if (ret < 0) {
+               RTE_LOG(ERR, EAL, "Failed to unregister uevent callback.\n");
+               return ret;
+       }
+
+       close(intr_handle.fd);
+       intr_handle.fd = -1;
+       monitor_started = false;
+
+       return 0;
+}
+
+int
+dev_sigbus_handler_register(void)
+{
+       sigset_t mask;
+       struct sigaction action;
+
+       rte_errno = 0;
+
+       if (sigbus_need_recover)
+               return 0;
+
+       sigemptyset(&mask);
+       sigaddset(&mask, SIGBUS);
+       action.sa_flags = SA_SIGINFO;
+       action.sa_mask = mask;
+       action.sa_sigaction = sigbus_handler;
+       sigbus_need_recover = !sigaction(SIGBUS, &action, &sigbus_action_old);
+
+       return rte_errno;
+}
+
+int
+dev_sigbus_handler_unregister(void)
+{
+       rte_errno = 0;
+
+       sigbus_action_recover();
+
+       return rte_errno;
+}
+
+int __rte_experimental
+rte_dev_hotplug_handle_enable(void)
+{
+       int ret = 0;
+
+       ret = dev_sigbus_handler_register();
+       if (ret < 0)
+               RTE_LOG(ERR, EAL,
+                       "Failed to register SIGBUS handler for devices.\n");
+
+       hotplug_handle = true;
+
+       return ret;
+}
+
+int __rte_experimental
+rte_dev_hotplug_handle_disable(void)
+{
+       int ret = 0;
+
+       ret = dev_sigbus_handler_unregister();
+       if (ret < 0)
+               RTE_LOG(ERR, EAL,
+                       "Failed to unregister SIGBUS handler for devices.\n");
+
+       hotplug_handle = false;
+
+       return ret;
+}
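
A minimal sketch of driving the monitor implemented above from application code; error handling is trimmed and the wrapper names are invented (assumes <rte_dev.h> and a successful rte_eal_init()):

    /* Illustrative only: enable hotplug handling, then start uevent monitoring. */
    static int
    start_device_monitoring(void)
    {
        if (rte_dev_hotplug_handle_enable() < 0)
            return -1;
        if (rte_dev_event_monitor_start() < 0) {
            rte_dev_hotplug_handle_disable();
            return -1;
        }
        return 0;
    }

    static void
    stop_device_monitoring(void)
    {
        rte_dev_event_monitor_stop();
        rte_dev_hotplug_handle_disable();
    }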
diff --git a/lib/librte_eal/linux/eal/eal_hugepage_info.c b/lib/librte_eal/linux/eal/eal_hugepage_info.c
new file mode 100644 (file)
index 0000000..0eab1cf
--- /dev/null
@@ -0,0 +1,526 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#include <string.h>
+#include <sys/types.h>
+#include <sys/file.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <fnmatch.h>
+#include <inttypes.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/mman.h>
+#include <sys/queue.h>
+#include <sys/stat.h>
+
+#include <linux/mman.h> /* for hugetlb-related flags */
+
+#include <rte_memory.h>
+#include <rte_eal.h>
+#include <rte_launch.h>
+#include <rte_per_lcore.h>
+#include <rte_lcore.h>
+#include <rte_debug.h>
+#include <rte_log.h>
+#include <rte_common.h>
+#include "rte_string_fns.h"
+#include "eal_internal_cfg.h"
+#include "eal_hugepages.h"
+#include "eal_filesystem.h"
+
+static const char sys_dir_path[] = "/sys/kernel/mm/hugepages";
+static const char sys_pages_numa_dir_path[] = "/sys/devices/system/node";
+
+/*
+ * Uses mmap to create a shared memory area for storage of data
+ * Used in this file to store the hugepage file map on disk
+ */
+static void *
+map_shared_memory(const char *filename, const size_t mem_size, int flags)
+{
+       void *retval;
+       int fd = open(filename, flags, 0666);
+       if (fd < 0)
+               return NULL;
+       if (ftruncate(fd, mem_size) < 0) {
+               close(fd);
+               return NULL;
+       }
+       retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE,
+                       MAP_SHARED, fd, 0);
+       close(fd);
+       return retval;
+}
+
+static void *
+open_shared_memory(const char *filename, const size_t mem_size)
+{
+       return map_shared_memory(filename, mem_size, O_RDWR);
+}
+
+static void *
+create_shared_memory(const char *filename, const size_t mem_size)
+{
+       return map_shared_memory(filename, mem_size, O_RDWR | O_CREAT);
+}
+
+/* this function is only called from eal_hugepage_info_init which itself
+ * is only called from a primary process */
+static uint32_t
+get_num_hugepages(const char *subdir)
+{
+       char path[PATH_MAX];
+       long unsigned resv_pages, num_pages = 0;
+       const char *nr_hp_file = "free_hugepages";
+       const char *nr_rsvd_file = "resv_hugepages";
+
+       /* first, check how many reserved pages kernel reports */
+       snprintf(path, sizeof(path), "%s/%s/%s",
+                       sys_dir_path, subdir, nr_rsvd_file);
+       if (eal_parse_sysfs_value(path, &resv_pages) < 0)
+               return 0;
+
+       snprintf(path, sizeof(path), "%s/%s/%s",
+                       sys_dir_path, subdir, nr_hp_file);
+       if (eal_parse_sysfs_value(path, &num_pages) < 0)
+               return 0;
+
+       if (num_pages == 0)
+               RTE_LOG(WARNING, EAL, "No free hugepages reported in %s\n",
+                               subdir);
+
+       /* adjust num_pages */
+       if (num_pages >= resv_pages)
+               num_pages -= resv_pages;
+       else if (resv_pages)
+               num_pages = 0;
+
+       /* we want to return a uint32_t and more than this looks suspicious
+        * anyway ... */
+       if (num_pages > UINT32_MAX)
+               num_pages = UINT32_MAX;
+
+       return num_pages;
+}
+
+static uint32_t
+get_num_hugepages_on_node(const char *subdir, unsigned int socket)
+{
+       char path[PATH_MAX], socketpath[PATH_MAX];
+       DIR *socketdir;
+       unsigned long num_pages = 0;
+       const char *nr_hp_file = "free_hugepages";
+
+       snprintf(socketpath, sizeof(socketpath), "%s/node%u/hugepages",
+               sys_pages_numa_dir_path, socket);
+
+       socketdir = opendir(socketpath);
+       if (socketdir) {
+               /* Keep calm and carry on */
+               closedir(socketdir);
+       } else {
+               /* Can't find socket dir, so ignore it */
+               return 0;
+       }
+
+       snprintf(path, sizeof(path), "%s/%s/%s",
+                       socketpath, subdir, nr_hp_file);
+       if (eal_parse_sysfs_value(path, &num_pages) < 0)
+               return 0;
+
+       if (num_pages == 0)
+               RTE_LOG(WARNING, EAL, "No free hugepages reported in %s\n",
+                               subdir);
+
+       /*
+        * we want to return a uint32_t and more than this looks suspicious
+        * anyway ...
+        */
+       if (num_pages > UINT32_MAX)
+               num_pages = UINT32_MAX;
+
+       return num_pages;
+}
+
+static uint64_t
+get_default_hp_size(void)
+{
+       const char proc_meminfo[] = "/proc/meminfo";
+       const char str_hugepagesz[] = "Hugepagesize:";
+       unsigned hugepagesz_len = sizeof(str_hugepagesz) - 1;
+       char buffer[256];
+       unsigned long long size = 0;
+
+       FILE *fd = fopen(proc_meminfo, "r");
+       if (fd == NULL)
+               rte_panic("Cannot open %s\n", proc_meminfo);
+       while(fgets(buffer, sizeof(buffer), fd)){
+               if (strncmp(buffer, str_hugepagesz, hugepagesz_len) == 0){
+                       size = rte_str_to_size(&buffer[hugepagesz_len]);
+                       break;
+               }
+       }
+       fclose(fd);
+       if (size == 0)
+               rte_panic("Cannot get default hugepage size from %s\n", proc_meminfo);
+       return size;
+}
+
+static int
+get_hugepage_dir(uint64_t hugepage_sz, char *hugedir, int len)
+{
+       enum proc_mount_fieldnames {
+               DEVICE = 0,
+               MOUNTPT,
+               FSTYPE,
+               OPTIONS,
+               _FIELDNAME_MAX
+       };
+       static uint64_t default_size = 0;
+       const char proc_mounts[] = "/proc/mounts";
+       const char hugetlbfs_str[] = "hugetlbfs";
+       const size_t htlbfs_str_len = sizeof(hugetlbfs_str) - 1;
+       const char pagesize_opt[] = "pagesize=";
+       const size_t pagesize_opt_len = sizeof(pagesize_opt) - 1;
+       const char split_tok = ' ';
+       char *splitstr[_FIELDNAME_MAX];
+       char buf[BUFSIZ];
+       int retval = -1;
+
+       FILE *fd = fopen(proc_mounts, "r");
+       if (fd == NULL)
+               rte_panic("Cannot open %s\n", proc_mounts);
+
+       if (default_size == 0)
+               default_size = get_default_hp_size();
+
+       while (fgets(buf, sizeof(buf), fd)){
+               if (rte_strsplit(buf, sizeof(buf), splitstr, _FIELDNAME_MAX,
+                               split_tok) != _FIELDNAME_MAX) {
+                       RTE_LOG(ERR, EAL, "Error parsing %s\n", proc_mounts);
+                       break; /* return NULL */
+               }
+
+               /* if --huge-dir was specified, only examine that directory */
+               if (internal_config.hugepage_dir != NULL &&
+                               strcmp(splitstr[MOUNTPT], internal_config.hugepage_dir) != 0)
+                       continue;
+
+               if (strncmp(splitstr[FSTYPE], hugetlbfs_str, htlbfs_str_len) == 0){
+                       const char *pagesz_str = strstr(splitstr[OPTIONS], pagesize_opt);
+
+                       /* if no explicit page size, the default page size is compared */
+                       if (pagesz_str == NULL){
+                               if (hugepage_sz == default_size){
+                                       strlcpy(hugedir, splitstr[MOUNTPT], len);
+                                       retval = 0;
+                                       break;
+                               }
+                       }
+                       /* there is an explicit page size, so check it */
+                       else {
+                               uint64_t pagesz = rte_str_to_size(&pagesz_str[pagesize_opt_len]);
+                               if (pagesz == hugepage_sz) {
+                                       strlcpy(hugedir, splitstr[MOUNTPT], len);
+                                       retval = 0;
+                                       break;
+                               }
+                       }
+               } /* end if strncmp hugetlbfs */
+       } /* end while fgets */
+
+       fclose(fd);
+       return retval;
+}
+
+/*
+ * Clear the hugepage directory of any hugepage files it contains,
+ * skipping files that are locked (i.e. in use by another DPDK
+ * process).
+ */
+static int
+clear_hugedir(const char * hugedir)
+{
+       DIR *dir;
+       struct dirent *dirent;
+       int dir_fd, fd, lck_result;
+       const char filter[] = "*map_*"; /* matches hugepage files */
+
+       /* open directory */
+       dir = opendir(hugedir);
+       if (!dir) {
+               RTE_LOG(ERR, EAL, "Unable to open hugepage directory %s\n",
+                               hugedir);
+               goto error;
+       }
+       dir_fd = dirfd(dir);
+
+       dirent = readdir(dir);
+       if (!dirent) {
+               RTE_LOG(ERR, EAL, "Unable to read hugepage directory %s\n",
+                               hugedir);
+               goto error;
+       }
+
+       while(dirent != NULL){
+               /* skip files that don't match the hugepage pattern */
+               if (fnmatch(filter, dirent->d_name, 0) > 0) {
+                       dirent = readdir(dir);
+                       continue;
+               }
+
+               /* try and lock the file */
+               fd = openat(dir_fd, dirent->d_name, O_RDONLY);
+
+               /* skip to next file */
+               if (fd == -1) {
+                       dirent = readdir(dir);
+                       continue;
+               }
+
+               /* non-blocking lock */
+               lck_result = flock(fd, LOCK_EX | LOCK_NB);
+
+               /* if lock succeeds, remove the file */
+               if (lck_result != -1)
+                       unlinkat(dir_fd, dirent->d_name, 0);
+               close (fd);
+               dirent = readdir(dir);
+       }
+
+       closedir(dir);
+       return 0;
+
+error:
+       if (dir)
+               closedir(dir);
+
+       RTE_LOG(ERR, EAL, "Error while clearing hugepage dir: %s\n",
+               strerror(errno));
+
+       return -1;
+}
+
+static int
+compare_hpi(const void *a, const void *b)
+{
+       const struct hugepage_info *hpi_a = a;
+       const struct hugepage_info *hpi_b = b;
+
+       return hpi_b->hugepage_sz - hpi_a->hugepage_sz;
+}
+
+static void
+calc_num_pages(struct hugepage_info *hpi, struct dirent *dirent)
+{
+       uint64_t total_pages = 0;
+       unsigned int i;
+
+       /*
+        * first, try to put all hugepages into relevant sockets, but
+        * if the first attempt fails, fall back to collecting all pages
+        * in one socket and sorting them later
+        */
+       total_pages = 0;
+       /* we also don't want to do this for legacy init */
+       if (!internal_config.legacy_mem)
+               for (i = 0; i < rte_socket_count(); i++) {
+                       int socket = rte_socket_id_by_idx(i);
+                       unsigned int num_pages =
+                                       get_num_hugepages_on_node(
+                                               dirent->d_name, socket);
+                       hpi->num_pages[socket] = num_pages;
+                       total_pages += num_pages;
+               }
+       /*
+        * we failed to sort memory from the get-go, so fall
+        * back to the old way
+        */
+       if (total_pages == 0) {
+               hpi->num_pages[0] = get_num_hugepages(dirent->d_name);
+
+#ifndef RTE_ARCH_64
+               /* for 32-bit systems, limit number of hugepages to
+                * 1GB per page size */
+               hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0],
+                               RTE_PGSIZE_1G / hpi->hugepage_sz);
+#endif
+       }
+}
+
+static int
+hugepage_info_init(void)
+{
+       const char dirent_start_text[] = "hugepages-";
+       const size_t dirent_start_len = sizeof(dirent_start_text) - 1;
+       unsigned int i, num_sizes = 0;
+       DIR *dir;
+       struct dirent *dirent;
+
+       dir = opendir(sys_dir_path);
+       if (dir == NULL) {
+               RTE_LOG(ERR, EAL,
+                       "Cannot open directory %s to read system hugepage info\n",
+                       sys_dir_path);
+               return -1;
+       }
+
+       for (dirent = readdir(dir); dirent != NULL; dirent = readdir(dir)) {
+               struct hugepage_info *hpi;
+
+               if (strncmp(dirent->d_name, dirent_start_text,
+                           dirent_start_len) != 0)
+                       continue;
+
+               if (num_sizes >= MAX_HUGEPAGE_SIZES)
+                       break;
+
+               hpi = &internal_config.hugepage_info[num_sizes];
+               hpi->hugepage_sz =
+                       rte_str_to_size(&dirent->d_name[dirent_start_len]);
+
+               /* first, check if we have a mountpoint */
+               if (get_hugepage_dir(hpi->hugepage_sz,
+                       hpi->hugedir, sizeof(hpi->hugedir)) < 0) {
+                       uint32_t num_pages;
+
+                       num_pages = get_num_hugepages(dirent->d_name);
+                       if (num_pages > 0)
+                               RTE_LOG(NOTICE, EAL,
+                                       "%" PRIu32 " hugepages of size "
+                                       "%" PRIu64 " reserved, but no mounted "
+                                       "hugetlbfs found for that size\n",
+                                       num_pages, hpi->hugepage_sz);
+                       /* if we have kernel support for reserving hugepages
+                        * through mmap, and we're in in-memory mode, treat this
+                        * page size as valid. we cannot be in legacy mode at
+                        * this point because we've checked this earlier in the
+                        * init process.
+                        */
+#ifdef MAP_HUGE_SHIFT
+                       if (internal_config.in_memory) {
+                               RTE_LOG(DEBUG, EAL, "In-memory mode enabled, "
+                                       "hugepages of size %" PRIu64 " bytes "
+                                       "will be allocated anonymously\n",
+                                       hpi->hugepage_sz);
+                               calc_num_pages(hpi, dirent);
+                               num_sizes++;
+                       }
+#endif
+                       continue;
+               }
+
+               /* take an exclusive lock on the hugepage directory */
+               hpi->lock_descriptor = open(hpi->hugedir, O_RDONLY);
+
+               /* if blocking lock failed */
+               if (flock(hpi->lock_descriptor, LOCK_EX) == -1) {
+                       RTE_LOG(CRIT, EAL,
+                               "Failed to lock hugepage directory!\n");
+                       break;
+               }
+               /* clear out the hugepages dir from unused pages */
+               if (clear_hugedir(hpi->hugedir) == -1)
+                       break;
+
+               calc_num_pages(hpi, dirent);
+
+               num_sizes++;
+       }
+       closedir(dir);
+
+       /* something went wrong, and we broke from the for loop above */
+       if (dirent != NULL)
+               return -1;
+
+       internal_config.num_hugepage_sizes = num_sizes;
+
+       /* sort the page directory entries by size, largest to smallest */
+       qsort(&internal_config.hugepage_info[0], num_sizes,
+             sizeof(internal_config.hugepage_info[0]), compare_hpi);
+
+       /* now we have all info, check we have at least one valid size */
+       for (i = 0; i < num_sizes; i++) {
+               /* pages may no longer all be on socket 0, so check all */
+               unsigned int j, num_pages = 0;
+               struct hugepage_info *hpi = &internal_config.hugepage_info[i];
+
+               for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
+                       num_pages += hpi->num_pages[j];
+               if (num_pages > 0)
+                       return 0;
+       }
+
+       /* no valid hugepage mounts available, return error */
+       return -1;
+}
+
+/*
+ * When we initialize the hugepage info, everything goes
+ * to socket 0 by default. It will later get sorted by the memory
+ * initialization procedure.
+ */
+int
+eal_hugepage_info_init(void)
+{
+       struct hugepage_info *hpi, *tmp_hpi;
+       unsigned int i;
+
+       if (hugepage_info_init() < 0)
+               return -1;
+
+       /* for no shared files mode, we're done */
+       if (internal_config.no_shconf)
+               return 0;
+
+       hpi = &internal_config.hugepage_info[0];
+
+       tmp_hpi = create_shared_memory(eal_hugepage_info_path(),
+                       sizeof(internal_config.hugepage_info));
+       if (tmp_hpi == NULL) {
+               RTE_LOG(ERR, EAL, "Failed to create shared memory!\n");
+               return -1;
+       }
+
+       memcpy(tmp_hpi, hpi, sizeof(internal_config.hugepage_info));
+
+       /* we've copied file descriptors along with everything else, but they
+        * will be invalid in secondary process, so overwrite them
+        */
+       for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) {
+               struct hugepage_info *tmp = &tmp_hpi[i];
+               tmp->lock_descriptor = -1;
+       }
+
+       if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) {
+               RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n");
+               return -1;
+       }
+       return 0;
+}
+
+int eal_hugepage_info_read(void)
+{
+       struct hugepage_info *hpi = &internal_config.hugepage_info[0];
+       struct hugepage_info *tmp_hpi;
+
+       tmp_hpi = open_shared_memory(eal_hugepage_info_path(),
+                                 sizeof(internal_config.hugepage_info));
+       if (tmp_hpi == NULL) {
+               RTE_LOG(ERR, EAL, "Failed to open shared memory!\n");
+               return -1;
+       }
+
+       memcpy(hpi, tmp_hpi, sizeof(internal_config.hugepage_info));
+
+       if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) {
+               RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n");
+               return -1;
+       }
+       return 0;
+}
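
For context, get_num_hugepages() above reads counters from paths of the form /sys/kernel/mm/hugepages/hugepages-<size>kB/free_hugepages. A standalone sketch of the same read, independent of the EAL helpers (the "hugepages-2048kB" subdirectory name is just an example):

    /* Illustrative only: read the free hugepage count for one page size. */
    #include <stdio.h>

    static unsigned long
    read_free_hugepages(const char *subdir)    /* e.g. "hugepages-2048kB" */
    {
        char path[256];
        unsigned long val = 0;
        FILE *f;

        snprintf(path, sizeof(path),
                 "/sys/kernel/mm/hugepages/%s/free_hugepages", subdir);
        f = fopen(path, "r");
        if (f == NULL)
            return 0;
        if (fscanf(f, "%lu", &val) != 1)
            val = 0;
        fclose(f);
        return val;
    }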
diff --git a/lib/librte_eal/linux/eal/eal_interrupts.c b/lib/librte_eal/linux/eal/eal_interrupts.c
new file mode 100644 (file)
index 0000000..cbac451
--- /dev/null
@@ -0,0 +1,1326 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <pthread.h>
+#include <sys/queue.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <sys/epoll.h>
+#include <sys/signalfd.h>
+#include <sys/ioctl.h>
+#include <sys/eventfd.h>
+#include <assert.h>
+#include <stdbool.h>
+
+#include <rte_common.h>
+#include <rte_interrupts.h>
+#include <rte_memory.h>
+#include <rte_launch.h>
+#include <rte_eal.h>
+#include <rte_per_lcore.h>
+#include <rte_lcore.h>
+#include <rte_atomic.h>
+#include <rte_branch_prediction.h>
+#include <rte_debug.h>
+#include <rte_log.h>
+#include <rte_errno.h>
+#include <rte_spinlock.h>
+#include <rte_pause.h>
+#include <rte_vfio.h>
+
+#include "eal_private.h"
+#include "eal_vfio.h"
+#include "eal_thread.h"
+
+#define EAL_INTR_EPOLL_WAIT_FOREVER (-1)
+#define NB_OTHER_INTR               1
+
+static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */
+
+/**
+ * union for pipe fds.
+ */
+union intr_pipefds{
+       struct {
+               int pipefd[2];
+       };
+       struct {
+               int readfd;
+               int writefd;
+       };
+};
+
+/**
+ * union buffer for reading on different devices
+ */
+union rte_intr_read_buffer {
+       int uio_intr_count;              /* for uio device */
+#ifdef VFIO_PRESENT
+       uint64_t vfio_intr_count;        /* for vfio device */
+#endif
+       uint64_t timerfd_num;            /* for timerfd */
+       char charbuf[16];                /* for others */
+};
+
+TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback);
+TAILQ_HEAD(rte_intr_source_list, rte_intr_source);
+
+struct rte_intr_callback {
+       TAILQ_ENTRY(rte_intr_callback) next;
+       rte_intr_callback_fn cb_fn;  /**< callback address */
+       void *cb_arg;                /**< parameter for callback */
+};
+
+struct rte_intr_source {
+       TAILQ_ENTRY(rte_intr_source) next;
+       struct rte_intr_handle intr_handle; /**< interrupt handle */
+       struct rte_intr_cb_list callbacks;  /**< user callbacks */
+       uint32_t active;
+};
+
+/* global spinlock for interrupt data operation */
+static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER;
+
+/* union buffer for pipe read/write */
+static union intr_pipefds intr_pipe;
+
+/* interrupt sources list */
+static struct rte_intr_source_list intr_sources;
+
+/* interrupt handling thread */
+static pthread_t intr_thread;
+
+/* VFIO interrupts */
+#ifdef VFIO_PRESENT
+
+#define IRQ_SET_BUF_LEN  (sizeof(struct vfio_irq_set) + sizeof(int))
+/* irq set buffer length for queue interrupts and LSC interrupt */
+#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
+                             sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1))
+
+/* enable legacy (INTx) interrupts */
+static int
+vfio_enable_intx(const struct rte_intr_handle *intr_handle) {
+       struct vfio_irq_set *irq_set;
+       char irq_set_buf[IRQ_SET_BUF_LEN];
+       int len, ret;
+       int *fd_ptr;
+
+       len = sizeof(irq_set_buf);
+
+       /* enable INTx */
+       irq_set = (struct vfio_irq_set *) irq_set_buf;
+       irq_set->argsz = len;
+       irq_set->count = 1;
+       irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
+       irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
+       irq_set->start = 0;
+       fd_ptr = (int *) &irq_set->data;
+       *fd_ptr = intr_handle->fd;
+
+       ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+       if (ret) {
+               RTE_LOG(ERR, EAL, "Error enabling INTx interrupts for fd %d\n",
+                                               intr_handle->fd);
+               return -1;
+       }
+
+       /* unmask INTx after enabling */
+       memset(irq_set, 0, len);
+       len = sizeof(struct vfio_irq_set);
+       irq_set->argsz = len;
+       irq_set->count = 1;
+       irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
+       irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
+       irq_set->start = 0;
+
+       ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+       if (ret) {
+               RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
+                                               intr_handle->fd);
+               return -1;
+       }
+       return 0;
+}
+
+/* disable legacy (INTx) interrupts */
+static int
+vfio_disable_intx(const struct rte_intr_handle *intr_handle) {
+       struct vfio_irq_set *irq_set;
+       char irq_set_buf[IRQ_SET_BUF_LEN];
+       int len, ret;
+
+       len = sizeof(struct vfio_irq_set);
+
+       /* mask interrupts before disabling */
+       irq_set = (struct vfio_irq_set *) irq_set_buf;
+       irq_set->argsz = len;
+       irq_set->count = 1;
+       irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
+       irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
+       irq_set->start = 0;
+
+       ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+       if (ret) {
+               RTE_LOG(ERR, EAL, "Error masking INTx interrupts for fd %d\n",
+                                               intr_handle->fd);
+               return -1;
+       }
+
+       /* disable INTx */
+       memset(irq_set, 0, len);
+       irq_set->argsz = len;
+       irq_set->count = 0;
+       irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
+       irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
+       irq_set->start = 0;
+
+       ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+       if (ret) {
+               RTE_LOG(ERR, EAL,
+                       "Error disabling INTx interrupts for fd %d\n", intr_handle->fd);
+               return -1;
+       }
+       return 0;
+}
+
+/* enable MSI interrupts */
+static int
+vfio_enable_msi(const struct rte_intr_handle *intr_handle) {
+       int len, ret;
+       char irq_set_buf[IRQ_SET_BUF_LEN];
+       struct vfio_irq_set *irq_set;
+       int *fd_ptr;
+
+       len = sizeof(irq_set_buf);
+
+       irq_set = (struct vfio_irq_set *) irq_set_buf;
+       irq_set->argsz = len;
+       irq_set->count = 1;
+       irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
+       irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
+       irq_set->start = 0;
+       fd_ptr = (int *) &irq_set->data;
+       *fd_ptr = intr_handle->fd;
+
+       ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+       if (ret) {
+               RTE_LOG(ERR, EAL, "Error enabling MSI interrupts for fd %d\n",
+                                               intr_handle->fd);
+               return -1;
+       }
+       return 0;
+}
+
+/* disable MSI interrupts */
+static int
+vfio_disable_msi(const struct rte_intr_handle *intr_handle) {
+       struct vfio_irq_set *irq_set;
+       char irq_set_buf[IRQ_SET_BUF_LEN];
+       int len, ret;
+
+       len = sizeof(struct vfio_irq_set);
+
+       irq_set = (struct vfio_irq_set *) irq_set_buf;
+       irq_set->argsz = len;
+       irq_set->count = 0;
+       irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
+       irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
+       irq_set->start = 0;
+
+       ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+       if (ret)
+               RTE_LOG(ERR, EAL,
+                       "Error disabling MSI interrupts for fd %d\n", intr_handle->fd);
+
+       return ret;
+}
+
+/* enable MSI-X interrupts */
+static int
+vfio_enable_msix(const struct rte_intr_handle *intr_handle) {
+       int len, ret;
+       char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
+       struct vfio_irq_set *irq_set;
+       int *fd_ptr;
+
+       len = sizeof(irq_set_buf);
+
+       irq_set = (struct vfio_irq_set *) irq_set_buf;
+       irq_set->argsz = len;
+       /* 0 < irq_set->count < RTE_MAX_RXTX_INTR_VEC_ID + 1 */
+       irq_set->count = intr_handle->max_intr ?
+               (intr_handle->max_intr > RTE_MAX_RXTX_INTR_VEC_ID + 1 ?
+               RTE_MAX_RXTX_INTR_VEC_ID + 1 : intr_handle->max_intr) : 1;
+       irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
+       irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
+       irq_set->start = 0;
+       fd_ptr = (int *) &irq_set->data;
+       /* INTR vector offset 0 is reserved for non-efd mapping */
+       fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = intr_handle->fd;
+       memcpy(&fd_ptr[RTE_INTR_VEC_RXTX_OFFSET], intr_handle->efds,
+               sizeof(*intr_handle->efds) * intr_handle->nb_efd);
+
+       ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+       if (ret) {
+               RTE_LOG(ERR, EAL, "Error enabling MSI-X interrupts for fd %d\n",
+                                               intr_handle->fd);
+               return -1;
+       }
+
+       return 0;
+}
+
+/* disable MSI-X interrupts */
+static int
+vfio_disable_msix(const struct rte_intr_handle *intr_handle) {
+       struct vfio_irq_set *irq_set;
+       char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
+       int len, ret;
+
+       len = sizeof(struct vfio_irq_set);
+
+       irq_set = (struct vfio_irq_set *) irq_set_buf;
+       irq_set->argsz = len;
+       irq_set->count = 0;
+       irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
+       irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
+       irq_set->start = 0;
+
+       ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+       if (ret)
+               RTE_LOG(ERR, EAL,
+                       "Error disabling MSI-X interrupts for fd %d\n", intr_handle->fd);
+
+       return ret;
+}
+
+#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
+/* enable req notifier */
+static int
+vfio_enable_req(const struct rte_intr_handle *intr_handle)
+{
+       int len, ret;
+       char irq_set_buf[IRQ_SET_BUF_LEN];
+       struct vfio_irq_set *irq_set;
+       int *fd_ptr;
+
+       len = sizeof(irq_set_buf);
+
+       irq_set = (struct vfio_irq_set *) irq_set_buf;
+       irq_set->argsz = len;
+       irq_set->count = 1;
+       irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
+                        VFIO_IRQ_SET_ACTION_TRIGGER;
+       irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
+       irq_set->start = 0;
+       fd_ptr = (int *) &irq_set->data;
+       *fd_ptr = intr_handle->fd;
+
+       ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+       if (ret) {
+               RTE_LOG(ERR, EAL, "Error enabling req interrupts for fd %d\n",
+                                               intr_handle->fd);
+               return -1;
+       }
+
+       return 0;
+}
+
+/* disable req notifier */
+static int
+vfio_disable_req(const struct rte_intr_handle *intr_handle)
+{
+       struct vfio_irq_set *irq_set;
+       char irq_set_buf[IRQ_SET_BUF_LEN];
+       int len, ret;
+
+       len = sizeof(struct vfio_irq_set);
+
+       irq_set = (struct vfio_irq_set *) irq_set_buf;
+       irq_set->argsz = len;
+       irq_set->count = 0;
+       irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
+       irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
+       irq_set->start = 0;
+
+       ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+       if (ret)
+               RTE_LOG(ERR, EAL, "Error disabling req interrupts for fd %d\n",
+                       intr_handle->fd);
+
+       return ret;
+}
+#endif
+#endif
+
+static int
+uio_intx_intr_disable(const struct rte_intr_handle *intr_handle)
+{
+       unsigned char command_high;
+
+       /* use UIO config file descriptor for uio_pci_generic */
+       if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
+               RTE_LOG(ERR, EAL,
+                       "Error reading interrupts status for fd %d\n",
+                       intr_handle->uio_cfg_fd);
+               return -1;
+       }
+       /* disable interrupts: set the INTx Disable bit (bit 2 of the
+        * command register's high byte)
+        */
+       command_high |= 0x4;
+       if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
+               RTE_LOG(ERR, EAL,
+                       "Error disabling interrupts for fd %d\n",
+                       intr_handle->uio_cfg_fd);
+               return -1;
+       }
+
+       return 0;
+}
+
+static int
+uio_intx_intr_enable(const struct rte_intr_handle *intr_handle)
+{
+       unsigned char command_high;
+
+       /* use UIO config file descriptor for uio_pci_generic */
+       if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
+               RTE_LOG(ERR, EAL,
+                       "Error reading interrupts status for fd %d\n",
+                       intr_handle->uio_cfg_fd);
+               return -1;
+       }
+       /* enable interrupts: clear the INTx Disable bit (bit 2 of the
+        * command register's high byte)
+        */
+       command_high &= ~0x4;
+       if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
+               RTE_LOG(ERR, EAL,
+                       "Error enabling interrupts for fd %d\n",
+                       intr_handle->uio_cfg_fd);
+               return -1;
+       }
+
+       return 0;
+}
+
+static int
+uio_intr_disable(const struct rte_intr_handle *intr_handle)
+{
+       const int value = 0;
+
+       if (write(intr_handle->fd, &value, sizeof(value)) < 0) {
+               RTE_LOG(ERR, EAL,
+                       "Error disabling interrupts for fd %d (%s)\n",
+                       intr_handle->fd, strerror(errno));
+               return -1;
+       }
+       return 0;
+}
+
+static int
+uio_intr_enable(const struct rte_intr_handle *intr_handle)
+{
+       const int value = 1;
+
+       if (write(intr_handle->fd, &value, sizeof(value)) < 0) {
+               RTE_LOG(ERR, EAL,
+                       "Error enabling interrupts for fd %d (%s)\n",
+                       intr_handle->fd, strerror(errno));
+               return -1;
+       }
+       return 0;
+}
+
+int
+rte_intr_callback_register(const struct rte_intr_handle *intr_handle,
+                       rte_intr_callback_fn cb, void *cb_arg)
+{
+       int ret, wake_thread;
+       struct rte_intr_source *src;
+       struct rte_intr_callback *callback;
+
+       wake_thread = 0;
+
+       /* first do parameter checking */
+       if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) {
+               RTE_LOG(ERR, EAL,
+                       "Registering with invalid input parameter\n");
+               return -EINVAL;
+       }
+
+       /* allocate a new interrupt callback entity */
+       callback = calloc(1, sizeof(*callback));
+       if (callback == NULL) {
+               RTE_LOG(ERR, EAL, "Can not allocate memory\n");
+               return -ENOMEM;
+       }
+       callback->cb_fn = cb;
+       callback->cb_arg = cb_arg;
+
+       rte_spinlock_lock(&intr_lock);
+
+       /* check if there is at least one callback registered for the fd */
+       TAILQ_FOREACH(src, &intr_sources, next) {
+               if (src->intr_handle.fd == intr_handle->fd) {
+                       /* there were no callbacks for this fd until now */
+                       if (TAILQ_EMPTY(&src->callbacks))
+                               wake_thread = 1;
+
+                       TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
+                       ret = 0;
+                       break;
+               }
+       }
+
+       /* no existing callbacks for this - add new source */
+       if (src == NULL) {
+               src = calloc(1, sizeof(*src));
+               if (src == NULL) {
+                       RTE_LOG(ERR, EAL, "Can not allocate memory\n");
+                       free(callback);
+                       ret = -ENOMEM;
+               } else {
+                       src->intr_handle = *intr_handle;
+                       TAILQ_INIT(&src->callbacks);
+                       TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
+                       TAILQ_INSERT_TAIL(&intr_sources, src, next);
+                       wake_thread = 1;
+                       ret = 0;
+               }
+       }
+
+       rte_spinlock_unlock(&intr_lock);
+
+       /**
+        * check whether we need to notify, via the pipe fd that epoll_wait
+        * waits on, that the wait list must be rebuilt.
+        */
+       if (wake_thread)
+               if (write(intr_pipe.writefd, "1", 1) < 0)
+                       return -EPIPE;
+
+       return ret;
+}
+
+int
+rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle,
+                       rte_intr_callback_fn cb_fn, void *cb_arg)
+{
+       int ret;
+       struct rte_intr_source *src;
+       struct rte_intr_callback *cb, *next;
+
+       /* do parameter checking first */
+       if (intr_handle == NULL || intr_handle->fd < 0) {
+               RTE_LOG(ERR, EAL,
+               "Unregistering with invalid input parameter\n");
+               return -EINVAL;
+       }
+
+       rte_spinlock_lock(&intr_lock);
+
+       /* check if an interrupt source exists for this fd */
+       TAILQ_FOREACH(src, &intr_sources, next)
+               if (src->intr_handle.fd == intr_handle->fd)
+                       break;
+
+       /* No interrupt source registered for the fd */
+       if (src == NULL) {
+               ret = -ENOENT;
+
+       /* interrupt source has some active callbacks right now. */
+       } else if (src->active != 0) {
+               ret = -EAGAIN;
+
+       /* ok to remove. */
+       } else {
+               ret = 0;
+
+               /* walk through the callbacks and remove all that match. */
+               for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
+
+                       next = TAILQ_NEXT(cb, next);
+
+                       if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
+                                       cb->cb_arg == cb_arg)) {
+                               TAILQ_REMOVE(&src->callbacks, cb, next);
+                               free(cb);
+                               ret++;
+                       }
+               }
+
+               /* all callbacks for that source are removed. */
+               if (TAILQ_EMPTY(&src->callbacks)) {
+                       TAILQ_REMOVE(&intr_sources, src, next);
+                       free(src);
+               }
+       }
+
+       rte_spinlock_unlock(&intr_lock);
+
+       /* notify the pipe fd waited by epoll_wait to rebuild the wait list */
+       if (ret >= 0 && write(intr_pipe.writefd, "1", 1) < 0) {
+               ret = -EPIPE;
+       }
+
+       return ret;
+}
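+
+/*
+ * Illustrative usage sketch (not part of the original file): a driver can
+ * attach a callback to its interrupt handle and later remove it again.
+ * The names "on_intr" and "priv" below are hypothetical.
+ *
+ *     static void
+ *     on_intr(void *cb_arg)
+ *     {
+ *             RTE_LOG(INFO, EAL, "interrupt fired, arg=%p\n", cb_arg);
+ *     }
+ *
+ *     // registering may wake the interrupt thread to rebuild its wait list
+ *     rte_intr_callback_register(intr_handle, on_intr, priv);
+ *
+ *     // cb_arg == (void *)-1 removes every callback registered with on_intr
+ *     rte_intr_callback_unregister(intr_handle, on_intr, (void *)-1);
+ */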
+
+int
+rte_intr_enable(const struct rte_intr_handle *intr_handle)
+{
+       if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV)
+               return 0;
+
+       if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
+               return -1;
+
+       switch (intr_handle->type){
+       /* write to the uio fd to enable the interrupt */
+       case RTE_INTR_HANDLE_UIO:
+               if (uio_intr_enable(intr_handle))
+                       return -1;
+               break;
+       case RTE_INTR_HANDLE_UIO_INTX:
+               if (uio_intx_intr_enable(intr_handle))
+                       return -1;
+               break;
+       /* not used at this moment */
+       case RTE_INTR_HANDLE_ALARM:
+               return -1;
+#ifdef VFIO_PRESENT
+       case RTE_INTR_HANDLE_VFIO_MSIX:
+               if (vfio_enable_msix(intr_handle))
+                       return -1;
+               break;
+       case RTE_INTR_HANDLE_VFIO_MSI:
+               if (vfio_enable_msi(intr_handle))
+                       return -1;
+               break;
+       case RTE_INTR_HANDLE_VFIO_LEGACY:
+               if (vfio_enable_intx(intr_handle))
+                       return -1;
+               break;
+#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
+       case RTE_INTR_HANDLE_VFIO_REQ:
+               if (vfio_enable_req(intr_handle))
+                       return -1;
+               break;
+#endif
+#endif
+       /* not used at this moment */
+       case RTE_INTR_HANDLE_DEV_EVENT:
+               return -1;
+       /* unknown handle type */
+       default:
+               RTE_LOG(ERR, EAL,
+                       "Unknown handle type of fd %d\n",
+                                       intr_handle->fd);
+               return -1;
+       }
+
+       return 0;
+}
+
+int
+rte_intr_disable(const struct rte_intr_handle *intr_handle)
+{
+       if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV)
+               return 0;
+
+       if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
+               return -1;
+
+       switch (intr_handle->type){
+       /* write to the uio fd to disable the interrupt */
+       case RTE_INTR_HANDLE_UIO:
+               if (uio_intr_disable(intr_handle))
+                       return -1;
+               break;
+       case RTE_INTR_HANDLE_UIO_INTX:
+               if (uio_intx_intr_disable(intr_handle))
+                       return -1;
+               break;
+       /* not used at this moment */
+       case RTE_INTR_HANDLE_ALARM:
+               return -1;
+#ifdef VFIO_PRESENT
+       case RTE_INTR_HANDLE_VFIO_MSIX:
+               if (vfio_disable_msix(intr_handle))
+                       return -1;
+               break;
+       case RTE_INTR_HANDLE_VFIO_MSI:
+               if (vfio_disable_msi(intr_handle))
+                       return -1;
+               break;
+       case RTE_INTR_HANDLE_VFIO_LEGACY:
+               if (vfio_disable_intx(intr_handle))
+                       return -1;
+               break;
+#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
+       case RTE_INTR_HANDLE_VFIO_REQ:
+               if (vfio_disable_req(intr_handle))
+                       return -1;
+               break;
+#endif
+#endif
+       /* not used at this moment */
+       case RTE_INTR_HANDLE_DEV_EVENT:
+               return -1;
+       /* unknown handle type */
+       default:
+               RTE_LOG(ERR, EAL,
+                       "Unknown handle type of fd %d\n",
+                                       intr_handle->fd);
+               return -1;
+       }
+
+       return 0;
+}
+
+static int
+eal_intr_process_interrupts(struct epoll_event *events, int nfds)
+{
+       bool call = false;
+       int n, bytes_read;
+       struct rte_intr_source *src;
+       struct rte_intr_callback *cb, *next;
+       union rte_intr_read_buffer buf;
+       struct rte_intr_callback active_cb;
+
+       for (n = 0; n < nfds; n++) {
+
+               /**
+                * if the pipe fd is ready to read, return so that the
+                * wait list can be rebuilt.
+                */
+               if (events[n].data.fd == intr_pipe.readfd){
+                       int r = read(intr_pipe.readfd, buf.charbuf,
+                                       sizeof(buf.charbuf));
+                       RTE_SET_USED(r);
+                       return -1;
+               }
+               rte_spinlock_lock(&intr_lock);
+               TAILQ_FOREACH(src, &intr_sources, next)
+                       if (src->intr_handle.fd ==
+                                       events[n].data.fd)
+                               break;
+               if (src == NULL){
+                       rte_spinlock_unlock(&intr_lock);
+                       continue;
+               }
+
+               /* mark this interrupt source as active and release the lock. */
+               src->active = 1;
+               rte_spinlock_unlock(&intr_lock);
+
+               /* set the length to be read for the different handle types */
+               switch (src->intr_handle.type) {
+               case RTE_INTR_HANDLE_UIO:
+               case RTE_INTR_HANDLE_UIO_INTX:
+                       bytes_read = sizeof(buf.uio_intr_count);
+                       break;
+               case RTE_INTR_HANDLE_ALARM:
+                       bytes_read = sizeof(buf.timerfd_num);
+                       break;
+#ifdef VFIO_PRESENT
+               case RTE_INTR_HANDLE_VFIO_MSIX:
+               case RTE_INTR_HANDLE_VFIO_MSI:
+               case RTE_INTR_HANDLE_VFIO_LEGACY:
+                       bytes_read = sizeof(buf.vfio_intr_count);
+                       break;
+#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
+               case RTE_INTR_HANDLE_VFIO_REQ:
+                       bytes_read = 0;
+                       call = true;
+                       break;
+#endif
+#endif
+               case RTE_INTR_HANDLE_VDEV:
+               case RTE_INTR_HANDLE_EXT:
+                       bytes_read = 0;
+                       call = true;
+                       break;
+               case RTE_INTR_HANDLE_DEV_EVENT:
+                       bytes_read = 0;
+                       call = true;
+                       break;
+               default:
+                       bytes_read = 1;
+                       break;
+               }
+
+               if (bytes_read > 0) {
+                       /**
+                        * read out to clear the ready-to-be-read flag
+                        * for epoll_wait.
+                        */
+                       bytes_read = read(events[n].data.fd, &buf, bytes_read);
+                       if (bytes_read < 0) {
+                               if (errno == EINTR || errno == EWOULDBLOCK)
+                                       continue;
+
+                               RTE_LOG(ERR, EAL, "Error reading from file "
+                                       "descriptor %d: %s\n",
+                                       events[n].data.fd,
+                                       strerror(errno));
+                               /*
+                                * The device is unplugged or buggy, remove
+                                * it as an interrupt source and return to
+                                * force the wait list to be rebuilt.
+                                */
+                               rte_spinlock_lock(&intr_lock);
+                               TAILQ_REMOVE(&intr_sources, src, next);
+                               rte_spinlock_unlock(&intr_lock);
+
+                               for (cb = TAILQ_FIRST(&src->callbacks); cb;
+                                                       cb = next) {
+                                       next = TAILQ_NEXT(cb, next);
+                                       TAILQ_REMOVE(&src->callbacks, cb, next);
+                                       free(cb);
+                               }
+                               free(src);
+                               return -1;
+                       } else if (bytes_read == 0)
+                               RTE_LOG(ERR, EAL, "Read nothing from file "
+                                       "descriptor %d\n", events[n].data.fd);
+                       else
+                               call = true;
+               }
+
+               /* grab the lock again to call callbacks and update status. */
+               rte_spinlock_lock(&intr_lock);
+
+               if (call) {
+
+                       /* Finally, call all callbacks. */
+                       TAILQ_FOREACH(cb, &src->callbacks, next) {
+
+                               /* make a copy and unlock. */
+                               active_cb = *cb;
+                               rte_spinlock_unlock(&intr_lock);
+
+                               /* call the actual callback */
+                               active_cb.cb_fn(active_cb.cb_arg);
+
+                               /* get the lock back. */
+                               rte_spinlock_lock(&intr_lock);
+                       }
+               }
+
+               /* we are done with this interrupt source, release it. */
+               src->active = 0;
+               rte_spinlock_unlock(&intr_lock);
+       }
+
+       return 0;
+}
+
+/**
+ * It handles all the interrupts.
+ *
+ * @param pfd
+ *  epoll file descriptor.
+ * @param totalfds
+ *  The number of file descriptors added in epoll.
+ *
+ * @return
+ *  void
+ */
+static void
+eal_intr_handle_interrupts(int pfd, unsigned totalfds)
+{
+       struct epoll_event events[totalfds];
+       int nfds = 0;
+
+       for(;;) {
+               nfds = epoll_wait(pfd, events, totalfds,
+                       EAL_INTR_EPOLL_WAIT_FOREVER);
+               /* epoll_wait failed */
+               if (nfds < 0) {
+                       if (errno == EINTR)
+                               continue;
+                       RTE_LOG(ERR, EAL,
+                               "epoll_wait returns with fail\n");
+                       return;
+               }
+               /* epoll_wait timed out; should never happen here */
+               else if (nfds == 0)
+                       continue;
+               /* epoll_wait has at least one fd ready to read */
+               if (eal_intr_process_interrupts(events, nfds) < 0)
+                       return;
+       }
+}
+
+/**
+ * It builds/rebuilds the epoll file descriptor with all the
+ * file descriptors being waited on, then handles the interrupts.
+ *
+ * @param arg
+ *  pointer. (unused)
+ *
+ * @return
+ *  never returns
+ */
+static __attribute__((noreturn)) void *
+eal_intr_thread_main(__rte_unused void *arg)
+{
+       struct epoll_event ev;
+
+       /* host thread, never break out */
+       for (;;) {
+               /* build up the epoll fd with all descriptors we are to
+                * wait on then pass it to the handle_interrupts function
+                */
+               static struct epoll_event pipe_event = {
+                       .events = EPOLLIN | EPOLLPRI,
+               };
+               struct rte_intr_source *src;
+               unsigned numfds = 0;
+
+               /* create epoll fd */
+               int pfd = epoll_create(1);
+               if (pfd < 0)
+                       rte_panic("Cannot create epoll instance\n");
+
+               pipe_event.data.fd = intr_pipe.readfd;
+               /**
+                * add the pipe fd into the wait list; this pipe is used
+                * to trigger rebuilds of the wait list.
+                */
+               if (epoll_ctl(pfd, EPOLL_CTL_ADD, intr_pipe.readfd,
+                                               &pipe_event) < 0) {
+                       rte_panic("Error adding fd to %d epoll_ctl, %s\n",
+                                       intr_pipe.readfd, strerror(errno));
+               }
+               numfds++;
+
+               rte_spinlock_lock(&intr_lock);
+
+               TAILQ_FOREACH(src, &intr_sources, next) {
+                       if (src->callbacks.tqh_first == NULL)
+                               continue; /* skip those with no callbacks */
+                       ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
+                       ev.data.fd = src->intr_handle.fd;
+
+                       /**
+                        * add each registered source's file descriptor
+                        * into the wait list.
+                        */
+                       if (epoll_ctl(pfd, EPOLL_CTL_ADD,
+                                       src->intr_handle.fd, &ev) < 0){
+                               rte_panic("Error adding fd %d epoll_ctl, %s\n",
+                                       src->intr_handle.fd, strerror(errno));
+                       }
+                       else
+                               numfds++;
+               }
+               rte_spinlock_unlock(&intr_lock);
+               /* serve the interrupt */
+               eal_intr_handle_interrupts(pfd, numfds);
+
+               /**
+                * when we return, we need to rebuild the
+                * list of fds to monitor.
+                */
+               close(pfd);
+       }
+}
+
+int
+rte_eal_intr_init(void)
+{
+       int ret = 0;
+
+       /* init the global interrupt source head */
+       TAILQ_INIT(&intr_sources);
+
+       /**
+        * create a pipe which will be waited by epoll and notified to
+        * rebuild the wait list of epoll.
+        */
+       if (pipe(intr_pipe.pipefd) < 0) {
+               rte_errno = errno;
+               return -1;
+       }
+
+       /* create the host thread to wait for and handle interrupts */
+       ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL,
+                       eal_intr_thread_main, NULL);
+       if (ret != 0) {
+               rte_errno = -ret;
+               RTE_LOG(ERR, EAL,
+                       "Failed to create thread for interrupt handling\n");
+       }
+
+       return ret;
+}
+
+static void
+eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle)
+{
+       union rte_intr_read_buffer buf;
+       int bytes_read = 0;
+       int nbytes;
+
+       switch (intr_handle->type) {
+       case RTE_INTR_HANDLE_UIO:
+       case RTE_INTR_HANDLE_UIO_INTX:
+               bytes_read = sizeof(buf.uio_intr_count);
+               break;
+#ifdef VFIO_PRESENT
+       case RTE_INTR_HANDLE_VFIO_MSIX:
+       case RTE_INTR_HANDLE_VFIO_MSI:
+       case RTE_INTR_HANDLE_VFIO_LEGACY:
+               bytes_read = sizeof(buf.vfio_intr_count);
+               break;
+#endif
+       case RTE_INTR_HANDLE_VDEV:
+               bytes_read = intr_handle->efd_counter_size;
+               /* For vdev, number of bytes to read is set by driver */
+               break;
+       case RTE_INTR_HANDLE_EXT:
+               return;
+       default:
+               bytes_read = 1;
+               RTE_LOG(INFO, EAL, "unexpected intr type\n");
+               break;
+       }
+
+       /**
+        * read out to clear the ready-to-be-read flag
+        * for epoll_wait.
+        */
+       if (bytes_read == 0)
+               return;
+       do {
+               nbytes = read(fd, &buf, bytes_read);
+               if (nbytes < 0) {
+                       if (errno == EINTR || errno == EWOULDBLOCK ||
+                           errno == EAGAIN)
+                               continue;
+                       RTE_LOG(ERR, EAL,
+                               "Error reading from fd %d: %s\n",
+                               fd, strerror(errno));
+               } else if (nbytes == 0)
+                       RTE_LOG(ERR, EAL, "Read nothing from fd %d\n", fd);
+               return;
+       } while (1);
+}
+
+static int
+eal_epoll_process_event(struct epoll_event *evs, unsigned int n,
+                       struct rte_epoll_event *events)
+{
+       unsigned int i, count = 0;
+       struct rte_epoll_event *rev;
+
+       for (i = 0; i < n; i++) {
+               rev = evs[i].data.ptr;
+               if (!rev || !rte_atomic32_cmpset(&rev->status, RTE_EPOLL_VALID,
+                                                RTE_EPOLL_EXEC))
+                       continue;
+
+               events[count].status        = RTE_EPOLL_VALID;
+               events[count].fd            = rev->fd;
+               events[count].epfd          = rev->epfd;
+               events[count].epdata.event  = rev->epdata.event;
+               events[count].epdata.data   = rev->epdata.data;
+               if (rev->epdata.cb_fun)
+                       rev->epdata.cb_fun(rev->fd,
+                                          rev->epdata.cb_arg);
+
+               rte_compiler_barrier();
+               rev->status = RTE_EPOLL_VALID;
+               count++;
+       }
+       return count;
+}
+
+static inline int
+eal_init_tls_epfd(void)
+{
+       int pfd = epoll_create(255);
+
+       if (pfd < 0) {
+               RTE_LOG(ERR, EAL,
+                       "Cannot create epoll instance\n");
+               return -1;
+       }
+       return pfd;
+}
+
+int
+rte_intr_tls_epfd(void)
+{
+       if (RTE_PER_LCORE(_epfd) == -1)
+               RTE_PER_LCORE(_epfd) = eal_init_tls_epfd();
+
+       return RTE_PER_LCORE(_epfd);
+}
+
+int
+rte_epoll_wait(int epfd, struct rte_epoll_event *events,
+              int maxevents, int timeout)
+{
+       struct epoll_event evs[maxevents];
+       int rc;
+
+       if (!events) {
+               RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
+               return -1;
+       }
+
+       /* using per thread epoll fd */
+       if (epfd == RTE_EPOLL_PER_THREAD)
+               epfd = rte_intr_tls_epfd();
+
+       while (1) {
+               rc = epoll_wait(epfd, evs, maxevents, timeout);
+               if (likely(rc > 0)) {
+                       /* epoll_wait has at least one fd ready to read */
+                       rc = eal_epoll_process_event(evs, rc, events);
+                       break;
+               } else if (rc < 0) {
+                       if (errno == EINTR)
+                               continue;
+                       /* epoll_wait fail */
+                       RTE_LOG(ERR, EAL, "epoll_wait returns with fail %s\n",
+                               strerror(errno));
+                       rc = -1;
+                       break;
+               } else {
+                       /* rc == 0, epoll_wait timed out */
+                       break;
+               }
+       }
+
+       return rc;
+}
+
+static inline void
+eal_epoll_data_safe_free(struct rte_epoll_event *ev)
+{
+       while (!rte_atomic32_cmpset(&ev->status, RTE_EPOLL_VALID,
+                                   RTE_EPOLL_INVALID))
+               while (ev->status != RTE_EPOLL_VALID)
+                       rte_pause();
+       memset(&ev->epdata, 0, sizeof(ev->epdata));
+       ev->fd = -1;
+       ev->epfd = -1;
+}
+
+int
+rte_epoll_ctl(int epfd, int op, int fd,
+             struct rte_epoll_event *event)
+{
+       struct epoll_event ev;
+
+       if (!event) {
+               RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
+               return -1;
+       }
+
+       /* using per thread epoll fd */
+       if (epfd == RTE_EPOLL_PER_THREAD)
+               epfd = rte_intr_tls_epfd();
+
+       if (op == EPOLL_CTL_ADD) {
+               event->status = RTE_EPOLL_VALID;
+               event->fd = fd;  /* ignore fd in event */
+               event->epfd = epfd;
+               ev.data.ptr = (void *)event;
+       }
+
+       ev.events = event->epdata.event;
+       if (epoll_ctl(epfd, op, fd, &ev) < 0) {
+               RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n",
+                       op, fd, strerror(errno));
+               if (op == EPOLL_CTL_ADD)
+                       /* roll back status when CTL_ADD fails */
+                       event->status = RTE_EPOLL_INVALID;
+               return -1;
+       }
+
+       if (op == EPOLL_CTL_DEL && event->status != RTE_EPOLL_INVALID)
+               eal_epoll_data_safe_free(event);
+
+       return 0;
+}
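+
+/*
+ * Illustrative usage sketch (not part of the original file): any readable
+ * fd, e.g. an eventfd (<sys/eventfd.h>), can be added to the per-thread
+ * epoll instance and waited on with rte_epoll_wait().  "efd" is
+ * hypothetical; "ev" must stay valid while the fd remains registered.
+ *
+ *     struct rte_epoll_event ev = { 0 };
+ *     struct rte_epoll_event out[1];
+ *     int efd = eventfd(0, EFD_NONBLOCK);
+ *
+ *     ev.epdata.event = EPOLLIN;  // epoll events to wait for
+ *     if (rte_epoll_ctl(RTE_EPOLL_PER_THREAD, EPOLL_CTL_ADD, efd, &ev) == 0) {
+ *             int n = rte_epoll_wait(RTE_EPOLL_PER_THREAD, out, 1, -1);
+ *             // n > 0: out[0].fd is ready, n == 0: timeout, n < 0: error
+ *     }
+ */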
+
+int
+rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
+               int op, unsigned int vec, void *data)
+{
+       struct rte_epoll_event *rev;
+       struct rte_epoll_data *epdata;
+       int epfd_op;
+       unsigned int efd_idx;
+       int rc = 0;
+
+       efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ?
+               (vec - RTE_INTR_VEC_RXTX_OFFSET) : vec;
+
+       if (!intr_handle || intr_handle->nb_efd == 0 ||
+           efd_idx >= intr_handle->nb_efd) {
+               RTE_LOG(ERR, EAL, "Wrong intr vector number.\n");
+               return -EPERM;
+       }
+
+       switch (op) {
+       case RTE_INTR_EVENT_ADD:
+               epfd_op = EPOLL_CTL_ADD;
+               rev = &intr_handle->elist[efd_idx];
+               if (rev->status != RTE_EPOLL_INVALID) {
+                       RTE_LOG(INFO, EAL, "Event already been added.\n");
+                       return -EEXIST;
+               }
+
+               /* attach to intr vector fd */
+               epdata = &rev->epdata;
+               epdata->event  = EPOLLIN | EPOLLPRI | EPOLLET;
+               epdata->data   = data;
+               epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr;
+               epdata->cb_arg = (void *)intr_handle;
+               rc = rte_epoll_ctl(epfd, epfd_op,
+                                  intr_handle->efds[efd_idx], rev);
+               if (!rc)
+                       RTE_LOG(DEBUG, EAL,
+                               "efd %d associated with vec %d added on epfd %d"
+                               "\n", rev->fd, vec, epfd);
+               else
+                       rc = -EPERM;
+               break;
+       case RTE_INTR_EVENT_DEL:
+               epfd_op = EPOLL_CTL_DEL;
+               rev = &intr_handle->elist[efd_idx];
+               if (rev->status == RTE_EPOLL_INVALID) {
+                       RTE_LOG(INFO, EAL, "Event does not exist.\n");
+                       return -EPERM;
+               }
+
+               rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev);
+               if (rc)
+                       rc = -EPERM;
+               break;
+       default:
+               RTE_LOG(ERR, EAL, "event op type mismatch\n");
+               rc = -EPERM;
+       }
+
+       return rc;
+}
+
+void
+rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle)
+{
+       uint32_t i;
+       struct rte_epoll_event *rev;
+
+       for (i = 0; i < intr_handle->nb_efd; i++) {
+               rev = &intr_handle->elist[i];
+               if (rev->status == RTE_EPOLL_INVALID)
+                       continue;
+               if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) {
+                       /* force free if the entry is still valid */
+                       eal_epoll_data_safe_free(rev);
+                       rev->status = RTE_EPOLL_INVALID;
+               }
+       }
+}
+
+int
+rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd)
+{
+       uint32_t i;
+       int fd;
+       uint32_t n = RTE_MIN(nb_efd, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
+
+       assert(nb_efd != 0);
+
+       if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX) {
+               for (i = 0; i < n; i++) {
+                       fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+                       if (fd < 0) {
+                               RTE_LOG(ERR, EAL,
+                                       "can't setup eventfd, error %i (%s)\n",
+                                       errno, strerror(errno));
+                               return -errno;
+                       }
+                       intr_handle->efds[i] = fd;
+               }
+               intr_handle->nb_efd   = n;
+               intr_handle->max_intr = NB_OTHER_INTR + n;
+       } else if (intr_handle->type == RTE_INTR_HANDLE_VDEV) {
+               /* only check; initialization is done in the vdev driver. */
+               if (intr_handle->efd_counter_size >
+                   sizeof(union rte_intr_read_buffer)) {
+                       RTE_LOG(ERR, EAL, "the efd_counter_size is oversized");
+                       return -EINVAL;
+               }
+       } else {
+               intr_handle->efds[0]  = intr_handle->fd;
+               intr_handle->nb_efd   = RTE_MIN(nb_efd, 1U);
+               intr_handle->max_intr = NB_OTHER_INTR;
+       }
+
+       return 0;
+}
+
+void
+rte_intr_efd_disable(struct rte_intr_handle *intr_handle)
+{
+       uint32_t i;
+
+       rte_intr_free_epoll_fd(intr_handle);
+       if (intr_handle->max_intr > intr_handle->nb_efd) {
+               for (i = 0; i < intr_handle->nb_efd; i++)
+                       close(intr_handle->efds[i]);
+       }
+       intr_handle->nb_efd = 0;
+       intr_handle->max_intr = 0;
+}
+
+int
+rte_intr_dp_is_en(struct rte_intr_handle *intr_handle)
+{
+       return !(!intr_handle->nb_efd);
+}
+
+int
+rte_intr_allow_others(struct rte_intr_handle *intr_handle)
+{
+       if (!rte_intr_dp_is_en(intr_handle))
+               return 1;
+       else
+               return !!(intr_handle->max_intr - intr_handle->nb_efd);
+}
+
+int
+rte_intr_cap_multiple(struct rte_intr_handle *intr_handle)
+{
+       if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX)
+               return 1;
+
+       if (intr_handle->type == RTE_INTR_HANDLE_VDEV)
+               return 1;
+
+       return 0;
+}
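+
+/*
+ * Illustrative sketch (not part of the original file) of how the Rx
+ * interrupt helpers above fit together for a device that supports them;
+ * "nb_rx_queues" and "q" are hypothetical.
+ *
+ *     if (rte_intr_cap_multiple(intr_handle))
+ *             // one eventfd per Rx queue (VFIO MSI-X or vdev)
+ *             rte_intr_efd_enable(intr_handle, nb_rx_queues);
+ *
+ *     // associate queue q with an epoll instance; Rx/Tx interrupt vectors
+ *     // start at RTE_INTR_VEC_RXTX_OFFSET
+ *     rte_intr_rx_ctl(intr_handle, RTE_EPOLL_PER_THREAD,
+ *                     RTE_INTR_EVENT_ADD, RTE_INTR_VEC_RXTX_OFFSET + q,
+ *                     NULL);
+ */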
diff --git a/lib/librte_eal/linux/eal/eal_lcore.c b/lib/librte_eal/linux/eal/eal_lcore.c
new file mode 100644 (file)
index 0000000..bc89658
--- /dev/null
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#include <unistd.h>
+#include <limits.h>
+#include <string.h>
+#include <dirent.h>
+
+#include <rte_log.h>
+#include <rte_eal.h>
+#include <rte_lcore.h>
+#include <rte_common.h>
+#include <rte_string_fns.h>
+#include <rte_debug.h>
+
+#include "eal_private.h"
+#include "eal_filesystem.h"
+#include "eal_thread.h"
+
+#define SYS_CPU_DIR "/sys/devices/system/cpu/cpu%u"
+#define CORE_ID_FILE "topology/core_id"
+#define NUMA_NODE_PATH "/sys/devices/system/node"
+
+/* Check if a cpu is present by checking for its cpu information in sysfs */
+int
+eal_cpu_detected(unsigned lcore_id)
+{
+       char path[PATH_MAX];
+       int len = snprintf(path, sizeof(path), SYS_CPU_DIR
+               "/"CORE_ID_FILE, lcore_id);
+       if (len <= 0 || (unsigned)len >= sizeof(path))
+               return 0;
+       if (access(path, F_OK) != 0)
+               return 0;
+
+       return 1;
+}
+
+/*
+ * Get CPU socket id (NUMA node) for a logical core.
+ *
+ * This searches each nodeX directory in /sys for an entry for the given
+ * lcore_id and returns the numa node where the lcore is found. If the lcore
+ * is not found on any numa node, it returns zero.
+ */
+unsigned
+eal_cpu_socket_id(unsigned lcore_id)
+{
+       unsigned socket;
+
+       for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) {
+               char path[PATH_MAX];
+
+               snprintf(path, sizeof(path), "%s/node%u/cpu%u", NUMA_NODE_PATH,
+                               socket, lcore_id);
+               if (access(path, F_OK) == 0)
+                       return socket;
+       }
+       return 0;
+}
+
+/* Get the cpu core id value from the /sys/.../cpuX core_id value */
+unsigned
+eal_cpu_core_id(unsigned lcore_id)
+{
+       char path[PATH_MAX];
+       unsigned long id;
+
+       int len = snprintf(path, sizeof(path), SYS_CPU_DIR "/%s",
+                       lcore_id, CORE_ID_FILE);
+       if (len <= 0 || (unsigned)len >= sizeof(path))
+               goto err;
+       if (eal_parse_sysfs_value(path, &id) != 0)
+               goto err;
+       return (unsigned)id;
+
+err:
+       RTE_LOG(ERR, EAL, "Error reading core id value from %s "
+                       "for lcore %u - assuming core 0\n", SYS_CPU_DIR, lcore_id);
+       return 0;
+}
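+
+/*
+ * Worked example (illustrative): for lcore_id == 2 the sysfs paths probed
+ * by the functions above would be
+ *
+ *     /sys/devices/system/cpu/cpu2/topology/core_id        (presence/core id)
+ *     /sys/devices/system/node/node0/cpu2, node1/cpu2, ...  (socket id)
+ */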
diff --git a/lib/librte_eal/linux/eal/eal_log.c b/lib/librte_eal/linux/eal/eal_log.c
new file mode 100644 (file)
index 0000000..9d02ddd
--- /dev/null
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#include <string.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include <syslog.h>
+#include <sys/queue.h>
+
+#include <rte_memory.h>
+#include <rte_eal.h>
+#include <rte_launch.h>
+#include <rte_per_lcore.h>
+#include <rte_lcore.h>
+#include <rte_spinlock.h>
+#include <rte_log.h>
+
+#include "eal_private.h"
+
+/*
+ * default log function
+ */
+static ssize_t
+console_log_write(__attribute__((unused)) void *c, const char *buf, size_t size)
+{
+       ssize_t ret;
+
+       /* write on stdout */
+       ret = fwrite(buf, 1, size, stdout);
+       fflush(stdout);
+
+       /* Syslog error levels are from 0 to 7, so subtract 1 to convert */
+       syslog(rte_log_cur_msg_loglevel() - 1, "%.*s", (int)size, buf);
+
+       return ret;
+}
+
+static cookie_io_functions_t console_log_func = {
+       .write = console_log_write,
+};
+
+/*
+ * set the log to the default function; called during the EAL init
+ * process, once memzones are available.
+ */
+int
+rte_eal_log_init(const char *id, int facility)
+{
+       FILE *log_stream;
+
+       log_stream = fopencookie(NULL, "w+", console_log_func);
+       if (log_stream == NULL)
+               return -1;
+
+       openlog(id, LOG_NDELAY | LOG_PID, facility);
+
+       eal_log_set_default(log_stream);
+
+       return 0;
+}
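+
+/*
+ * Illustrative sketch (not part of the original file): the same
+ * fopencookie() mechanism can back a custom application log sink, with
+ * rte_openlog_stream() installing the resulting stream.  The name
+ * "app_log_write" is hypothetical.
+ *
+ *     static ssize_t
+ *     app_log_write(void *c __attribute__((unused)),
+ *                     const char *buf, size_t size)
+ *     {
+ *             return fwrite(buf, 1, size, stderr);  // e.g. log to stderr
+ *     }
+ *
+ *     static cookie_io_functions_t app_log_func = {
+ *             .write = app_log_write,
+ *     };
+ *
+ *     FILE *f = fopencookie(NULL, "w+", app_log_func);
+ *     if (f != NULL)
+ *             rte_openlog_stream(f);  // redirect rte_log() output
+ */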
diff --git a/lib/librte_eal/linux/eal/eal_memalloc.c b/lib/librte_eal/linux/eal/eal_memalloc.c
new file mode 100644 (file)
index 0000000..b6fb183
--- /dev/null
@@ -0,0 +1,1685 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2017-2018 Intel Corporation
+ */
+
+#define _FILE_OFFSET_BITS 64
+#include <errno.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/queue.h>
+#include <sys/file.h>
+#include <unistd.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <signal.h>
+#include <setjmp.h>
+#ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */
+#include <linux/memfd.h>
+#define MEMFD_SUPPORTED
+#endif
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+#include <numa.h>
+#include <numaif.h>
+#endif
+#include <linux/falloc.h>
+#include <linux/mman.h> /* for hugetlb-related mmap flags */
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_eal_memconfig.h>
+#include <rte_eal.h>
+#include <rte_errno.h>
+#include <rte_memory.h>
+#include <rte_spinlock.h>
+
+#include "eal_filesystem.h"
+#include "eal_internal_cfg.h"
+#include "eal_memalloc.h"
+#include "eal_private.h"
+
+const int anonymous_hugepages_supported =
+#ifdef MAP_HUGE_SHIFT
+               1;
+#define RTE_MAP_HUGE_SHIFT MAP_HUGE_SHIFT
+#else
+               0;
+#define RTE_MAP_HUGE_SHIFT 26
+#endif
+
+/*
+ * we've already checked memfd support at compile-time, but we also need to
+ * check if we can create hugepage files with memfd.
+ *
+ * also, this is not a constant, because while we may be *compiled* with memfd
+ * hugetlbfs support, we might not be *running* on a system that supports memfd
+ * and/or memfd with hugetlbfs, so we need to be able to adjust this flag at
+ * runtime, and fall back to anonymous memory.
+ */
+static int memfd_create_supported =
+#ifdef MFD_HUGETLB
+               1;
+#define RTE_MFD_HUGETLB MFD_HUGETLB
+#else
+               0;
+#define RTE_MFD_HUGETLB 4U
+#endif
+
+/*
+ * not all kernel versions support fallocate on hugetlbfs, so fall back to
+ * ftruncate and disallow deallocation if fallocate is not supported.
+ */
+static int fallocate_supported = -1; /* unknown */
+
+/*
+ * we have two modes - single file segments, and file-per-page mode.
+ *
+ * for single-file segments, we need some kind of mechanism to keep track of
+ * which hugepages can be freed back to the system, and which cannot. we cannot
+ * use flock() because they don't allow locking parts of a file, and we cannot
+ * use fcntl() due to issues with their semantics, so we will have to rely on a
+ * bunch of lockfiles for each page. so, we will use 'fds' array to keep track
+ * of per-page lockfiles. we will store the actual segment list fd in the
+ * 'memseg_list_fd' field.
+ *
+ * for file-per-page mode, each page will have its own fd, so 'memseg_list_fd'
+ * will be invalid (set to -1), and we'll use 'fds' to keep track of page fd's.
+ *
+ * we cannot know how many pages a system will have in advance, but we do know
+ * that they come in lists, and we know lengths of these lists. so, simply store
+ * a malloc'd array of fd's indexed by list and segment index.
+ *
+ * they will be initialized at startup, and filled as we allocate/deallocate
+ * segments.
+ */
+static struct {
+       int *fds; /**< dynamically allocated array of segment lock fd's */
+       int memseg_list_fd; /**< memseg list fd */
+       int len; /**< total length of the array */
+       int count; /**< number of entries currently in use */
+} fd_list[RTE_MAX_MEMSEG_LISTS];
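+
+/*
+ * Illustrative sketch (not part of the original file): at startup each
+ * list's fd array is sized to the number of segments in that list and
+ * every entry starts out as -1, meaning "no fd yet".  A minimal init for
+ * one list could look like this ("nb_segs" is hypothetical):
+ *
+ *     int i, *data = malloc(sizeof(int) * nb_segs);
+ *     if (data != NULL) {
+ *             for (i = 0; i < nb_segs; i++)
+ *                     data[i] = -1;
+ *             fd_list[list_idx].fds = data;
+ *             fd_list[list_idx].len = nb_segs;
+ *             fd_list[list_idx].count = 0;
+ *             fd_list[list_idx].memseg_list_fd = -1;
+ *     }
+ */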
+
+/** local copy of a memory map, used to synchronize memory hotplug in MP */
+static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS];
+
+static sigjmp_buf huge_jmpenv;
+
+static void __rte_unused huge_sigbus_handler(int signo __rte_unused)
+{
+       siglongjmp(huge_jmpenv, 1);
+}
+
+/* Put sigsetjmp into a wrapper function to avoid a compile error: any
+ * non-volatile, non-static local variable in the stack frame calling
+ * sigsetjmp might be clobbered by a call to longjmp.
+ */
+static int __rte_unused huge_wrap_sigsetjmp(void)
+{
+       return sigsetjmp(huge_jmpenv, 1);
+}
+
+static struct sigaction huge_action_old;
+static int huge_need_recover;
+
+static void __rte_unused
+huge_register_sigbus(void)
+{
+       sigset_t mask;
+       struct sigaction action;
+
+       sigemptyset(&mask);
+       sigaddset(&mask, SIGBUS);
+       action.sa_flags = 0;
+       action.sa_mask = mask;
+       action.sa_handler = huge_sigbus_handler;
+
+       huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
+}
+
+static void __rte_unused
+huge_recover_sigbus(void)
+{
+       if (huge_need_recover) {
+               sigaction(SIGBUS, &huge_action_old, NULL);
+               huge_need_recover = 0;
+       }
+}
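+
+/*
+ * Illustrative sketch (not part of the original file): the intended use of
+ * the SIGBUS helpers above is to guard the first write to a freshly mapped
+ * hugepage, which raises SIGBUS if the kernel cannot actually back it:
+ *
+ *     huge_register_sigbus();
+ *     if (huge_wrap_sigsetjmp()) {
+ *             // longjmp'd back here: the page could not be faulted in
+ *             huge_recover_sigbus();
+ *             return -1;
+ *     }
+ *     *(volatile int *)addr = *(volatile int *)addr;  // touch the page
+ *     huge_recover_sigbus();
+ */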
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+static bool
+check_numa(void)
+{
+       bool ret = true;
+       /* Check if kernel supports NUMA. */
+       if (numa_available() != 0) {
+               RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n");
+               ret = false;
+       }
+       return ret;
+}
+
+static void
+prepare_numa(int *oldpolicy, struct bitmask *oldmask, int socket_id)
+{
+       RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
+       if (get_mempolicy(oldpolicy, oldmask->maskp,
+                         oldmask->size + 1, 0, 0) < 0) {
+               RTE_LOG(ERR, EAL,
+                       "Failed to get current mempolicy: %s. "
+                       "Assuming MPOL_DEFAULT.\n", strerror(errno));
+               *oldpolicy = MPOL_DEFAULT;
+       }
+       RTE_LOG(DEBUG, EAL,
+               "Setting policy MPOL_PREFERRED for socket %d\n",
+               socket_id);
+       numa_set_preferred(socket_id);
+}
+
+static void
+restore_numa(int *oldpolicy, struct bitmask *oldmask)
+{
+       RTE_LOG(DEBUG, EAL,
+               "Restoring previous memory policy: %d\n", *oldpolicy);
+       if (*oldpolicy == MPOL_DEFAULT) {
+               numa_set_localalloc();
+       } else if (set_mempolicy(*oldpolicy, oldmask->maskp,
+                                oldmask->size + 1) < 0) {
+               RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n",
+                       strerror(errno));
+               numa_set_localalloc();
+       }
+       numa_free_cpumask(oldmask);
+}
+#endif
+
+/*
+ * uses fstat to report the size of a file on disk
+ */
+static off_t
+get_file_size(int fd)
+{
+       struct stat st;
+       if (fstat(fd, &st) < 0)
+               return 0;
+       return st.st_size;
+}
+
+static int
+pagesz_flags(uint64_t page_sz)
+{
+       /* as per the mmap() manpage, the hugepage size is encoded as its
+        * log2 shifted left by MAP_HUGE_SHIFT
+        */
+       int log2 = rte_log2_u64(page_sz);
+       return log2 << RTE_MAP_HUGE_SHIFT;
+}
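+
+/*
+ * Worked example (illustrative): for a 2 MB hugepage, rte_log2_u64(2 * 1024
+ * * 1024) is 21, so pagesz_flags() returns 21 << MAP_HUGE_SHIFT, which
+ * matches the kernel's MAP_HUGE_2MB encoding; it is OR'd into the mmap()
+ * flags together with MAP_HUGETLB when mapping anonymous hugepages.
+ */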
+
+/* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */
+static int lock(int fd, int type)
+{
+       int ret;
+
+       /* flock may be interrupted */
+       do {
+               ret = flock(fd, type | LOCK_NB);
+       } while (ret && errno == EINTR);
+
+       if (ret && errno == EWOULDBLOCK) {
+               /* couldn't lock */
+               return 0;
+       } else if (ret) {
+               RTE_LOG(ERR, EAL, "%s(): error calling flock(): %s\n",
+                       __func__, strerror(errno));
+               return -1;
+       }
+       /* lock was successful */
+       return 1;
+}
+
+static int get_segment_lock_fd(int list_idx, int seg_idx)
+{
+       char path[PATH_MAX] = {0};
+       int fd;
+
+       if (list_idx < 0 || list_idx >= (int)RTE_DIM(fd_list))
+               return -1;
+       if (seg_idx < 0 || seg_idx >= fd_list[list_idx].len)
+               return -1;
+
+       fd = fd_list[list_idx].fds[seg_idx];
+       /* does this lock already exist? */
+       if (fd >= 0)
+               return fd;
+
+       eal_get_hugefile_lock_path(path, sizeof(path),
+                       list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
+
+       fd = open(path, O_CREAT | O_RDWR, 0660);
+       if (fd < 0) {
+               RTE_LOG(ERR, EAL, "%s(): error creating lockfile '%s': %s\n",
+                       __func__, path, strerror(errno));
+               return -1;
+       }
+       /* take out a read lock */
+       if (lock(fd, LOCK_SH) != 1) {
+               RTE_LOG(ERR, EAL, "%s(): failed to take out a readlock on '%s': %s\n",
+                       __func__, path, strerror(errno));
+               close(fd);
+               return -1;
+       }
+       /* store it for future reference */
+       fd_list[list_idx].fds[seg_idx] = fd;
+       fd_list[list_idx].count++;
+       return fd;
+}
+
+static int unlock_segment(int list_idx, int seg_idx)
+{
+       int fd, ret;
+
+       if (list_idx < 0 || list_idx >= (int)RTE_DIM(fd_list))
+               return -1;
+       if (seg_idx < 0 || seg_idx >= fd_list[list_idx].len)
+               return -1;
+
+       fd = fd_list[list_idx].fds[seg_idx];
+
+       /* upgrade lock to exclusive to see if we can remove the lockfile */
+       ret = lock(fd, LOCK_EX);
+       if (ret == 1) {
+               /* we've succeeded in taking exclusive lock, this lockfile may
+                * be removed.
+                */
+               char path[PATH_MAX] = {0};
+               eal_get_hugefile_lock_path(path, sizeof(path),
+                               list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
+               if (unlink(path)) {
+                       RTE_LOG(ERR, EAL, "%s(): error removing lockfile '%s': %s\n",
+                                       __func__, path, strerror(errno));
+               }
+       }
+       /* we don't want to leak the fd, so even if we fail to lock, close fd
+        * and remove it from list anyway.
+        */
+       close(fd);
+       fd_list[list_idx].fds[seg_idx] = -1;
+       fd_list[list_idx].count--;
+
+       if (ret < 0)
+               return -1;
+       return 0;
+}
+
+static int
+get_seg_memfd(struct hugepage_info *hi __rte_unused,
+               unsigned int list_idx __rte_unused,
+               unsigned int seg_idx __rte_unused)
+{
+#ifdef MEMFD_SUPPORTED
+       int fd;
+       char segname[250]; /* as per manpage, limit is 249 bytes plus null */
+
+       int flags = RTE_MFD_HUGETLB | pagesz_flags(hi->hugepage_sz);
+
+       if (internal_config.single_file_segments) {
+               fd = fd_list[list_idx].memseg_list_fd;
+
+               if (fd < 0) {
+                       snprintf(segname, sizeof(segname), "seg_%i", list_idx);
+                       fd = memfd_create(segname, flags);
+                       if (fd < 0) {
+                               RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n",
+                                       __func__, strerror(errno));
+                               return -1;
+                       }
+                       fd_list[list_idx].memseg_list_fd = fd;
+               }
+       } else {
+               fd = fd_list[list_idx].fds[seg_idx];
+
+               if (fd < 0) {
+                       snprintf(segname, sizeof(segname), "seg_%i-%i",
+                                       list_idx, seg_idx);
+                       fd = memfd_create(segname, flags);
+                       if (fd < 0) {
+                               RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n",
+                                       __func__, strerror(errno));
+                               return -1;
+                       }
+                       fd_list[list_idx].fds[seg_idx] = fd;
+               }
+       }
+       return fd;
+#endif
+       return -1;
+}
+
+static int
+get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
+               unsigned int list_idx, unsigned int seg_idx)
+{
+       int fd;
+
+       /* for in-memory mode, we only make it here when we're sure we support
+        * memfd, and this is a special case.
+        */
+       if (internal_config.in_memory)
+               return get_seg_memfd(hi, list_idx, seg_idx);
+
+       if (internal_config.single_file_segments) {
+               /* create a hugepage file path */
+               eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx);
+
+               fd = fd_list[list_idx].memseg_list_fd;
+
+               if (fd < 0) {
+                       fd = open(path, O_CREAT | O_RDWR, 0600);
+                       if (fd < 0) {
+                               RTE_LOG(ERR, EAL, "%s(): open failed: %s\n",
+                                       __func__, strerror(errno));
+                               return -1;
+                       }
+                       /* take out a read lock and keep it indefinitely */
+                       if (lock(fd, LOCK_SH) < 0) {
+                               RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n",
+                                       __func__, strerror(errno));
+                               close(fd);
+                               return -1;
+                       }
+                       fd_list[list_idx].memseg_list_fd = fd;
+               }
+       } else {
+               /* create a hugepage file path */
+               eal_get_hugefile_path(path, buflen, hi->hugedir,
+                               list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
+
+               fd = fd_list[list_idx].fds[seg_idx];
+
+               if (fd < 0) {
+                       fd = open(path, O_CREAT | O_RDWR, 0600);
+                       if (fd < 0) {
+                               RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n",
+                                       __func__, strerror(errno));
+                               return -1;
+                       }
+                       /* take out a read lock */
+                       if (lock(fd, LOCK_SH) < 0) {
+                               RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n",
+                                       __func__, strerror(errno));
+                               close(fd);
+                               return -1;
+                       }
+                       fd_list[list_idx].fds[seg_idx] = fd;
+               }
+       }
+       return fd;
+}
+
+static int
+resize_hugefile(int fd, char *path, int list_idx, int seg_idx,
+               uint64_t fa_offset, uint64_t page_sz, bool grow)
+{
+       bool again = false;
+
+       /* in-memory mode is a special case, because we don't need to perform
+        * any locking, and we can be sure that fallocate() is supported.
+        */
+       if (internal_config.in_memory) {
+               int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
+                               FALLOC_FL_KEEP_SIZE;
+               int ret;
+
+               /* grow or shrink the file */
+               ret = fallocate(fd, flags, fa_offset, page_sz);
+
+               if (ret < 0) {
+                       RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
+                                       __func__,
+                                       strerror(errno));
+                       return -1;
+               }
+               /* increase/decrease total segment count */
+               fd_list[list_idx].count += (grow ? 1 : -1);
+               if (!grow && fd_list[list_idx].count == 0) {
+                       close(fd_list[list_idx].memseg_list_fd);
+                       fd_list[list_idx].memseg_list_fd = -1;
+               }
+               return 0;
+       }
+
+       do {
+               if (fallocate_supported == 0) {
+                       /* we cannot deallocate memory if fallocate() is not
+                        * supported, and hugepage file is already locked at
+                        * creation, so no further synchronization needed.
+                        */
+
+                       if (!grow) {
+                               RTE_LOG(DEBUG, EAL, "%s(): fallocate not supported, not freeing page back to the system\n",
+                                       __func__);
+                               return -1;
+                       }
+                       uint64_t new_size = fa_offset + page_sz;
+                       uint64_t cur_size = get_file_size(fd);
+
+                       /* fallocate isn't supported, fall back to ftruncate */
+                       if (new_size > cur_size &&
+                                       ftruncate(fd, new_size) < 0) {
+                               RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
+                                       __func__, strerror(errno));
+                               return -1;
+                       }
+               } else {
+                       int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
+                                       FALLOC_FL_KEEP_SIZE;
+                       int ret, lock_fd;
+
+                       /* if fallocate() is supported, we need to take out a
+                        * read lock on allocate (to prevent other processes
+                        * from deallocating this page), and take out a write
+                        * lock on deallocate (to ensure nobody else is using
+                        * this page).
+                        *
+                        * read locks on page itself are already taken out at
+                        * file creation, in get_seg_fd().
+                        *
+                        * we cannot rely on simple use of flock() call, because
+                        * we need to be able to lock a section of the file,
+                        * and we cannot use fcntl() locks, because of numerous
+                        * problems with their semantics, so we will use
+                        * deterministically named lock files for each section
+                        * of the file.
+                        *
+                        * if we're shrinking the file, we want to upgrade our
+                        * lock from shared to exclusive.
+                        *
+                        * lock_fd is an fd for a lockfile, not for the segment
+                        * list.
+                        */
+                       lock_fd = get_segment_lock_fd(list_idx, seg_idx);
+
+                       if (!grow) {
+                               /* we are using this lockfile to determine
+                                * whether this particular page is locked, as we
+                                * are in single file segments mode and thus
+                                * cannot use regular flock() to get this info.
+                                *
+                                * we want to try and take out an exclusive lock
+                                * on the lock file to determine if we're the
+                                * last ones using this page, and if not, we
+                                * won't be shrinking it, and will instead exit
+                                * prematurely.
+                                */
+                               ret = lock(lock_fd, LOCK_EX);
+
+                               /* drop the lock on the lockfile, so that even
+                                * if we couldn't shrink the file ourselves, we
+                                * are signalling to other processes that we're
+                                * no longer using this page.
+                                */
+                               if (unlock_segment(list_idx, seg_idx))
+                                       RTE_LOG(ERR, EAL, "Could not unlock segment\n");
+
+                               /* additionally, if this was the last lock on
+                                * this segment list, we can safely close the
+                                * page file fd, so that one of the processes
+                                * could then delete the file after shrinking.
+                                */
+                               if (ret < 1 && fd_list[list_idx].count == 0) {
+                                       close(fd);
+                                       fd_list[list_idx].memseg_list_fd = -1;
+                               }
+
+                               if (ret < 0) {
+                                       RTE_LOG(ERR, EAL, "Could not lock segment\n");
+                                       return -1;
+                               }
+                               if (ret == 0)
+                                       /* failed to lock, not an error. */
+                                       return 0;
+                       }
+
+                       /* grow or shrink the file */
+                       ret = fallocate(fd, flags, fa_offset, page_sz);
+
+                       if (ret < 0) {
+                               if (fallocate_supported == -1 &&
+                                               errno == ENOTSUP) {
+                                       RTE_LOG(ERR, EAL, "%s(): fallocate() not supported, hugepage deallocation will be disabled\n",
+                                               __func__);
+                                       again = true;
+                                       fallocate_supported = 0;
+                               } else {
+                                       RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
+                                               __func__,
+                                               strerror(errno));
+                                       return -1;
+                               }
+                       } else {
+                               fallocate_supported = 1;
+
+                               /* we've grown/shrunk the file, and we hold an
+                                * exclusive lock now. check if there are no
+                                * more segments active in this segment list,
+                                * and remove the file if there aren't.
+                                */
+                               if (fd_list[list_idx].count == 0) {
+                                       if (unlink(path))
+                                               RTE_LOG(ERR, EAL, "%s(): unlinking '%s' failed: %s\n",
+                                                       __func__, path,
+                                                       strerror(errno));
+                                       close(fd);
+                                       fd_list[list_idx].memseg_list_fd = -1;
+                               }
+                       }
+               }
+       } while (again);
+       return 0;
+}
+
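+/* allocate a single hugepage-backed segment at the requested address and fill
+ * in the memseg metadata. returns 0 on success and -1 on failure, taking care
+ * to re-reserve the virtual area if the mapping was lost.
+ */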
+static int
+alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
+               struct hugepage_info *hi, unsigned int list_idx,
+               unsigned int seg_idx)
+{
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+       int cur_socket_id = 0;
+#endif
+       uint64_t map_offset;
+       rte_iova_t iova;
+       void *va;
+       char path[PATH_MAX];
+       int ret = 0;
+       int fd;
+       size_t alloc_sz;
+       int flags;
+       void *new_addr;
+
+       alloc_sz = hi->hugepage_sz;
+
+       /* these are checked at init, but code analyzers don't know that */
+       if (internal_config.in_memory && !anonymous_hugepages_supported) {
+               RTE_LOG(ERR, EAL, "Anonymous hugepages not supported, in-memory mode cannot allocate memory\n");
+               return -1;
+       }
+       if (internal_config.in_memory && !memfd_create_supported &&
+                       internal_config.single_file_segments) {
+               RTE_LOG(ERR, EAL, "Single-file segments are not supported without memfd support\n");
+               return -1;
+       }
+
+       /* in-memory without memfd is a special case */
+       int mmap_flags;
+
+       if (internal_config.in_memory && !memfd_create_supported) {
+               const int in_memory_flags = MAP_HUGETLB | MAP_FIXED |
+                               MAP_PRIVATE | MAP_ANONYMOUS;
+               int pagesz_flag;
+
+               pagesz_flag = pagesz_flags(alloc_sz);
+               fd = -1;
+               mmap_flags = in_memory_flags | pagesz_flag;
+
+               /* single-file segments codepath will never be active
+                * here because in-memory mode is incompatible with the
+                * fallback path, and it's stopped at EAL initialization
+                * stage.
+                */
+               map_offset = 0;
+       } else {
+               /* takes out a read lock on segment or segment list */
+               fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
+               if (fd < 0) {
+                       RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n");
+                       return -1;
+               }
+
+               if (internal_config.single_file_segments) {
+                       map_offset = seg_idx * alloc_sz;
+                       ret = resize_hugefile(fd, path, list_idx, seg_idx,
+                                       map_offset, alloc_sz, true);
+                       if (ret < 0)
+                               goto resized;
+               } else {
+                       map_offset = 0;
+                       if (ftruncate(fd, alloc_sz) < 0) {
+                               RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
+                                       __func__, strerror(errno));
+                               goto resized;
+                       }
+                       if (internal_config.hugepage_unlink &&
+                                       !internal_config.in_memory) {
+                               if (unlink(path)) {
+                                       RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n",
+                                               __func__, strerror(errno));
+                                       goto resized;
+                               }
+                       }
+               }
+               mmap_flags = MAP_SHARED | MAP_POPULATE | MAP_FIXED;
+       }
+
+       /*
+        * map the segment and populate page tables; the kernel fills
+        * this segment with zeros if it's a new page.
+        */
+       va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, mmap_flags, fd,
+                       map_offset);
+
+       if (va == MAP_FAILED) {
+               RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__,
+                       strerror(errno));
+               /* mmap failed, but the previous region might have been
+                * unmapped anyway. try to remap it
+                */
+               goto unmapped;
+       }
+       if (va != addr) {
+               RTE_LOG(DEBUG, EAL, "%s(): wrong mmap() address\n", __func__);
+               munmap(va, alloc_sz);
+               goto resized;
+       }
+
+       /* In Linux, hugetlb limitations, like cgroup, are
+        * enforced at fault time instead of mmap(), even
+        * with the option of MAP_POPULATE. The kernel will
+        * send a SIGBUS signal. To avoid being killed, save
+        * the stack environment here; if SIGBUS happens, we
+        * can jump back here.
+        */
+       if (huge_wrap_sigsetjmp()) {
+               RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more hugepages of size %uMB\n",
+                       (unsigned int)(alloc_sz >> 20));
+               goto mapped;
+       }
+
+       /* we need to trigger a write to the page to enforce the page fault
+        * and ensure that the page is accessible to us, but we can't overwrite
+        * the value that is already there, so read the old value and write it
+        * back. the kernel populates the page with zeroes initially.
+        */
+       *(volatile int *)addr = *(volatile int *)addr;
+
+       iova = rte_mem_virt2iova(addr);
+       if (iova == RTE_BAD_PHYS_ADDR) {
+               RTE_LOG(DEBUG, EAL, "%s(): can't get IOVA addr\n",
+                       __func__);
+               goto mapped;
+       }
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+       move_pages(getpid(), 1, &addr, NULL, &cur_socket_id, 0);
+
+       if (cur_socket_id != socket_id) {
+               RTE_LOG(DEBUG, EAL,
+                               "%s(): allocation happened on wrong socket (wanted %d, got %d)\n",
+                       __func__, socket_id, cur_socket_id);
+               goto mapped;
+       }
+#endif
+
+       ms->addr = addr;
+       ms->hugepage_sz = alloc_sz;
+       ms->len = alloc_sz;
+       ms->nchannel = rte_memory_get_nchannel();
+       ms->nrank = rte_memory_get_nrank();
+       ms->iova = iova;
+       ms->socket_id = socket_id;
+
+       return 0;
+
+mapped:
+       munmap(addr, alloc_sz);
+unmapped:
+       flags = MAP_FIXED;
+       new_addr = eal_get_virtual_area(addr, &alloc_sz, alloc_sz, 0, flags);
+       if (new_addr != addr) {
+               if (new_addr != NULL)
+                       munmap(new_addr, alloc_sz);
+               /* we're leaving a hole in our virtual address space. if
+                * somebody else maps this hole now, we could accidentally
+                * overwrite it in the future.
+                */
+               RTE_LOG(CRIT, EAL, "Can't mmap holes in our virtual address space\n");
+       }
+resized:
+       /* some codepaths will return negative fd, so exit early */
+       if (fd < 0)
+               return -1;
+
+       if (internal_config.single_file_segments) {
+               resize_hugefile(fd, path, list_idx, seg_idx, map_offset,
+                               alloc_sz, false);
+               /* ignore failure, can't make it any worse */
+       } else {
+               /* only remove file if we can take out a write lock */
+               if (internal_config.hugepage_unlink == 0 &&
+                               internal_config.in_memory == 0 &&
+                               lock(fd, LOCK_EX) == 1)
+                       unlink(path);
+               close(fd);
+               fd_list[list_idx].fds[seg_idx] = -1;
+       }
+       return -1;
+}
+
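+/* release a single segment: zero the page, unmap it by overmapping it with
+ * anonymous memory, and then shrink or unlink the backing hugepage file,
+ * unless there is no backing file left to clean up (in-memory mode, or the
+ * file was already unlinked).
+ */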
+static int
+free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
+               unsigned int list_idx, unsigned int seg_idx)
+{
+       uint64_t map_offset;
+       char path[PATH_MAX];
+       int fd, ret = 0;
+       bool exit_early;
+
+       /* erase page data */
+       memset(ms->addr, 0, ms->len);
+
+       if (mmap(ms->addr, ms->len, PROT_READ,
+                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) ==
+                               MAP_FAILED) {
+               RTE_LOG(DEBUG, EAL, "couldn't unmap page\n");
+               return -1;
+       }
+
+       exit_early = false;
+
+       /* if we're using anonymous hugepages, nothing to be done */
+       if (internal_config.in_memory && !memfd_create_supported)
+               exit_early = true;
+
+       /* if we've already unlinked the page, nothing needs to be done */
+       if (!internal_config.in_memory && internal_config.hugepage_unlink)
+               exit_early = true;
+
+       if (exit_early) {
+               memset(ms, 0, sizeof(*ms));
+               return 0;
+       }
+
+       /* if we are not in single file segments mode, we're going to unmap the
+        * segment and thus drop the lock on the original fd, but the hugepage
+        * dir is now locked, so we can take out another one without races.
+        */
+       fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
+       if (fd < 0)
+               return -1;
+
+       if (internal_config.single_file_segments) {
+               map_offset = seg_idx * ms->len;
+               if (resize_hugefile(fd, path, list_idx, seg_idx, map_offset,
+                               ms->len, false))
+                       return -1;
+               ret = 0;
+       } else {
+               /* if we're able to take out a write lock, we're the last one
+                * holding onto this page.
+                */
+               if (!internal_config.in_memory) {
+                       ret = lock(fd, LOCK_EX);
+                       if (ret >= 0) {
+                               /* no one else is using this page */
+                               if (ret == 1)
+                                       unlink(path);
+                       }
+               }
+               /* closing fd will drop the lock */
+               close(fd);
+               fd_list[list_idx].fds[seg_idx] = -1;
+       }
+
+       memset(ms, 0, sizeof(*ms));
+
+       return ret < 0 ? -1 : 0;
+}
+
+struct alloc_walk_param {
+       struct hugepage_info *hi;
+       struct rte_memseg **ms;
+       size_t page_sz;
+       unsigned int segs_allocated;
+       unsigned int n_segs;
+       int socket;
+       bool exact;
+};
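+/* memseg list walk callback: try to allocate wa->n_segs pages from a list
+ * matching the requested page size and socket. returns 1 when this list was
+ * used (wa->segs_allocated is updated), 0 to keep walking, and -1 on error.
+ */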
+static int
+alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
+{
+       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+       struct alloc_walk_param *wa = arg;
+       struct rte_memseg_list *cur_msl;
+       size_t page_sz;
+       int cur_idx, start_idx, j, dir_fd = -1;
+       unsigned int msl_idx, need, i;
+
+       if (msl->page_sz != wa->page_sz)
+               return 0;
+       if (msl->socket_id != wa->socket)
+               return 0;
+
+       page_sz = (size_t)msl->page_sz;
+
+       msl_idx = msl - mcfg->memsegs;
+       cur_msl = &mcfg->memsegs[msl_idx];
+
+       need = wa->n_segs;
+
+       /* try finding space in memseg list */
+       cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0, need);
+       if (cur_idx < 0)
+               return 0;
+       start_idx = cur_idx;
+
+       /* do not allow any page allocations during the time we're allocating,
+        * because file creation and locking operations are not atomic,
+        * and we might be the first or the last ones to use a particular page,
+        * so we need to ensure atomicity of every operation.
+        *
+        * during init, we already hold a write lock, so don't try to take out
+        * another one.
+        */
+       if (wa->hi->lock_descriptor == -1 && !internal_config.in_memory) {
+               dir_fd = open(wa->hi->hugedir, O_RDONLY);
+               if (dir_fd < 0) {
+                       RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
+                               __func__, wa->hi->hugedir, strerror(errno));
+                       return -1;
+               }
+               /* blocking writelock */
+               if (flock(dir_fd, LOCK_EX)) {
+                       RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n",
+                               __func__, wa->hi->hugedir, strerror(errno));
+                       close(dir_fd);
+                       return -1;
+               }
+       }
+
+       for (i = 0; i < need; i++, cur_idx++) {
+               struct rte_memseg *cur;
+               void *map_addr;
+
+               cur = rte_fbarray_get(&cur_msl->memseg_arr, cur_idx);
+               map_addr = RTE_PTR_ADD(cur_msl->base_va,
+                               cur_idx * page_sz);
+
+               if (alloc_seg(cur, map_addr, wa->socket, wa->hi,
+                               msl_idx, cur_idx)) {
+                       RTE_LOG(DEBUG, EAL, "attempted to allocate %i segments, but only %i were allocated\n",
+                               need, i);
+
+                       /* if exact number wasn't requested, stop */
+                       if (!wa->exact)
+                               goto out;
+
+                       /* clean up */
+                       for (j = start_idx; j < cur_idx; j++) {
+                               struct rte_memseg *tmp;
+                               struct rte_fbarray *arr =
+                                               &cur_msl->memseg_arr;
+
+                               tmp = rte_fbarray_get(arr, j);
+                               rte_fbarray_set_free(arr, j);
+
+                               /* free_seg may attempt to create a file, which
+                                * may fail.
+                                */
+                               if (free_seg(tmp, wa->hi, msl_idx, j))
+                                       RTE_LOG(DEBUG, EAL, "Cannot free page\n");
+                       }
+                       /* clear the list */
+                       if (wa->ms)
+                               memset(wa->ms, 0, sizeof(*wa->ms) * wa->n_segs);
+
+                       if (dir_fd >= 0)
+                               close(dir_fd);
+                       return -1;
+               }
+               if (wa->ms)
+                       wa->ms[i] = cur;
+
+               rte_fbarray_set_used(&cur_msl->memseg_arr, cur_idx);
+       }
+out:
+       wa->segs_allocated = i;
+       if (i > 0)
+               cur_msl->version++;
+       if (dir_fd >= 0)
+               close(dir_fd);
+       return 1;
+}
+
+struct free_walk_param {
+       struct hugepage_info *hi;
+       struct rte_memseg *ms;
+};
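+/* memseg list walk callback: find the list containing wa->ms, mark the
+ * segment as free in that list and release it. returns 1 if the segment was
+ * found, 0 to keep walking, -1 on error.
+ */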
+static int
+free_seg_walk(const struct rte_memseg_list *msl, void *arg)
+{
+       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+       struct rte_memseg_list *found_msl;
+       struct free_walk_param *wa = arg;
+       uintptr_t start_addr, end_addr;
+       int msl_idx, seg_idx, ret, dir_fd = -1;
+
+       start_addr = (uintptr_t) msl->base_va;
+       end_addr = start_addr + msl->len;
+
+       if ((uintptr_t)wa->ms->addr < start_addr ||
+                       (uintptr_t)wa->ms->addr >= end_addr)
+               return 0;
+
+       msl_idx = msl - mcfg->memsegs;
+       seg_idx = RTE_PTR_DIFF(wa->ms->addr, start_addr) / msl->page_sz;
+
+       /* msl is const */
+       found_msl = &mcfg->memsegs[msl_idx];
+
+       /* do not allow any page allocations during the time we're freeing,
+        * because file creation and locking operations are not atomic,
+        * and we might be the first or the last ones to use a particular page,
+        * so we need to ensure atomicity of every operation.
+        *
+        * during init, we already hold a write lock, so don't try to take out
+        * another one.
+        */
+       if (wa->hi->lock_descriptor == -1 && !internal_config.in_memory) {
+               dir_fd = open(wa->hi->hugedir, O_RDONLY);
+               if (dir_fd < 0) {
+                       RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
+                               __func__, wa->hi->hugedir, strerror(errno));
+                       return -1;
+               }
+               /* blocking writelock */
+               if (flock(dir_fd, LOCK_EX)) {
+                       RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n",
+                               __func__, wa->hi->hugedir, strerror(errno));
+                       close(dir_fd);
+                       return -1;
+               }
+       }
+
+       found_msl->version++;
+
+       rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx);
+
+       ret = free_seg(wa->ms, wa->hi, msl_idx, seg_idx);
+
+       if (dir_fd >= 0)
+               close(dir_fd);
+
+       if (ret < 0)
+               return -1;
+
+       return 1;
+}
+
+int
+eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz,
+               int socket, bool exact)
+{
+       int i, ret = -1;
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+       bool have_numa = false;
+       int oldpolicy;
+       struct bitmask *oldmask;
+#endif
+       struct alloc_walk_param wa;
+       struct hugepage_info *hi = NULL;
+
+       memset(&wa, 0, sizeof(wa));
+
+       /* dynamic allocation not supported in legacy mode */
+       if (internal_config.legacy_mem)
+               return -1;
+
+       for (i = 0; i < (int) RTE_DIM(internal_config.hugepage_info); i++) {
+               if (page_sz ==
+                               internal_config.hugepage_info[i].hugepage_sz) {
+                       hi = &internal_config.hugepage_info[i];
+                       break;
+               }
+       }
+       if (!hi) {
+               RTE_LOG(ERR, EAL, "%s(): can't find relevant hugepage_info entry\n",
+                       __func__);
+               return -1;
+       }
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+       if (check_numa()) {
+               oldmask = numa_allocate_nodemask();
+               prepare_numa(&oldpolicy, oldmask, socket);
+               have_numa = true;
+       }
+#endif
+
+       wa.exact = exact;
+       wa.hi = hi;
+       wa.ms = ms;
+       wa.n_segs = n_segs;
+       wa.page_sz = page_sz;
+       wa.socket = socket;
+       wa.segs_allocated = 0;
+
+       /* memalloc is locked, so it's safe to use thread-unsafe version */
+       ret = rte_memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa);
+       if (ret == 0) {
+               RTE_LOG(ERR, EAL, "%s(): couldn't find suitable memseg_list\n",
+                       __func__);
+               ret = -1;
+       } else if (ret > 0) {
+               ret = (int)wa.segs_allocated;
+       }
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+       if (have_numa)
+               restore_numa(&oldpolicy, oldmask);
+#endif
+       return ret;
+}
+
+struct rte_memseg *
+eal_memalloc_alloc_seg(size_t page_sz, int socket)
+{
+       struct rte_memseg *ms;
+       if (eal_memalloc_alloc_seg_bulk(&ms, 1, page_sz, socket, true) < 0)
+               return NULL;
+       /* return pointer to newly allocated memseg */
+       return ms;
+}
+
+int
+eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs)
+{
+       int seg, ret = 0;
+
+       /* dynamic free not supported in legacy mode */
+       if (internal_config.legacy_mem)
+               return -1;
+
+       for (seg = 0; seg < n_segs; seg++) {
+               struct rte_memseg *cur = ms[seg];
+               struct hugepage_info *hi = NULL;
+               struct free_walk_param wa;
+               int i, walk_res;
+
+               /* if this page is marked as unfreeable, fail */
+               if (cur->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) {
+                       RTE_LOG(DEBUG, EAL, "Page is not allowed to be freed\n");
+                       ret = -1;
+                       continue;
+               }
+
+               memset(&wa, 0, sizeof(wa));
+
+               for (i = 0; i < (int)RTE_DIM(internal_config.hugepage_info);
+                               i++) {
+                       hi = &internal_config.hugepage_info[i];
+                       if (cur->hugepage_sz == hi->hugepage_sz)
+                               break;
+               }
+               if (i == (int)RTE_DIM(internal_config.hugepage_info)) {
+                       RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
+                       ret = -1;
+                       continue;
+               }
+
+               wa.ms = cur;
+               wa.hi = hi;
+
+               /* memalloc is locked, so it's safe to use thread-unsafe version
+                */
+               walk_res = rte_memseg_list_walk_thread_unsafe(free_seg_walk,
+                               &wa);
+               if (walk_res == 1)
+                       continue;
+               if (walk_res == 0)
+                       RTE_LOG(ERR, EAL, "Couldn't find memseg list\n");
+               ret = -1;
+       }
+       return ret;
+}
+
+int
+eal_memalloc_free_seg(struct rte_memseg *ms)
+{
+       /* dynamic free not supported in legacy mode */
+       if (internal_config.legacy_mem)
+               return -1;
+
+       return eal_memalloc_free_seg_bulk(&ms, 1);
+}
+
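+/* replay a contiguous run of allocations (used == true) or deallocations
+ * (used == false) from the primary's memseg list into the local copy, firing
+ * the appropriate memory event callbacks. returns how many segments the
+ * caller can advance by, or -1 on error.
+ */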
+static int
+sync_chunk(struct rte_memseg_list *primary_msl,
+               struct rte_memseg_list *local_msl, struct hugepage_info *hi,
+               unsigned int msl_idx, bool used, int start, int end)
+{
+       struct rte_fbarray *l_arr, *p_arr;
+       int i, ret, chunk_len, diff_len;
+
+       l_arr = &local_msl->memseg_arr;
+       p_arr = &primary_msl->memseg_arr;
+
+       /* we need to aggregate allocations/deallocations into bigger chunks,
+        * as we don't want to spam the user with per-page callbacks.
+        *
+        * to avoid any potential issues, we also want to trigger
+        * deallocation callbacks *before* we actually deallocate
+        * memory, so that the user application could wrap up its use
+        * before it goes away.
+        */
+
+       chunk_len = end - start;
+
+       /* find how many contiguous pages we can map/unmap for this chunk */
+       diff_len = used ?
+                       rte_fbarray_find_contig_free(l_arr, start) :
+                       rte_fbarray_find_contig_used(l_arr, start);
+
+       /* has to be at least one page */
+       if (diff_len < 1)
+               return -1;
+
+       diff_len = RTE_MIN(chunk_len, diff_len);
+
+       /* if we are freeing memory, notify the application */
+       if (!used) {
+               struct rte_memseg *ms;
+               void *start_va;
+               size_t len, page_sz;
+
+               ms = rte_fbarray_get(l_arr, start);
+               start_va = ms->addr;
+               page_sz = (size_t)primary_msl->page_sz;
+               len = page_sz * diff_len;
+
+               eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE,
+                               start_va, len);
+       }
+
+       for (i = 0; i < diff_len; i++) {
+               struct rte_memseg *p_ms, *l_ms;
+               int seg_idx = start + i;
+
+               l_ms = rte_fbarray_get(l_arr, seg_idx);
+               p_ms = rte_fbarray_get(p_arr, seg_idx);
+
+               if (l_ms == NULL || p_ms == NULL)
+                       return -1;
+
+               if (used) {
+                       ret = alloc_seg(l_ms, p_ms->addr,
+                                       p_ms->socket_id, hi,
+                                       msl_idx, seg_idx);
+                       if (ret < 0)
+                               return -1;
+                       rte_fbarray_set_used(l_arr, seg_idx);
+               } else {
+                       ret = free_seg(l_ms, hi, msl_idx, seg_idx);
+                       rte_fbarray_set_free(l_arr, seg_idx);
+                       if (ret < 0)
+                               return -1;
+               }
+       }
+
+       /* if we just allocated memory, notify the application */
+       if (used) {
+               struct rte_memseg *ms;
+               void *start_va;
+               size_t len, page_sz;
+
+               ms = rte_fbarray_get(l_arr, start);
+               start_va = ms->addr;
+               page_sz = (size_t)primary_msl->page_sz;
+               len = page_sz * diff_len;
+
+               eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC,
+                               start_va, len);
+       }
+
+       /* calculate how much we can advance until next chunk */
+       diff_len = used ?
+                       rte_fbarray_find_contig_used(l_arr, start) :
+                       rte_fbarray_find_contig_free(l_arr, start);
+       ret = RTE_MIN(chunk_len, diff_len);
+
+       return ret;
+}
+
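+/* bring the local memseg list in sync with the primary for either allocated
+ * (used == true) or free (used == false) segments.
+ */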
+static int
+sync_status(struct rte_memseg_list *primary_msl,
+               struct rte_memseg_list *local_msl, struct hugepage_info *hi,
+               unsigned int msl_idx, bool used)
+{
+       struct rte_fbarray *l_arr, *p_arr;
+       int p_idx, l_chunk_len, p_chunk_len, ret;
+       int start, end;
+
+       /* this is a little bit tricky, but the basic idea is - walk both lists
+        * and spot any places where there are discrepancies. walking both lists
+        * and noting discrepancies in a single go is a hard problem, so we do
+        * it in two passes - first we spot any places where allocated segments
+        * mismatch (i.e. ensure that everything that's allocated in the primary
+        * is also allocated in the secondary), and then we do it by looking at
+        * free segments instead.
+        *
+        * we also need to aggregate changes into chunks, as we have to call
+        * callbacks per allocation, not per page.
+        */
+       l_arr = &local_msl->memseg_arr;
+       p_arr = &primary_msl->memseg_arr;
+
+       if (used)
+               p_idx = rte_fbarray_find_next_used(p_arr, 0);
+       else
+               p_idx = rte_fbarray_find_next_free(p_arr, 0);
+
+       while (p_idx >= 0) {
+               int next_chunk_search_idx;
+
+               if (used) {
+                       p_chunk_len = rte_fbarray_find_contig_used(p_arr,
+                                       p_idx);
+                       l_chunk_len = rte_fbarray_find_contig_used(l_arr,
+                                       p_idx);
+               } else {
+                       p_chunk_len = rte_fbarray_find_contig_free(p_arr,
+                                       p_idx);
+                       l_chunk_len = rte_fbarray_find_contig_free(l_arr,
+                                       p_idx);
+               }
+               /* best case scenario - no differences (or the local chunk is
+                * bigger, which will be fixed during the next iteration), so
+                * look for the next chunk
+                */
+               if (l_chunk_len >= p_chunk_len) {
+                       next_chunk_search_idx = p_idx + p_chunk_len;
+                       goto next_chunk;
+               }
+
+               /* if both chunks start at the same point, skip parts we know
+                * are identical, and sync the rest. each call to sync_chunk
+                * will only sync contiguous segments, so we need to call this
+                * until we are sure there are no more differences in this
+                * chunk.
+                */
+               start = p_idx + l_chunk_len;
+               end = p_idx + p_chunk_len;
+               do {
+                       ret = sync_chunk(primary_msl, local_msl, hi, msl_idx,
+                                       used, start, end);
+                       start += ret;
+               } while (start < end && ret >= 0);
+               /* if ret is negative, something went wrong */
+               if (ret < 0)
+                       return -1;
+
+               next_chunk_search_idx = p_idx + p_chunk_len;
+next_chunk:
+               /* skip to end of this chunk */
+               if (used) {
+                       p_idx = rte_fbarray_find_next_used(p_arr,
+                                       next_chunk_search_idx);
+               } else {
+                       p_idx = rte_fbarray_find_next_free(p_arr,
+                                       next_chunk_search_idx);
+               }
+       }
+       return 0;
+}
+
+static int
+sync_existing(struct rte_memseg_list *primary_msl,
+               struct rte_memseg_list *local_msl, struct hugepage_info *hi,
+               unsigned int msl_idx)
+{
+       int ret, dir_fd;
+
+       /* do not allow any page allocations during the time we're allocating,
+        * because file creation and locking operations are not atomic,
+        * and we might be the first or the last ones to use a particular page,
+        * so we need to ensure atomicity of every operation.
+        */
+       dir_fd = open(hi->hugedir, O_RDONLY);
+       if (dir_fd < 0) {
+               RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", __func__,
+                       hi->hugedir, strerror(errno));
+               return -1;
+       }
+       /* blocking writelock */
+       if (flock(dir_fd, LOCK_EX)) {
+               RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", __func__,
+                       hi->hugedir, strerror(errno));
+               close(dir_fd);
+               return -1;
+       }
+
+       /* ensure all allocated space is the same in both lists */
+       ret = sync_status(primary_msl, local_msl, hi, msl_idx, true);
+       if (ret < 0)
+               goto fail;
+
+       /* ensure all unallocated space is the same in both lists */
+       ret = sync_status(primary_msl, local_msl, hi, msl_idx, false);
+       if (ret < 0)
+               goto fail;
+
+       /* update version number */
+       local_msl->version = primary_msl->version;
+
+       close(dir_fd);
+
+       return 0;
+fail:
+       close(dir_fd);
+       return -1;
+}
+
+static int
+sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused)
+{
+       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+       struct rte_memseg_list *primary_msl, *local_msl;
+       struct hugepage_info *hi = NULL;
+       unsigned int i;
+       int msl_idx;
+
+       if (msl->external)
+               return 0;
+
+       msl_idx = msl - mcfg->memsegs;
+       primary_msl = &mcfg->memsegs[msl_idx];
+       local_msl = &local_memsegs[msl_idx];
+
+       for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) {
+               uint64_t cur_sz =
+                       internal_config.hugepage_info[i].hugepage_sz;
+               uint64_t msl_sz = primary_msl->page_sz;
+               if (msl_sz == cur_sz) {
+                       hi = &internal_config.hugepage_info[i];
+                       break;
+               }
+       }
+       if (!hi) {
+               RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
+               return -1;
+       }
+
+       /* if versions don't match, synchronize everything */
+       if (local_msl->version != primary_msl->version &&
+                       sync_existing(primary_msl, local_msl, hi, msl_idx))
+               return -1;
+       return 0;
+}
+
+
+int
+eal_memalloc_sync_with_primary(void)
+{
+       /* nothing to be done in primary */
+       if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+               return 0;
+
+       /* memalloc is locked, so it's safe to call thread-unsafe version */
+       if (rte_memseg_list_walk_thread_unsafe(sync_walk, NULL))
+               return -1;
+       return 0;
+}
+
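+/* memseg list walk callback: create a per-process copy of the primary's
+ * fbarray for this memseg list, so that the secondary can track its own
+ * mappings.
+ */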
+static int
+secondary_msl_create_walk(const struct rte_memseg_list *msl,
+               void *arg __rte_unused)
+{
+       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+       struct rte_memseg_list *primary_msl, *local_msl;
+       char name[PATH_MAX];
+       int msl_idx, ret;
+
+       if (msl->external)
+               return 0;
+
+       msl_idx = msl - mcfg->memsegs;
+       primary_msl = &mcfg->memsegs[msl_idx];
+       local_msl = &local_memsegs[msl_idx];
+
+       /* create distinct fbarrays for each secondary */
+       snprintf(name, RTE_FBARRAY_NAME_LEN, "%s_%i",
+               primary_msl->memseg_arr.name, getpid());
+
+       ret = rte_fbarray_init(&local_msl->memseg_arr, name,
+               primary_msl->memseg_arr.len,
+               primary_msl->memseg_arr.elt_sz);
+       if (ret < 0) {
+               RTE_LOG(ERR, EAL, "Cannot initialize local memory map\n");
+               return -1;
+       }
+       local_msl->base_va = primary_msl->base_va;
+       local_msl->len = primary_msl->len;
+
+       return 0;
+}
+
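+/* allocate and initialize the per-segment fd array for a given segment list */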
+static int
+alloc_list(int list_idx, int len)
+{
+       int *data;
+       int i;
+
+       /* ensure we have space to store an fd for each possible segment */
+       data = malloc(sizeof(int) * len);
+       if (data == NULL) {
+               RTE_LOG(ERR, EAL, "Unable to allocate space for file descriptors\n");
+               return -1;
+       }
+       /* set all fd's as invalid */
+       for (i = 0; i < len; i++)
+               data[i] = -1;
+
+       fd_list[list_idx].fds = data;
+       fd_list[list_idx].len = len;
+       fd_list[list_idx].count = 0;
+       fd_list[list_idx].memseg_list_fd = -1;
+
+       return 0;
+}
+
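+/* memseg list walk callback: set up an fd list for each non-external memseg
+ * list.
+ */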
+static int
+fd_list_create_walk(const struct rte_memseg_list *msl,
+               void *arg __rte_unused)
+{
+       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+       unsigned int len;
+       int msl_idx;
+
+       if (msl->external)
+               return 0;
+
+       msl_idx = msl - mcfg->memsegs;
+       len = msl->memseg_arr.len;
+
+       return alloc_list(msl_idx, len);
+}
+
+int
+eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd)
+{
+       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+
+       /* single file segments mode doesn't support individual segment fd's */
+       if (internal_config.single_file_segments)
+               return -ENOTSUP;
+
+       /* if list is not allocated, allocate it */
+       if (fd_list[list_idx].len == 0) {
+               int len = mcfg->memsegs[list_idx].memseg_arr.len;
+
+               if (alloc_list(list_idx, len) < 0)
+                       return -ENOMEM;
+       }
+       fd_list[list_idx].fds[seg_idx] = fd;
+
+       return 0;
+}
+
+int
+eal_memalloc_set_seg_list_fd(int list_idx, int fd)
+{
+       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+
+       /* non-single file segment mode doesn't support segment list fd's */
+       if (!internal_config.single_file_segments)
+               return -ENOTSUP;
+
+       /* if list is not allocated, allocate it */
+       if (fd_list[list_idx].len == 0) {
+               int len = mcfg->memsegs[list_idx].memseg_arr.len;
+
+               if (alloc_list(list_idx, len) < 0)
+                       return -ENOMEM;
+       }
+
+       fd_list[list_idx].memseg_list_fd = fd;
+
+       return 0;
+}
+
+int
+eal_memalloc_get_seg_fd(int list_idx, int seg_idx)
+{
+       int fd;
+
+       if (internal_config.in_memory || internal_config.no_hugetlbfs) {
+#ifndef MEMFD_SUPPORTED
+               /* in in-memory or no-huge mode, we rely on memfd support */
+               return -ENOTSUP;
+#endif
+               /* memfd supported, but hugetlbfs memfd may not be */
+               if (!internal_config.no_hugetlbfs && !memfd_create_supported)
+                       return -ENOTSUP;
+       }
+
+       if (internal_config.single_file_segments) {
+               fd = fd_list[list_idx].memseg_list_fd;
+       } else if (fd_list[list_idx].len == 0) {
+               /* list not initialized */
+               fd = -1;
+       } else {
+               fd = fd_list[list_idx].fds[seg_idx];
+       }
+       if (fd < 0)
+               return -ENODEV;
+       return fd;
+}
+
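+/* check whether memfd_create() works with hugetlbfs page size flags. returns
+ * 1 if supported, 0 if not, and -1 on unexpected error.
+ */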
+static int
+test_memfd_create(void)
+{
+#ifdef MEMFD_SUPPORTED
+       unsigned int i;
+       for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
+               uint64_t pagesz = internal_config.hugepage_info[i].hugepage_sz;
+               int pagesz_flag = pagesz_flags(pagesz);
+               int flags;
+
+               flags = pagesz_flag | RTE_MFD_HUGETLB;
+               int fd = memfd_create("test", flags);
+               if (fd < 0) {
+                       /* we failed - let memalloc know this isn't working */
+                       if (errno == EINVAL) {
+                               memfd_create_supported = 0;
+                               return 0; /* not supported */
+                       }
+
+                       /* we got other error - something's wrong */
+                       return -1; /* error */
+               }
+               close(fd);
+               return 1; /* supported */
+       }
+#endif
+       return 0; /* not supported */
+}
+
+int
+eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset)
+{
+       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+
+       if (internal_config.in_memory || internal_config.no_hugetlbfs) {
+#ifndef MEMFD_SUPPORTED
+               /* in in-memory or no-huge mode, we rely on memfd support */
+               return -ENOTSUP;
+#endif
+               /* memfd supported, but hugetlbfs memfd may not be */
+               if (!internal_config.no_hugetlbfs && !memfd_create_supported)
+                       return -ENOTSUP;
+       }
+
+       /* fd_list not initialized? */
+       if (fd_list[list_idx].len == 0)
+               return -ENODEV;
+       if (internal_config.single_file_segments) {
+               size_t pgsz = mcfg->memsegs[list_idx].page_sz;
+
+               /* segment not active? */
+               if (fd_list[list_idx].memseg_list_fd < 0)
+                       return -ENOENT;
+               *offset = pgsz * seg_idx;
+       } else {
+               /* segment not active? */
+               if (fd_list[list_idx].fds[seg_idx] < 0)
+                       return -ENOENT;
+               *offset = 0;
+       }
+       return 0;
+}
+
+int
+eal_memalloc_init(void)
+{
+       if (rte_eal_process_type() == RTE_PROC_SECONDARY)
+               if (rte_memseg_list_walk(secondary_msl_create_walk, NULL) < 0)
+                       return -1;
+       if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
+                       internal_config.in_memory) {
+               int mfd_res = test_memfd_create();
+
+               if (mfd_res < 0) {
+                       RTE_LOG(ERR, EAL, "Unable to check if memfd is supported\n");
+                       return -1;
+               }
+               if (mfd_res == 1)
+                       RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n");
+               else
+                       RTE_LOG(INFO, EAL, "Using memfd is not supported, falling back to anonymous hugepages\n");
+
+               /* we only support single-file segments mode with in-memory mode
+                * if we support hugetlbfs with memfd_create. this code will
+                * test if we do.
+                */
+               if (internal_config.single_file_segments &&
+                               mfd_res != 1) {
+                       RTE_LOG(ERR, EAL, "Single-file segments mode cannot be used without memfd support\n");
+                       return -1;
+               }
+               /* this cannot ever happen but better safe than sorry */
+               if (!anonymous_hugepages_supported) {
+                       RTE_LOG(ERR, EAL, "Using anonymous memory is not supported\n");
+                       return -1;
+               }
+       }
+
+       /* initialize all of the fd lists */
+       if (rte_memseg_list_walk(fd_list_create_walk, NULL))
+               return -1;
+       return 0;
+}
diff --git a/lib/librte_eal/linux/eal/eal_memory.c b/lib/librte_eal/linux/eal/eal_memory.c
new file mode 100644 (file)
index 0000000..1b96b57
--- /dev/null
@@ -0,0 +1,2439 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation.
+ * Copyright(c) 2013 6WIND S.A.
+ */
+
+#define _FILE_OFFSET_BITS 64
+#include <errno.h>
+#include <fcntl.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/queue.h>
+#include <sys/file.h>
+#include <sys/resource.h>
+#include <unistd.h>
+#include <limits.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <signal.h>
+#include <setjmp.h>
+#ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */
+#include <linux/memfd.h>
+#define MEMFD_SUPPORTED
+#endif
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+#include <numa.h>
+#include <numaif.h>
+#endif
+
+#include <rte_errno.h>
+#include <rte_log.h>
+#include <rte_memory.h>
+#include <rte_launch.h>
+#include <rte_eal.h>
+#include <rte_eal_memconfig.h>
+#include <rte_per_lcore.h>
+#include <rte_lcore.h>
+#include <rte_common.h>
+#include <rte_string_fns.h>
+
+#include "eal_private.h"
+#include "eal_memalloc.h"
+#include "eal_internal_cfg.h"
+#include "eal_filesystem.h"
+#include "eal_hugepages.h"
+
+#define PFN_MASK_SIZE  8
+
+/**
+ * @file
+ * Huge page mapping under Linux
+ *
+ * To reserve a big contiguous amount of memory, we use the hugepage
+ * feature of Linux. For that, we need to have hugetlbfs mounted. This
+ * code will create many files in that mount point (one per page) and
+ * map them into virtual memory. For each page, we will retrieve its
+ * physical address and remap it in order to have a contiguous virtual
+ * zone as well as a contiguous physical zone.
+ */
+
+static bool phys_addrs_available = true;
+
+#define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space"
+
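+/* check whether we can obtain physical addresses via /proc/self/pagemap, and
+ * record the result in phys_addrs_available.
+ */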
+static void
+test_phys_addrs_available(void)
+{
+       uint64_t tmp = 0;
+       phys_addr_t physaddr;
+
+       if (!rte_eal_has_hugepages()) {
+               RTE_LOG(ERR, EAL,
+                       "Started without hugepages support, physical addresses not available\n");
+               phys_addrs_available = false;
+               return;
+       }
+
+       physaddr = rte_mem_virt2phy(&tmp);
+       if (physaddr == RTE_BAD_PHYS_ADDR) {
+               if (rte_eal_iova_mode() == RTE_IOVA_PA)
+                       RTE_LOG(ERR, EAL,
+                               "Cannot obtain physical addresses: %s. "
+                               "Only vfio will function.\n",
+                               strerror(errno));
+               phys_addrs_available = false;
+       }
+}
+
+/*
+ * Get physical address of any mapped virtual address in the current process.
+ */
+phys_addr_t
+rte_mem_virt2phy(const void *virtaddr)
+{
+       int fd, retval;
+       uint64_t page, physaddr;
+       unsigned long virt_pfn;
+       int page_size;
+       off_t offset;
+
+       /* Cannot parse /proc/self/pagemap, no need to log errors everywhere */
+       if (!phys_addrs_available)
+               return RTE_BAD_IOVA;
+
+       /* standard page size */
+       page_size = getpagesize();
+
+       fd = open("/proc/self/pagemap", O_RDONLY);
+       if (fd < 0) {
+               RTE_LOG(ERR, EAL, "%s(): cannot open /proc/self/pagemap: %s\n",
+                       __func__, strerror(errno));
+               return RTE_BAD_IOVA;
+       }
+
+       virt_pfn = (unsigned long)virtaddr / page_size;
+       offset = sizeof(uint64_t) * virt_pfn;
+       if (lseek(fd, offset, SEEK_SET) == (off_t) -1) {
+               RTE_LOG(ERR, EAL, "%s(): seek error in /proc/self/pagemap: %s\n",
+                               __func__, strerror(errno));
+               close(fd);
+               return RTE_BAD_IOVA;
+       }
+
+       retval = read(fd, &page, PFN_MASK_SIZE);
+       close(fd);
+       if (retval < 0) {
+               RTE_LOG(ERR, EAL, "%s(): cannot read /proc/self/pagemap: %s\n",
+                               __func__, strerror(errno));
+               return RTE_BAD_IOVA;
+       } else if (retval != PFN_MASK_SIZE) {
+               RTE_LOG(ERR, EAL, "%s(): read %d bytes from /proc/self/pagemap "
+                               "but expected %d:\n",
+                               __func__, retval, PFN_MASK_SIZE);
+               return RTE_BAD_IOVA;
+       }
+
+       /*
+        * the pfn (page frame number) is stored in bits 0-54 (see
+        * pagemap.txt in the Linux Documentation)
+        */
+       if ((page & 0x7fffffffffffffULL) == 0)
+               return RTE_BAD_IOVA;
+
+       physaddr = ((page & 0x7fffffffffffffULL) * page_size)
+               + ((unsigned long)virtaddr % page_size);
+
+       return physaddr;
+}
+
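+/* in VA IOVA mode, the IOVA is the virtual address itself; otherwise,
+ * translate it through /proc/self/pagemap.
+ */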
+rte_iova_t
+rte_mem_virt2iova(const void *virtaddr)
+{
+       if (rte_eal_iova_mode() == RTE_IOVA_VA)
+               return (uintptr_t)virtaddr;
+       return rte_mem_virt2phy(virtaddr);
+}
+
+/*
+ * For each hugepage in hugepg_tbl, fill the physaddr value. We find
+ * it by browsing the /proc/self/pagemap special file.
+ */
+static int
+find_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
+{
+       unsigned int i;
+       phys_addr_t addr;
+
+       for (i = 0; i < hpi->num_pages[0]; i++) {
+               addr = rte_mem_virt2phy(hugepg_tbl[i].orig_va);
+               if (addr == RTE_BAD_PHYS_ADDR)
+                       return -1;
+               hugepg_tbl[i].physaddr = addr;
+       }
+       return 0;
+}
+
+/*
+ * For each hugepage in hugepg_tbl, fill the physaddr value sequentially.
+ */
+static int
+set_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
+{
+       unsigned int i;
+       static phys_addr_t addr;
+
+       for (i = 0; i < hpi->num_pages[0]; i++) {
+               hugepg_tbl[i].physaddr = addr;
+               addr += hugepg_tbl[i].size;
+       }
+       return 0;
+}
+
+/*
+ * Check whether address-space layout randomization is enabled in
+ * the kernel. This is important for multi-process as it can prevent
+ * two processes from mapping data to the same virtual address.
+ * Returns:
+ *    0 - address space randomization disabled
+ *    1/2 - address space randomization enabled
+ *    negative error code on error
+ */
+static int
+aslr_enabled(void)
+{
+       char c;
+       int retval, fd = open(RANDOMIZE_VA_SPACE_FILE, O_RDONLY);
+       if (fd < 0)
+               return -errno;
+       retval = read(fd, &c, 1);
+       close(fd);
+       if (retval < 0)
+               return -errno;
+       if (retval == 0)
+               return -EIO;
+       switch (c) {
+               case '0' : return 0;
+               case '1' : return 1;
+               case '2' : return 2;
+               default: return -EINVAL;
+       }
+}
+
+static sigjmp_buf huge_jmpenv;
+
+static void huge_sigbus_handler(int signo __rte_unused)
+{
+       siglongjmp(huge_jmpenv, 1);
+}
+
+/* Put sigsetjmp into a wrapper function to avoid a compilation error: any
+ * non-volatile, non-static local variable in the stack frame calling
+ * sigsetjmp might be clobbered by a call to longjmp.
+ */
+static int huge_wrap_sigsetjmp(void)
+{
+       return sigsetjmp(huge_jmpenv, 1);
+}
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+/* Callback for numa library. */
+void numa_error(char *where)
+{
+       RTE_LOG(ERR, EAL, "%s failed: %s\n", where, strerror(errno));
+}
+#endif
+
+/*
+ * Mmap all hugepages of the hugepage table: it first opens a file in
+ * hugetlbfs, then mmaps hugepage_sz bytes in it. If orig is set, the
+ * virtual address is stored in hugepg_tbl[i].orig_va, else it is stored
+ * in hugepg_tbl[i].final_va. The second mapping (when orig is 0) tries to
+ * map contiguous physical blocks in contiguous virtual blocks.
+ */
+static unsigned
+map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
+                 uint64_t *essential_memory __rte_unused)
+{
+       int fd;
+       unsigned i;
+       void *virtaddr;
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+       int node_id = -1;
+       int essential_prev = 0;
+       int oldpolicy;
+       struct bitmask *oldmask = NULL;
+       bool have_numa = true;
+       unsigned long maxnode = 0;
+
+       /* Check if kernel supports NUMA. */
+       if (numa_available() != 0) {
+               RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n");
+               have_numa = false;
+       }
+
+       if (have_numa) {
+               RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
+               oldmask = numa_allocate_nodemask();
+               if (get_mempolicy(&oldpolicy, oldmask->maskp,
+                                 oldmask->size + 1, 0, 0) < 0) {
+                       RTE_LOG(ERR, EAL,
+                               "Failed to get current mempolicy: %s. "
+                               "Assuming MPOL_DEFAULT.\n", strerror(errno));
+                       oldpolicy = MPOL_DEFAULT;
+               }
+               for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
+                       if (internal_config.socket_mem[i])
+                               maxnode = i + 1;
+       }
+#endif
+
+       for (i = 0; i < hpi->num_pages[0]; i++) {
+               struct hugepage_file *hf = &hugepg_tbl[i];
+               uint64_t hugepage_sz = hpi->hugepage_sz;
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+               if (maxnode) {
+                       unsigned int j;
+
+                       for (j = 0; j < maxnode; j++)
+                               if (essential_memory[j])
+                                       break;
+
+                       if (j == maxnode) {
+                               node_id = (node_id + 1) % maxnode;
+                               while (!internal_config.socket_mem[node_id]) {
+                                       node_id++;
+                                       node_id %= maxnode;
+                               }
+                               essential_prev = 0;
+                       } else {
+                               node_id = j;
+                               essential_prev = essential_memory[j];
+
+                               if (essential_memory[j] < hugepage_sz)
+                                       essential_memory[j] = 0;
+                               else
+                                       essential_memory[j] -= hugepage_sz;
+                       }
+
+                       RTE_LOG(DEBUG, EAL,
+                               "Setting policy MPOL_PREFERRED for socket %d\n",
+                               node_id);
+                       numa_set_preferred(node_id);
+               }
+#endif
+
+               hf->file_id = i;
+               hf->size = hugepage_sz;
+               eal_get_hugefile_path(hf->filepath, sizeof(hf->filepath),
+                               hpi->hugedir, hf->file_id);
+               hf->filepath[sizeof(hf->filepath) - 1] = '\0';
+
+               /* try to create hugepage file */
+               fd = open(hf->filepath, O_CREAT | O_RDWR, 0600);
+               if (fd < 0) {
+                       RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
+                                       strerror(errno));
+                       goto out;
+               }
+
+               /* map the segment and populate page tables; the kernel fills
+                * this segment with zeros. we don't care where
+                * this gets mapped - we already have contiguous memory areas
+                * ready for us to map into.
+                */
+               virtaddr = mmap(NULL, hugepage_sz, PROT_READ | PROT_WRITE,
+                               MAP_SHARED | MAP_POPULATE, fd, 0);
+               if (virtaddr == MAP_FAILED) {
+                       RTE_LOG(DEBUG, EAL, "%s(): mmap failed: %s\n", __func__,
+                                       strerror(errno));
+                       close(fd);
+                       goto out;
+               }
+
+               hf->orig_va = virtaddr;
+
+               /* In Linux, hugetlb limitations, like cgroup, are
+                * enforced at fault time instead of mmap(), even
+                * with the option of MAP_POPULATE. The kernel will
+                * send a SIGBUS signal. To avoid being killed, save
+                * the stack environment here; if SIGBUS happens, we
+                * can jump back here.
+                */
+               if (huge_wrap_sigsetjmp()) {
+                       RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more "
+                               "hugepages of size %u MB\n",
+                               (unsigned int)(hugepage_sz / 0x100000));
+                       munmap(virtaddr, hugepage_sz);
+                       close(fd);
+                       unlink(hugepg_tbl[i].filepath);
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+                       if (maxnode)
+                               essential_memory[node_id] =
+                                       essential_prev;
+#endif
+                       goto out;
+               }
+               *(int *)virtaddr = 0;
+
+               /* set shared lock on the file. */
+               if (flock(fd, LOCK_SH) < 0) {
+                       RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n",
+                               __func__, strerror(errno));
+                       close(fd);
+                       goto out;
+               }
+
+               close(fd);
+       }
+
+out:
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+       if (maxnode) {
+               RTE_LOG(DEBUG, EAL,
+                       "Restoring previous memory policy: %d\n", oldpolicy);
+               if (oldpolicy == MPOL_DEFAULT) {
+                       numa_set_localalloc();
+               } else if (set_mempolicy(oldpolicy, oldmask->maskp,
+                                        oldmask->size + 1) < 0) {
+                       RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n",
+                               strerror(errno));
+                       numa_set_localalloc();
+               }
+       }
+       if (oldmask != NULL)
+               numa_free_cpumask(oldmask);
+#endif
+       return i;
+}
+
+/*
+ * Parse /proc/self/numa_maps to get the NUMA socket ID for each huge
+ * page.
+ */
+static int
+find_numasocket(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
+{
+       int socket_id;
+       char *end, *nodestr;
+       unsigned i, hp_count = 0;
+       uint64_t virt_addr;
+       char buf[BUFSIZ];
+       char hugedir_str[PATH_MAX];
+       FILE *f;
+
+       f = fopen("/proc/self/numa_maps", "r");
+       if (f == NULL) {
+               RTE_LOG(NOTICE, EAL, "NUMA support not available,"
+                       " assuming all memory is in socket_id 0\n");
+               return 0;
+       }
+
+       snprintf(hugedir_str, sizeof(hugedir_str),
+                       "%s/%s", hpi->hugedir, eal_get_hugefile_prefix());
+
+       /* parse numa map */
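+       /* a numa_maps entry for a hugepage mapping looks roughly like this
+        * (illustrative example):
+        *   7f0f40000000 default file=/mnt/huge/rtemap_0 huge dirty=1 N0=1
+        * we extract the virtual address from the start of the line and the
+        * node ID from the "N<node>=<count>" token
+        */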
+       while (fgets(buf, sizeof(buf), f) != NULL) {
+
+               /* ignore mappings that are not hugepages */
+               if (strstr(buf, " huge ") == NULL &&
+                               strstr(buf, hugedir_str) == NULL)
+                       continue;
+
+               /* get zone addr */
+               virt_addr = strtoull(buf, &end, 16);
+               if (virt_addr == 0 || end == buf) {
+                       RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
+                       goto error;
+               }
+
+               /* get node id (socket id) */
+               nodestr = strstr(buf, " N");
+               if (nodestr == NULL) {
+                       RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
+                       goto error;
+               }
+               nodestr += 2;
+               end = strstr(nodestr, "=");
+               if (end == NULL) {
+                       RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
+                       goto error;
+               }
+               end[0] = '\0';
+               end = NULL;
+
+               socket_id = strtoul(nodestr, &end, 0);
+               if ((nodestr[0] == '\0') || (end == NULL) || (*end != '\0')) {
+                       RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
+                       goto error;
+               }
+
+               /* if we find this page in our mappings, set socket_id */
+               for (i = 0; i < hpi->num_pages[0]; i++) {
+                       void *va = (void *)(unsigned long)virt_addr;
+                       if (hugepg_tbl[i].orig_va == va) {
+                               hugepg_tbl[i].socket_id = socket_id;
+                               hp_count++;
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+                               RTE_LOG(DEBUG, EAL,
+                                       "Hugepage %s is on socket %d\n",
+                                       hugepg_tbl[i].filepath, socket_id);
+#endif
+                       }
+               }
+       }
+
+       if (hp_count < hpi->num_pages[0])
+               goto error;
+
+       fclose(f);
+       return 0;
+
+error:
+       fclose(f);
+       return -1;
+}
+
+static int
+cmp_physaddr(const void *a, const void *b)
+{
+#ifndef RTE_ARCH_PPC_64
+       const struct hugepage_file *p1 = a;
+       const struct hugepage_file *p2 = b;
+#else
+       /* PowerPC needs memory sorted in reverse order from x86 */
+       const struct hugepage_file *p1 = b;
+       const struct hugepage_file *p2 = a;
+#endif
+       if (p1->physaddr < p2->physaddr)
+               return -1;
+       else if (p1->physaddr > p2->physaddr)
+               return 1;
+       else
+               return 0;
+}
+
+/*
+ * Uses mmap to create a shared memory area for storage of data
+ * Used in this file to store the hugepage file map on disk
+ */
+static void *
+create_shared_memory(const char *filename, const size_t mem_size)
+{
+       void *retval;
+       int fd;
+
+       /* if no shared files mode is used, create anonymous memory instead */
+       if (internal_config.no_shconf) {
+               retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE,
+                               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+               if (retval == MAP_FAILED)
+                       return NULL;
+               return retval;
+       }
+
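+       /* otherwise, back the table with a regular file (the caller typically
+        * passes a path under the DPDK runtime directory) so that secondary
+        * processes can later map the same data
+        */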
+       fd = open(filename, O_CREAT | O_RDWR, 0666);
+       if (fd < 0)
+               return NULL;
+       if (ftruncate(fd, mem_size) < 0) {
+               close(fd);
+               return NULL;
+       }
+       retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+       close(fd);
+       if (retval == MAP_FAILED)
+               return NULL;
+       return retval;
+}
+
+/*
+ * this copies *active* hugepages from one hugepage table to another.
+ * destination is typically the shared memory.
+ */
+static int
+copy_hugepages_to_shared_mem(struct hugepage_file * dst, int dest_size,
+               const struct hugepage_file * src, int src_size)
+{
+       int src_pos, dst_pos = 0;
+
+       for (src_pos = 0; src_pos < src_size; src_pos++) {
+               if (src[src_pos].orig_va != NULL) {
+                       /* error on overflow attempt */
+                       if (dst_pos == dest_size)
+                               return -1;
+                       memcpy(&dst[dst_pos], &src[src_pos], sizeof(struct hugepage_file));
+                       dst_pos++;
+               }
+       }
+       return 0;
+}
+
+static int
+unlink_hugepage_files(struct hugepage_file *hugepg_tbl,
+               unsigned num_hp_info)
+{
+       unsigned socket, size;
+       int page, nrpages = 0;
+
+       /* get total number of hugepages */
+       for (size = 0; size < num_hp_info; size++)
+               for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++)
+                       nrpages +=
+                       internal_config.hugepage_info[size].num_pages[socket];
+
+       for (page = 0; page < nrpages; page++) {
+               struct hugepage_file *hp = &hugepg_tbl[page];
+
+               if (hp->orig_va != NULL && unlink(hp->filepath)) {
+                       RTE_LOG(WARNING, EAL, "%s(): Removing %s failed: %s\n",
+                               __func__, hp->filepath, strerror(errno));
+               }
+       }
+       return 0;
+}
+
+/*
+ * unmaps hugepages that are not going to be used. since we originally allocate
+ * ALL hugepages (not just those we need), additional unmapping needs to be done.
+ */
+static int
+unmap_unneeded_hugepages(struct hugepage_file *hugepg_tbl,
+               struct hugepage_info *hpi,
+               unsigned num_hp_info)
+{
+       unsigned socket, size;
+       int page, nrpages = 0;
+
+       /* get total number of hugepages */
+       for (size = 0; size < num_hp_info; size++)
+               for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++)
+                       nrpages += internal_config.hugepage_info[size].num_pages[socket];
+
+       for (size = 0; size < num_hp_info; size++) {
+               for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) {
+                       unsigned pages_found = 0;
+
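+                       /* the first num_pages[socket] matching pages stay
+                        * mapped (they were accounted for in used_hp); any
+                        * matching page beyond that is released below
+                        */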
+                       /* traverse until we have unmapped all the unused pages */
+                       for (page = 0; page < nrpages; page++) {
+                               struct hugepage_file *hp = &hugepg_tbl[page];
+
+                               /* find a page that matches the criteria */
+                               if ((hp->size == hpi[size].hugepage_sz) &&
+                                               (hp->socket_id == (int) socket)) {
+
+                                       /* if we skipped enough pages, unmap the rest */
+                                       if (pages_found == hpi[size].num_pages[socket]) {
+                                               uint64_t unmap_len;
+
+                                               unmap_len = hp->size;
+
+                                               /* get start addr and len of the remaining segment */
+                                               munmap(hp->orig_va,
+                                                       (size_t)unmap_len);
+
+                                               hp->orig_va = NULL;
+                                               if (unlink(hp->filepath) == -1) {
+                                                       RTE_LOG(ERR, EAL, "%s(): Removing %s failed: %s\n",
+                                                                       __func__, hp->filepath, strerror(errno));
+                                                       return -1;
+                                               }
+                                       } else {
+                                               /* this page is needed - count it and move on */
+                                               pages_found++;
+                                       }
+
+                               } /* match page */
+                       } /* foreach page */
+               } /* foreach socket */
+       } /* foreach pagesize */
+
+       return 0;
+}
+
+static int
+remap_segment(struct hugepage_file *hugepages, int seg_start, int seg_end)
+{
+       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+       struct rte_memseg_list *msl;
+       struct rte_fbarray *arr;
+       int cur_page, seg_len;
+       unsigned int msl_idx;
+       int ms_idx;
+       uint64_t page_sz;
+       size_t memseg_len;
+       int socket_id;
+
+       page_sz = hugepages[seg_start].size;
+       socket_id = hugepages[seg_start].socket_id;
+       seg_len = seg_end - seg_start;
+
+       RTE_LOG(DEBUG, EAL, "Attempting to map %" PRIu64 "M on socket %i\n",
+                       (seg_len * page_sz) >> 20ULL, socket_id);
+
+       /* find free space in memseg lists */
+       for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+               bool empty;
+               msl = &mcfg->memsegs[msl_idx];
+               arr = &msl->memseg_arr;
+
+               if (msl->page_sz != page_sz)
+                       continue;
+               if (msl->socket_id != socket_id)
+                       continue;
+
+               /* leave space for a hole if array is not empty */
+               empty = arr->count == 0;
+               ms_idx = rte_fbarray_find_next_n_free(arr, 0,
+                               seg_len + (empty ? 0 : 1));
+
+               /* memseg list is full? */
+               if (ms_idx < 0)
+                       continue;
+
+               /* leave some space between memsegs, they are not IOVA
+                * contiguous, so they shouldn't be VA contiguous either.
+                */
+               if (!empty)
+                       ms_idx++;
+               break;
+       }
+       if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
+               RTE_LOG(ERR, EAL, "Could not find space for memseg. Please increase %s and/or %s in configuration.\n",
+                               RTE_STR(CONFIG_RTE_MAX_MEMSEG_PER_TYPE),
+                               RTE_STR(CONFIG_RTE_MAX_MEM_PER_TYPE));
+               return -1;
+       }
+
+#ifdef RTE_ARCH_PPC_64
+       /* for PPC64 we go through the list backwards */
+       for (cur_page = seg_end - 1; cur_page >= seg_start;
+                       cur_page--, ms_idx++) {
+#else
+       for (cur_page = seg_start; cur_page < seg_end; cur_page++, ms_idx++) {
+#endif
+               struct hugepage_file *hfile = &hugepages[cur_page];
+               struct rte_memseg *ms = rte_fbarray_get(arr, ms_idx);
+               void *addr;
+               int fd;
+
+               fd = open(hfile->filepath, O_RDWR);
+               if (fd < 0) {
+                       RTE_LOG(ERR, EAL, "Could not open '%s': %s\n",
+                                       hfile->filepath, strerror(errno));
+                       return -1;
+               }
+               /* set shared lock on the file. */
+               if (flock(fd, LOCK_SH) < 0) {
+                       RTE_LOG(DEBUG, EAL, "Could not lock '%s': %s\n",
+                                       hfile->filepath, strerror(errno));
+                       close(fd);
+                       return -1;
+               }
+               memseg_len = (size_t)page_sz;
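+               /* the target address is at a fixed offset inside the VA area
+                * that was preallocated for this memseg list
+                */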
+               addr = RTE_PTR_ADD(msl->base_va, ms_idx * memseg_len);
+
+               /* we know this address is already mmapped by memseg list, so
+                * using MAP_FIXED here is safe
+                */
+               addr = mmap(addr, page_sz, PROT_READ | PROT_WRITE,
+                               MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, 0);
+               if (addr == MAP_FAILED) {
+                       RTE_LOG(ERR, EAL, "Couldn't remap '%s': %s\n",
+                                       hfile->filepath, strerror(errno));
+                       close(fd);
+                       return -1;
+               }
+
+               /* we have a new address, so unmap previous one */
+#ifndef RTE_ARCH_64
+               /* in 32-bit legacy mode, we have already unmapped the page */
+               if (!internal_config.legacy_mem)
+                       munmap(hfile->orig_va, page_sz);
+#else
+               munmap(hfile->orig_va, page_sz);
+#endif
+
+               hfile->orig_va = NULL;
+               hfile->final_va = addr;
+
+               /* rewrite physical addresses in IOVA as VA mode */
+               if (rte_eal_iova_mode() == RTE_IOVA_VA)
+                       hfile->physaddr = (uintptr_t)addr;
+
+               /* set up memseg data */
+               ms->addr = addr;
+               ms->hugepage_sz = page_sz;
+               ms->len = memseg_len;
+               ms->iova = hfile->physaddr;
+               ms->socket_id = hfile->socket_id;
+               ms->nchannel = rte_memory_get_nchannel();
+               ms->nrank = rte_memory_get_nrank();
+
+               rte_fbarray_set_used(arr, ms_idx);
+
+               /* store segment fd internally */
+               if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0)
+                       RTE_LOG(ERR, EAL, "Could not store segment fd: %s\n",
+                               rte_strerror(rte_errno));
+       }
+       RTE_LOG(DEBUG, EAL, "Allocated %" PRIu64 "M on socket %i\n",
+                       (seg_len * page_sz) >> 20, socket_id);
+       return 0;
+}
+
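+/*
+ * Compute how much memory one memseg list may cover for a given page size:
+ * the smallest of RTE_MAX_MEMSEG_PER_LIST pages, RTE_MAX_MEM_MB_PER_LIST
+ * megabytes and the caller-supplied max_mem limit, rounded to the page size.
+ * For example, with 2 MB pages and limits of 8192 segments and 32768 MB per
+ * list (illustrative values), this works out to 16 GB per list.
+ */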
+static uint64_t
+get_mem_amount(uint64_t page_sz, uint64_t max_mem)
+{
+       uint64_t area_sz, max_pages;
+
+       /* limit to RTE_MAX_MEMSEG_PER_LIST pages or RTE_MAX_MEM_MB_PER_LIST */
+       max_pages = RTE_MAX_MEMSEG_PER_LIST;
+       max_mem = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20, max_mem);
+
+       area_sz = RTE_MIN(page_sz * max_pages, max_mem);
+
+       /* make sure the list isn't smaller than the page size */
+       area_sz = RTE_MAX(area_sz, page_sz);
+
+       return RTE_ALIGN(area_sz, page_sz);
+}
+
+static int
+free_memseg_list(struct rte_memseg_list *msl)
+{
+       if (rte_fbarray_destroy(&msl->memseg_arr)) {
+               RTE_LOG(ERR, EAL, "Cannot destroy memseg list\n");
+               return -1;
+       }
+       memset(msl, 0, sizeof(*msl));
+       return 0;
+}
+
+#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i"
+static int
+alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz,
+               int n_segs, int socket_id, int type_msl_idx)
+{
+       char name[RTE_FBARRAY_NAME_LEN];
+
+       snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id,
+                type_msl_idx);
+       if (rte_fbarray_init(&msl->memseg_arr, name, n_segs,
+                       sizeof(struct rte_memseg))) {
+               RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n",
+                       rte_strerror(rte_errno));
+               return -1;
+       }
+
+       msl->page_sz = page_sz;
+       msl->socket_id = socket_id;
+       msl->base_va = NULL;
+
+       RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n",
+                       (size_t)page_sz >> 10, socket_id);
+
+       return 0;
+}
+
+static int
+alloc_va_space(struct rte_memseg_list *msl)
+{
+       uint64_t page_sz;
+       size_t mem_sz;
+       void *addr;
+       int flags = 0;
+
+       page_sz = msl->page_sz;
+       mem_sz = page_sz * msl->memseg_arr.len;
+
+       addr = eal_get_virtual_area(msl->base_va, &mem_sz, page_sz, 0, flags);
+       if (addr == NULL) {
+               if (rte_errno == EADDRNOTAVAIL)
+                       RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - please use '--base-virtaddr' option\n",
+                               (unsigned long long)mem_sz, msl->base_va);
+               else
+                       RTE_LOG(ERR, EAL, "Cannot reserve memory\n");
+               return -1;
+       }
+       msl->base_va = addr;
+       msl->len = mem_sz;
+
+       return 0;
+}
+
+/*
+ * Our VA space is not preallocated yet, so preallocate it here. We need to know
+ * how many segments there are in order to map all pages into one address space,
+ * and leave appropriate holes between segments so that rte_malloc does not
+ * concatenate them into one big segment.
+ *
+ * We also need to unmap the original pages to free up address space.
+ */
+static int __rte_unused
+prealloc_segments(struct hugepage_file *hugepages, int n_pages)
+{
+       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+       int cur_page, seg_start_page, end_seg, new_memseg;
+       unsigned int hpi_idx, socket, i;
+       int n_contig_segs, n_segs;
+       int msl_idx;
+
+       /* before we preallocate segments, we need to free up our VA space.
+        * we're not removing files, and we already have information about
+        * PA-contiguousness, so it is safe to unmap everything.
+        */
+       for (cur_page = 0; cur_page < n_pages; cur_page++) {
+               struct hugepage_file *hpi = &hugepages[cur_page];
+               munmap(hpi->orig_va, hpi->size);
+               hpi->orig_va = NULL;
+       }
+
+       /* we do not know up front which sockets the discovered pages belong
+        * to, so loop over every page size and every socket
+        */
+       for (hpi_idx = 0; hpi_idx < internal_config.num_hugepage_sizes;
+                       hpi_idx++) {
+               uint64_t page_sz =
+                       internal_config.hugepage_info[hpi_idx].hugepage_sz;
+
+               for (i = 0; i < rte_socket_count(); i++) {
+                       struct rte_memseg_list *msl;
+
+                       socket = rte_socket_id_by_idx(i);
+                       n_contig_segs = 0;
+                       n_segs = 0;
+                       seg_start_page = -1;
+
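+                       /* walk the sorted page table and count how many
+                        * physically contiguous runs of this page size exist
+                        * on this socket, and how many pages they contain;
+                        * this determines how much VA space to reserve below
+                        */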
+                       for (cur_page = 0; cur_page < n_pages; cur_page++) {
+                               struct hugepage_file *prev, *cur;
+                               int prev_seg_start_page = -1;
+
+                               cur = &hugepages[cur_page];
+                               prev = cur_page == 0 ? NULL :
+                                               &hugepages[cur_page - 1];
+
+                               new_memseg = 0;
+                               end_seg = 0;
+
+                               if (cur->size == 0)
+                                       end_seg = 1;
+                               else if (cur->socket_id != (int) socket)
+                                       end_seg = 1;
+                               else if (cur->size != page_sz)
+                                       end_seg = 1;
+                               else if (cur_page == 0)
+                                       new_memseg = 1;
+#ifdef RTE_ARCH_PPC_64
+                               /* On the PPC64 architecture, mmap always maps
+                                * from higher addresses down to lower ones, so
+                                * physical addresses here are in descending
+                                * order.
+                                */
+                               else if ((prev->physaddr - cur->physaddr) !=
+                                               cur->size)
+                                       new_memseg = 1;
+#else
+                               else if ((cur->physaddr - prev->physaddr) !=
+                                               cur->size)
+                                       new_memseg = 1;
+#endif
+                               if (new_memseg) {
+                                       /* if we're already inside a segment,
+                                        * new segment means end of current one
+                                        */
+                                       if (seg_start_page != -1) {
+                                               end_seg = 1;
+                                               prev_seg_start_page =
+                                                               seg_start_page;
+                                       }
+                                       seg_start_page = cur_page;
+                               }
+
+                               if (end_seg) {
+                                       if (prev_seg_start_page != -1) {
+                                               /* we've found a new segment */
+                                               n_contig_segs++;
+                                               n_segs += cur_page -
+                                                       prev_seg_start_page;
+                                       } else if (seg_start_page != -1) {
+                                               /* we didn't find new segment,
+                                                * but did end current one
+                                                */
+                                               n_contig_segs++;
+                                               n_segs += cur_page -
+                                                               seg_start_page;
+                                               seg_start_page = -1;
+                                               continue;
+                                       } else {
+                                               /* we're skipping this page */
+                                               continue;
+                                       }
+                               }
+                               /* segment continues */
+                       }
+                       /* check if we missed last segment */
+                       if (seg_start_page != -1) {
+                               n_contig_segs++;
+                               n_segs += cur_page - seg_start_page;
+                       }
+
+                       /* if no segments were found, do not preallocate */
+                       if (n_segs == 0)
+                               continue;
+
+                       /* we now have total number of pages that we will
+                        * allocate for this segment list. add separator pages
+                        * to the total count, and preallocate VA space.
+                        */
+                       n_segs += n_contig_segs - 1;
+
+                       /* now, preallocate VA space for these segments */
+
+                       /* first, find suitable memseg list for this */
+                       for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS;
+                                       msl_idx++) {
+                               msl = &mcfg->memsegs[msl_idx];
+
+                               if (msl->base_va != NULL)
+                                       continue;
+                               break;
+                       }
+                       if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
+                               RTE_LOG(ERR, EAL, "Not enough space in memseg lists, please increase %s\n",
+                                       RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
+                               return -1;
+                       }
+
+                       /* now, allocate fbarray itself */
+                       if (alloc_memseg_list(msl, page_sz, n_segs, socket,
+                                               msl_idx) < 0)
+                               return -1;
+
+                       /* finally, allocate VA space */
+                       if (alloc_va_space(msl) < 0)
+                               return -1;
+               }
+       }
+       return 0;
+}
+
+/*
+ * We cannot reallocate memseg lists on the fly because PPC64 stores pages
+ * backwards; therefore, we have to process the entire memseg first before
+ * remapping it into memseg list VA space.
+ */
+static int
+remap_needed_hugepages(struct hugepage_file *hugepages, int n_pages)
+{
+       int cur_page, seg_start_page, new_memseg, ret;
+
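+       /* hugepages are sorted by physical address at this point, so a new
+        * memseg starts whenever the socket, the page size or the physical
+        * contiguity changes
+        */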
+       seg_start_page = 0;
+       for (cur_page = 0; cur_page < n_pages; cur_page++) {
+               struct hugepage_file *prev, *cur;
+
+               new_memseg = 0;
+
+               cur = &hugepages[cur_page];
+               prev = cur_page == 0 ? NULL : &hugepages[cur_page - 1];
+
+               /* if size is zero, no more pages left */
+               if (cur->size == 0)
+                       break;
+
+               if (cur_page == 0)
+                       new_memseg = 1;
+               else if (cur->socket_id != prev->socket_id)
+                       new_memseg = 1;
+               else if (cur->size != prev->size)
+                       new_memseg = 1;
+#ifdef RTE_ARCH_PPC_64
+               /* On the PPC64 architecture, mmap always maps from higher
+                * addresses down to lower ones, so physical addresses here
+                * are in descending order.
+                */
+               else if ((prev->physaddr - cur->physaddr) != cur->size)
+                       new_memseg = 1;
+#else
+               else if ((cur->physaddr - prev->physaddr) != cur->size)
+                       new_memseg = 1;
+#endif
+
+               if (new_memseg) {
+                       /* if this isn't the first time, remap segment */
+                       if (cur_page != 0) {
+                               ret = remap_segment(hugepages, seg_start_page,
+                                               cur_page);
+                               if (ret != 0)
+                                       return -1;
+                       }
+                       /* remember where we started */
+                       seg_start_page = cur_page;
+               }
+               /* continuation of previous memseg */
+       }
+       /* we were stopped, but we didn't remap the last segment, do it now */
+       if (cur_page != 0) {
+               ret = remap_segment(hugepages, seg_start_page,
+                               cur_page);
+               if (ret != 0)
+                       return -1;
+       }
+       return 0;
+}
+
+static inline uint64_t
+get_socket_mem_size(int socket)
+{
+       uint64_t size = 0;
+       unsigned i;
+
+       for (i = 0; i < internal_config.num_hugepage_sizes; i++){
+               struct hugepage_info *hpi = &internal_config.hugepage_info[i];
+               size += hpi->hugepage_sz * hpi->num_pages[socket];
+       }
+
+       return size;
+}
+
+/*
+ * This function is a NUMA-aware equivalent of calc_num_pages.
+ * It takes in the list of hugepage sizes and the
+ * number of pages thereof, and calculates the best number of
+ * pages of each size to fulfill the request for <memory> RAM.
+ */
+static int
+calc_num_pages_per_socket(uint64_t * memory,
+               struct hugepage_info *hp_info,
+               struct hugepage_info *hp_used,
+               unsigned num_hp_info)
+{
+       unsigned socket, j, i = 0;
+       unsigned requested, available;
+       int total_num_pages = 0;
+       uint64_t remaining_mem, cur_mem;
+       uint64_t total_mem = internal_config.memory;
+
+       if (num_hp_info == 0)
+               return -1;
+
+       /* if specific memory amounts per socket weren't requested */
+       if (internal_config.force_sockets == 0) {
+               size_t total_size;
+#ifdef RTE_ARCH_64
+               int cpu_per_socket[RTE_MAX_NUMA_NODES];
+               size_t default_size;
+               unsigned lcore_id;
+
+               /* Compute number of cores per socket */
+               memset(cpu_per_socket, 0, sizeof(cpu_per_socket));
+               RTE_LCORE_FOREACH(lcore_id) {
+                       cpu_per_socket[rte_lcore_to_socket_id(lcore_id)]++;
+               }
+
+               /*
+                * Automatically spread requested memory amongst detected sockets according
+                * to number of cores from cpu mask present on each socket
+                */
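+               /* e.g. with -m 2048 and 3 of 4 lcores on socket 0
+                * (illustrative numbers), socket 0 is initially assigned
+                * 1536 MB and socket 1 512 MB, each capped by the memory
+                * actually available on that socket
+                */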
+               total_size = internal_config.memory;
+               for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) {
+
+                       /* Set memory amount per socket */
+                       default_size = (internal_config.memory * cpu_per_socket[socket])
+                                       / rte_lcore_count();
+
+                       /* Limit to maximum available memory on socket */
+                       default_size = RTE_MIN(default_size, get_socket_mem_size(socket));
+
+                       /* Update sizes */
+                       memory[socket] = default_size;
+                       total_size -= default_size;
+               }
+
+               /*
+                * If some memory is remaining, try to allocate it by getting all
+                * available memory from sockets, one after the other
+                */
+               for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) {
+                       /* take whatever is available */
+                       default_size = RTE_MIN(get_socket_mem_size(socket) - memory[socket],
+                                              total_size);
+
+                       /* Update sizes */
+                       memory[socket] += default_size;
+                       total_size -= default_size;
+               }
+#else
+               /* in 32-bit mode, allocate all of the memory only on master
+                * lcore socket
+                */
+               total_size = internal_config.memory;
+               for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0;
+                               socket++) {
+                       struct rte_config *cfg = rte_eal_get_configuration();
+                       unsigned int master_lcore_socket;
+
+                       master_lcore_socket =
+                               rte_lcore_to_socket_id(cfg->master_lcore);
+
+                       if (master_lcore_socket != socket)
+                               continue;
+
+                       /* Update sizes */
+                       memory[socket] = total_size;
+                       break;
+               }
+#endif
+       }
+
+       for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) {
+               /* skip this socket if no memory was requested on it */
+               for (i = 0; i < num_hp_info && memory[socket] != 0; i++){
+                       strlcpy(hp_used[i].hugedir, hp_info[i].hugedir,
+                               sizeof(hp_used[i].hugedir));
+                       hp_used[i].num_pages[socket] = RTE_MIN(
+                                       memory[socket] / hp_info[i].hugepage_sz,
+                                       hp_info[i].num_pages[socket]);
+
+                       cur_mem = hp_used[i].num_pages[socket] *
+                                       hp_used[i].hugepage_sz;
+
+                       memory[socket] -= cur_mem;
+                       total_mem -= cur_mem;
+
+                       total_num_pages += hp_used[i].num_pages[socket];
+
+                       /* check if we have met all memory requests */
+                       if (memory[socket] == 0)
+                               break;
+
+                       /* if we have used up all the pages of this size,
+                        * move on to the next size */
+                       if (hp_used[i].num_pages[socket] == hp_info[i].num_pages[socket])
+                               continue;
+                       /* At this point, pages of this size are still available,
+                        * but each one is bigger than the memory we still need,
+                        * so let's see if we can get enough from the other page
+                        * sizes.
+                        */
+                       remaining_mem = 0;
+                       for (j = i+1; j < num_hp_info; j++)
+                               remaining_mem += hp_info[j].hugepage_sz *
+                               hp_info[j].num_pages[socket];
+
+                       /* if the other sizes cannot supply enough memory,
+                        * allocate one more page of this size and finish with
+                        * this socket */
+                       if (remaining_mem < memory[socket]){
+                               cur_mem = RTE_MIN(memory[socket],
+                                               hp_info[i].hugepage_sz);
+                               memory[socket] -= cur_mem;
+                               total_mem -= cur_mem;
+                               hp_used[i].num_pages[socket]++;
+                               total_num_pages++;
+                               break; /* we are done with this socket */
+                       }
+               }
+               /* if we didn't satisfy all memory requirements per socket */
+               if (memory[socket] > 0 &&
+                               internal_config.socket_mem[socket] != 0) {
+                       /* to prevent icc errors */
+                       requested = (unsigned) (internal_config.socket_mem[socket] /
+                                       0x100000);
+                       available = requested -
+                                       ((unsigned) (memory[socket] / 0x100000));
+                       RTE_LOG(ERR, EAL, "Not enough memory available on socket %u! "
+                                       "Requested: %uMB, available: %uMB\n", socket,
+                                       requested, available);
+                       return -1;
+               }
+       }
+
+       /* if we didn't satisfy total memory requirements */
+       if (total_mem > 0) {
+               requested = (unsigned) (internal_config.memory / 0x100000);
+               available = requested - (unsigned) (total_mem / 0x100000);
+               RTE_LOG(ERR, EAL, "Not enough memory available! Requested: %uMB,"
+                               " available: %uMB\n", requested, available);
+               return -1;
+       }
+       return total_num_pages;
+}
+
+static inline size_t
+eal_get_hugepage_mem_size(void)
+{
+       uint64_t size = 0;
+       unsigned i, j;
+
+       for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
+               struct hugepage_info *hpi = &internal_config.hugepage_info[i];
+               if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0) {
+                       for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
+                               size += hpi->hugepage_sz * hpi->num_pages[j];
+                       }
+               }
+       }
+
+       return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX;
+}
+
+static struct sigaction huge_action_old;
+static int huge_need_recover;
+
+static void
+huge_register_sigbus(void)
+{
+       sigset_t mask;
+       struct sigaction action;
+
+       sigemptyset(&mask);
+       sigaddset(&mask, SIGBUS);
+       action.sa_flags = 0;
+       action.sa_mask = mask;
+       action.sa_handler = huge_sigbus_handler;
+
+       huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
+}
+
+static void
+huge_recover_sigbus(void)
+{
+       if (huge_need_recover) {
+               sigaction(SIGBUS, &huge_action_old, NULL);
+               huge_need_recover = 0;
+       }
+}
+
+/*
+ * Prepare physical memory mapping: fill the configuration structure with
+ * this information and return 0 on success.
+ *  1. map N huge pages in separate files in hugetlbfs
+ *  2. find associated physical addr
+ *  3. find associated NUMA socket ID
+ *  4. sort all huge pages by physical address
+ *  5. remap these N huge pages in the correct order
+ *  6. unmap the first mapping
+ *  7. fill memsegs in configuration with contiguous zones
+ */
+static int
+eal_legacy_hugepage_init(void)
+{
+       struct rte_mem_config *mcfg;
+       struct hugepage_file *hugepage = NULL, *tmp_hp = NULL;
+       struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
+       struct rte_fbarray *arr;
+       struct rte_memseg *ms;
+
+       uint64_t memory[RTE_MAX_NUMA_NODES];
+
+       unsigned hp_offset;
+       int i, j;
+       int nr_hugefiles, nr_hugepages = 0;
+       void *addr;
+
+       test_phys_addrs_available();
+
+       memset(used_hp, 0, sizeof(used_hp));
+
+       /* get pointer to global configuration */
+       mcfg = rte_eal_get_configuration()->mem_config;
+
+       /* hugetlbfs can be disabled */
+       if (internal_config.no_hugetlbfs) {
+               struct rte_memseg_list *msl;
+               int n_segs, cur_seg, fd, flags;
+#ifdef MEMFD_SUPPORTED
+               int memfd;
+#endif
+               uint64_t page_sz;
+
+               /* nohuge mode is legacy mode */
+               internal_config.legacy_mem = 1;
+
+               /* nohuge mode is single-file segments mode */
+               internal_config.single_file_segments = 1;
+
+               /* create a memseg list */
+               msl = &mcfg->memsegs[0];
+
+               page_sz = RTE_PGSIZE_4K;
+               n_segs = internal_config.memory / page_sz;
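+               /* e.g. 64 MB of --no-huge memory (an illustrative amount)
+                * split into 4 KB pages yields 16384 memsegs
+                */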
+
+               if (rte_fbarray_init(&msl->memseg_arr, "nohugemem", n_segs,
+                                       sizeof(struct rte_memseg))) {
+                       RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n");
+                       return -1;
+               }
+
+               /* set up parameters for anonymous mmap */
+               fd = -1;
+               flags = MAP_PRIVATE | MAP_ANONYMOUS;
+
+#ifdef MEMFD_SUPPORTED
+               /* create a memfd and store it in the segment fd table */
+               memfd = memfd_create("nohuge", 0);
+               if (memfd < 0) {
+                       RTE_LOG(DEBUG, EAL, "Cannot create memfd: %s\n",
+                                       strerror(errno));
+                       RTE_LOG(DEBUG, EAL, "Falling back to anonymous map\n");
+               } else {
+                       /* we got an fd - now resize it */
+                       if (ftruncate(memfd, internal_config.memory) < 0) {
+                               RTE_LOG(ERR, EAL, "Cannot resize memfd: %s\n",
+                                               strerror(errno));
+                               RTE_LOG(ERR, EAL, "Falling back to anonymous map\n");
+                               close(memfd);
+                       } else {
+                               /* creating memfd-backed file was successful.
+                                * we want changes to memfd to be visible to
+                                * other processes (such as vhost backend), so
+                                * map it as shared memory.
+                                */
+                               RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n");
+                               fd = memfd;
+                               flags = MAP_SHARED;
+                       }
+               }
+#endif
+               addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE,
+                               flags, fd, 0);
+               if (addr == MAP_FAILED) {
+                       RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__,
+                                       strerror(errno));
+                       return -1;
+               }
+               msl->base_va = addr;
+               msl->page_sz = page_sz;
+               msl->socket_id = 0;
+               msl->len = internal_config.memory;
+
+               /* we're in single-file segments mode, so only the segment list
+                * fd needs to be set up.
+                */
+               if (fd != -1) {
+                       if (eal_memalloc_set_seg_list_fd(0, fd) < 0) {
+                               RTE_LOG(ERR, EAL, "Cannot set up segment list fd\n");
+                               /* not a serious error, proceed */
+                       }
+               }
+
+               /* populate memsegs. each memseg is one page long */
+               for (cur_seg = 0; cur_seg < n_segs; cur_seg++) {
+                       arr = &msl->memseg_arr;
+
+                       ms = rte_fbarray_get(arr, cur_seg);
+                       if (rte_eal_iova_mode() == RTE_IOVA_VA)
+                               ms->iova = (uintptr_t)addr;
+                       else
+                               ms->iova = RTE_BAD_IOVA;
+                       ms->addr = addr;
+                       ms->hugepage_sz = page_sz;
+                       ms->socket_id = 0;
+                       ms->len = page_sz;
+
+                       rte_fbarray_set_used(arr, cur_seg);
+
+                       addr = RTE_PTR_ADD(addr, (size_t)page_sz);
+               }
+               if (mcfg->dma_maskbits &&
+                   rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) {
+                       RTE_LOG(ERR, EAL,
+                               "%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask.\n",
+                               __func__);
+                       if (rte_eal_iova_mode() == RTE_IOVA_VA &&
+                           rte_eal_using_phys_addrs())
+                               RTE_LOG(ERR, EAL,
+                                       "%s(): Please try initializing EAL with --iova-mode=pa parameter.\n",
+                                       __func__);
+                       goto fail;
+               }
+               return 0;
+       }
+
+       /* calculate the total number of hugepages available. At this point we
+        * haven't yet started sorting them, so they are all counted on socket 0 */
+       for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
+               /* meanwhile, also initialize used_hp hugepage sizes in used_hp */
+               used_hp[i].hugepage_sz = internal_config.hugepage_info[i].hugepage_sz;
+
+               nr_hugepages += internal_config.hugepage_info[i].num_pages[0];
+       }
+
+       /*
+        * allocate a memory area for the hugepage table.
+        * this isn't shared memory yet. because we still need to do some
+        * processing on these pages, shared memory will be created at a
+        * later stage.
+        */
+       tmp_hp = malloc(nr_hugepages * sizeof(struct hugepage_file));
+       if (tmp_hp == NULL)
+               goto fail;
+
+       memset(tmp_hp, 0, nr_hugepages * sizeof(struct hugepage_file));
+
+       hp_offset = 0; /* where we start the current page size entries */
+
+       huge_register_sigbus();
+
+       /* make a copy of socket_mem, needed for balanced allocation. */
+       for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
+               memory[i] = internal_config.socket_mem[i];
+
+       /* map all hugepages and sort them */
+       for (i = 0; i < (int)internal_config.num_hugepage_sizes; i++) {
+               unsigned pages_old, pages_new;
+               struct hugepage_info *hpi;
+
+               /*
+                * we don't yet mark hugepages as used at this stage, so
+                * we just map all hugepages available to the system;
+                * all hugepages are still located on socket 0
+                */
+               hpi = &internal_config.hugepage_info[i];
+
+               if (hpi->num_pages[0] == 0)
+                       continue;
+
+               /* map all hugepages available */
+               pages_old = hpi->num_pages[0];
+               pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, memory);
+               if (pages_new < pages_old) {
+                       RTE_LOG(DEBUG, EAL,
+                               "Only %d of %d hugepages of size %u MB allocated\n",
+                               pages_new, pages_old,
+                               (unsigned)(hpi->hugepage_sz / 0x100000));
+
+                       int pages = pages_old - pages_new;
+
+                       nr_hugepages -= pages;
+                       hpi->num_pages[0] = pages_new;
+                       if (pages_new == 0)
+                               continue;
+               }
+
+               if (phys_addrs_available &&
+                               rte_eal_iova_mode() != RTE_IOVA_VA) {
+                       /* find physical addresses for each hugepage */
+                       if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
+                               RTE_LOG(DEBUG, EAL, "Failed to find phys addr "
+                                       "for %u MB pages\n",
+                                       (unsigned int)(hpi->hugepage_sz / 0x100000));
+                               goto fail;
+                       }
+               } else {
+                       /* set physical addresses for each hugepage */
+                       if (set_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
+                               RTE_LOG(DEBUG, EAL, "Failed to set phys addr "
+                                       "for %u MB pages\n",
+                                       (unsigned int)(hpi->hugepage_sz / 0x100000));
+                               goto fail;
+                       }
+               }
+
+               if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0){
+                       RTE_LOG(DEBUG, EAL, "Failed to find NUMA socket for %u MB pages\n",
+                                       (unsigned)(hpi->hugepage_sz / 0x100000));
+                       goto fail;
+               }
+
+               qsort(&tmp_hp[hp_offset], hpi->num_pages[0],
+                     sizeof(struct hugepage_file), cmp_physaddr);
+
+               /* we have processed a number of hugepages of this size, so advance the offset */
+               hp_offset += hpi->num_pages[0];
+       }
+
+       huge_recover_sigbus();
+
+       if (internal_config.memory == 0 && internal_config.force_sockets == 0)
+               internal_config.memory = eal_get_hugepage_mem_size();
+
+       nr_hugefiles = nr_hugepages;
+
+
+       /* clean out the numbers of pages */
+       for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++)
+               for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
+                       internal_config.hugepage_info[i].num_pages[j] = 0;
+
+       /* get hugepages for each socket */
+       for (i = 0; i < nr_hugefiles; i++) {
+               int socket = tmp_hp[i].socket_id;
+
+               /* find a hugepage info with right size and increment num_pages */
+               const int nb_hpsizes = RTE_MIN(MAX_HUGEPAGE_SIZES,
+                               (int)internal_config.num_hugepage_sizes);
+               for (j = 0; j < nb_hpsizes; j++) {
+                       if (tmp_hp[i].size ==
+                                       internal_config.hugepage_info[j].hugepage_sz) {
+                               internal_config.hugepage_info[j].num_pages[socket]++;
+                       }
+               }
+       }
+
+       /* make a copy of socket_mem, needed for number of pages calculation */
+       for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
+               memory[i] = internal_config.socket_mem[i];
+
+       /* calculate final number of pages */
+       nr_hugepages = calc_num_pages_per_socket(memory,
+                       internal_config.hugepage_info, used_hp,
+                       internal_config.num_hugepage_sizes);
+
+       /* error if not enough memory available */
+       if (nr_hugepages < 0)
+               goto fail;
+
+       /* reporting in! */
+       for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
+               for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
+                       if (used_hp[i].num_pages[j] > 0) {
+                               RTE_LOG(DEBUG, EAL,
+                                       "Requesting %u pages of size %uMB"
+                                       " from socket %i\n",
+                                       used_hp[i].num_pages[j],
+                                       (unsigned)
+                                       (used_hp[i].hugepage_sz / 0x100000),
+                                       j);
+                       }
+               }
+       }
+
+       /* create shared memory */
+       hugepage = create_shared_memory(eal_hugepage_data_path(),
+                       nr_hugefiles * sizeof(struct hugepage_file));
+
+       if (hugepage == NULL) {
+               RTE_LOG(ERR, EAL, "Failed to create shared memory!\n");
+               goto fail;
+       }
+       memset(hugepage, 0, nr_hugefiles * sizeof(struct hugepage_file));
+
+       /*
+        * unmap pages that we won't need (looks at used_hp).
+        * also, sets orig_va to NULL on pages that were unmapped.
+        */
+       if (unmap_unneeded_hugepages(tmp_hp, used_hp,
+                       internal_config.num_hugepage_sizes) < 0) {
+               RTE_LOG(ERR, EAL, "Unmapping and locking hugepages failed!\n");
+               goto fail;
+       }
+
+       /*
+        * copy the data from the malloc'd hugepage table to the actual shared
+        * memory. this procedure only copies those hugepages whose orig_va is
+        * not NULL, and it has overflow protection.
+        */
+       if (copy_hugepages_to_shared_mem(hugepage, nr_hugefiles,
+                       tmp_hp, nr_hugefiles) < 0) {
+               RTE_LOG(ERR, EAL, "Copying tables to shared memory failed!\n");
+               goto fail;
+       }
+
+#ifndef RTE_ARCH_64
+       /* for legacy 32-bit mode, we did not preallocate VA space, so do it */
+       if (internal_config.legacy_mem &&
+                       prealloc_segments(hugepage, nr_hugefiles)) {
+               RTE_LOG(ERR, EAL, "Could not preallocate VA space for hugepages\n");
+               goto fail;
+       }
+#endif
+
+       /* remap all pages we do need into memseg list VA space, so that those
+        * pages become first-class citizens in DPDK memory subsystem
+        */
+       if (remap_needed_hugepages(hugepage, nr_hugefiles)) {
+               RTE_LOG(ERR, EAL, "Couldn't remap hugepage files into memseg lists\n");
+               goto fail;
+       }
+
+       /* free the hugepage backing files */
+       if (internal_config.hugepage_unlink &&
+               unlink_hugepage_files(tmp_hp, internal_config.num_hugepage_sizes) < 0) {
+               RTE_LOG(ERR, EAL, "Unlinking hugepage files failed!\n");
+               goto fail;
+       }
+
+       /* free the temporary hugepage table */
+       free(tmp_hp);
+       tmp_hp = NULL;
+
+       munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));
+       hugepage = NULL;
+
+       /* we're not going to allocate more pages, so release VA space for
+        * unused memseg lists
+        */
+       for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+               struct rte_memseg_list *msl = &mcfg->memsegs[i];
+               size_t mem_sz;
+
+               /* skip inactive lists */
+               if (msl->base_va == NULL)
+                       continue;
+               /* skip lists where there is at least one page allocated */
+               if (msl->memseg_arr.count > 0)
+                       continue;
+               /* this is an unused list, deallocate it */
+               mem_sz = msl->len;
+               munmap(msl->base_va, mem_sz);
+               msl->base_va = NULL;
+
+               /* destroy backing fbarray */
+               rte_fbarray_destroy(&msl->memseg_arr);
+       }
+
+       if (mcfg->dma_maskbits &&
+           rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) {
+               RTE_LOG(ERR, EAL,
+                       "%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask.\n",
+                       __func__);
+               goto fail;
+       }
+
+       return 0;
+
+fail:
+       huge_recover_sigbus();
+       free(tmp_hp);
+       if (hugepage != NULL)
+               munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));
+
+       return -1;
+}
+
+static int __rte_unused
+hugepage_count_walk(const struct rte_memseg_list *msl, void *arg)
+{
+       struct hugepage_info *hpi = arg;
+
+       if (msl->page_sz != hpi->hugepage_sz)
+               return 0;
+
+       hpi->num_pages[msl->socket_id] += msl->memseg_arr.len;
+       return 0;
+}
+
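+/*
+ * Callback registered as an allocation validator when --socket-limit is in
+ * use: it always refuses the requested growth, so a socket's memory cannot
+ * expand past its configured limit.
+ */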
+static int
+limits_callback(int socket_id, size_t cur_limit, size_t new_len)
+{
+       RTE_SET_USED(socket_id);
+       RTE_SET_USED(cur_limit);
+       RTE_SET_USED(new_len);
+       return -1;
+}
+
+static int
+eal_hugepage_init(void)
+{
+       struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
+       uint64_t memory[RTE_MAX_NUMA_NODES];
+       int hp_sz_idx, socket_id;
+
+       test_phys_addrs_available();
+
+       memset(used_hp, 0, sizeof(used_hp));
+
+       for (hp_sz_idx = 0;
+                       hp_sz_idx < (int) internal_config.num_hugepage_sizes;
+                       hp_sz_idx++) {
+#ifndef RTE_ARCH_64
+               struct hugepage_info dummy;
+               unsigned int i;
+#endif
+               /* also initialize used_hp hugepage sizes in used_hp */
+               struct hugepage_info *hpi;
+               hpi = &internal_config.hugepage_info[hp_sz_idx];
+               used_hp[hp_sz_idx].hugepage_sz = hpi->hugepage_sz;
+
+#ifndef RTE_ARCH_64
+               /* for 32-bit, limit number of pages on socket to whatever we've
+                * preallocated, as we cannot allocate more.
+                */
+               memset(&dummy, 0, sizeof(dummy));
+               dummy.hugepage_sz = hpi->hugepage_sz;
+               if (rte_memseg_list_walk(hugepage_count_walk, &dummy) < 0)
+                       return -1;
+
+               for (i = 0; i < RTE_DIM(dummy.num_pages); i++) {
+                       hpi->num_pages[i] = RTE_MIN(hpi->num_pages[i],
+                                       dummy.num_pages[i]);
+               }
+#endif
+       }
+
+       /* make a copy of socket_mem, needed for balanced allocation. */
+       for (hp_sz_idx = 0; hp_sz_idx < RTE_MAX_NUMA_NODES; hp_sz_idx++)
+               memory[hp_sz_idx] = internal_config.socket_mem[hp_sz_idx];
+
+       /* calculate final number of pages */
+       if (calc_num_pages_per_socket(memory,
+                       internal_config.hugepage_info, used_hp,
+                       internal_config.num_hugepage_sizes) < 0)
+               return -1;
+
+       for (hp_sz_idx = 0;
+                       hp_sz_idx < (int)internal_config.num_hugepage_sizes;
+                       hp_sz_idx++) {
+               for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES;
+                               socket_id++) {
+                       struct rte_memseg **pages;
+                       struct hugepage_info *hpi = &used_hp[hp_sz_idx];
+                       unsigned int num_pages = hpi->num_pages[socket_id];
+                       int num_pages_alloc, i;
+
+                       if (num_pages == 0)
+                               continue;
+
+                       pages = malloc(sizeof(*pages) * num_pages);
+                       if (pages == NULL) {
+                               RTE_LOG(ERR, EAL,
+                                       "Failed to allocate page table for socket %i\n",
+                                       socket_id);
+                               return -1;
+                       }
+
+                       RTE_LOG(DEBUG, EAL, "Allocating %u pages of size %" PRIu64 "M on socket %i\n",
+                               num_pages, hpi->hugepage_sz >> 20, socket_id);
+
+                       num_pages_alloc = eal_memalloc_alloc_seg_bulk(pages,
+                                       num_pages, hpi->hugepage_sz,
+                                       socket_id, true);
+                       if (num_pages_alloc < 0) {
+                               free(pages);
+                               return -1;
+                       }
+
+                       /* mark preallocated pages as unfreeable */
+                       for (i = 0; i < num_pages_alloc; i++) {
+                               struct rte_memseg *ms = pages[i];
+                               ms->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
+                       }
+                       free(pages);
+               }
+       }
+       /* if socket limits were specified, set them */
+       if (internal_config.force_socket_limits) {
+               unsigned int i;
+               for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
+                       uint64_t limit = internal_config.socket_limit[i];
+                       if (limit == 0)
+                               continue;
+                       if (rte_mem_alloc_validator_register("socket-limit",
+                                       limits_callback, i, limit))
+                               RTE_LOG(ERR, EAL, "Failed to register socket limits validator callback\n");
+               }
+       }
+       return 0;
+}
+
+/*
+ * uses fstat to report the size of a file on disk
+ */
+static off_t
+getFileSize(int fd)
+{
+       struct stat st;
+       if (fstat(fd, &st) < 0)
+               return 0;
+       return st.st_size;
+}
+
+/*
+ * This creates the memory mappings in the secondary process to match those of
+ * the primary process. It goes through each memory segment in the DPDK runtime
+ * configuration and finds the hugepages which form that segment, mapping them
+ * in order to form a contiguous block in the virtual memory space.
+ */
+static int
+eal_legacy_hugepage_attach(void)
+{
+       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+       struct hugepage_file *hp = NULL;
+       unsigned int num_hp = 0;
+       unsigned int i = 0;
+       unsigned int cur_seg;
+       off_t size = 0;
+       int fd, fd_hugepage = -1;
+
+       if (aslr_enabled() > 0) {
+               RTE_LOG(WARNING, EAL, "WARNING: Address Space Layout Randomization "
+                               "(ASLR) is enabled in the kernel.\n");
+               RTE_LOG(WARNING, EAL, "   This may cause issues with mapping memory "
+                               "into secondary processes\n");
+       }
+
+       test_phys_addrs_available();
+
+       fd_hugepage = open(eal_hugepage_data_path(), O_RDONLY);
+       if (fd_hugepage < 0) {
+               RTE_LOG(ERR, EAL, "Could not open %s\n",
+                               eal_hugepage_data_path());
+               goto error;
+       }
+
+       size = getFileSize(fd_hugepage);
+       hp = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd_hugepage, 0);
+       if (hp == MAP_FAILED) {
+               RTE_LOG(ERR, EAL, "Could not mmap %s\n",
+                               eal_hugepage_data_path());
+               goto error;
+       }
+
+       num_hp = size / sizeof(struct hugepage_file);
+       RTE_LOG(DEBUG, EAL, "Analysing %u files\n", num_hp);
+
+       /* map all segments into memory to make sure we get the addrs. the
+        * segments themselves are already in the memseg list (which is shared
+        * and has its VA space preallocated), so we just need to map everything
+        * into the correct addresses.
+        */
+       for (i = 0; i < num_hp; i++) {
+               struct hugepage_file *hf = &hp[i];
+               size_t map_sz = hf->size;
+               void *map_addr = hf->final_va;
+               int msl_idx, ms_idx;
+               struct rte_memseg_list *msl;
+               struct rte_memseg *ms;
+
+               /* if size is zero, no more pages left */
+               if (map_sz == 0)
+                       break;
+
+               fd = open(hf->filepath, O_RDWR);
+               if (fd < 0) {
+                       RTE_LOG(ERR, EAL, "Could not open %s: %s\n",
+                               hf->filepath, strerror(errno));
+                       goto error;
+               }
+
+               map_addr = mmap(map_addr, map_sz, PROT_READ | PROT_WRITE,
+                               MAP_SHARED | MAP_FIXED, fd, 0);
+               if (map_addr == MAP_FAILED) {
+                       RTE_LOG(ERR, EAL, "Could not map %s: %s\n",
+                               hf->filepath, strerror(errno));
+                       goto fd_error;
+               }
+
+               /* set shared lock on the file. */
+               if (flock(fd, LOCK_SH) < 0) {
+                       RTE_LOG(DEBUG, EAL, "%s(): Locking file failed: %s\n",
+                               __func__, strerror(errno));
+                       goto fd_error;
+               }
+
+               /* find segment data */
+               msl = rte_mem_virt2memseg_list(map_addr);
+               if (msl == NULL) {
+                       RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg list\n",
+                               __func__);
+                       goto fd_error;
+               }
+               ms = rte_mem_virt2memseg(map_addr, msl);
+               if (ms == NULL) {
+                       RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg\n",
+                               __func__);
+                       goto fd_error;
+               }
+
+               msl_idx = msl - mcfg->memsegs;
+               ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
+               if (ms_idx < 0) {
+                       RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg idx\n",
+                               __func__);
+                       goto fd_error;
+               }
+
+               /* store segment fd internally */
+               if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0)
+                       RTE_LOG(ERR, EAL, "Could not store segment fd: %s\n",
+                               rte_strerror(rte_errno));
+       }
+       /* unmap the hugepage config file, since we are done using it */
+       munmap(hp, size);
+       close(fd_hugepage);
+       return 0;
+
+fd_error:
+       close(fd);
+error:
+       /* unmap whatever we managed to map before the failure */
+       for (cur_seg = 0; cur_seg < i; cur_seg++) {
+               struct hugepage_file *hf = &hp[cur_seg];
+               size_t map_sz = hf->size;
+               void *map_addr = hf->final_va;
+
+               munmap(map_addr, map_sz);
+       }
+       if (hp != NULL && hp != MAP_FAILED)
+               munmap(hp, size);
+       if (fd_hugepage >= 0)
+               close(fd_hugepage);
+       return -1;
+}
+
+static int
+eal_hugepage_attach(void)
+{
+       if (eal_memalloc_sync_with_primary()) {
+               RTE_LOG(ERR, EAL, "Could not map memory from primary process\n");
+               if (aslr_enabled() > 0)
+                       RTE_LOG(ERR, EAL, "It is recommended to disable ASLR in the kernel and retry running both primary and secondary processes\n");
+               return -1;
+       }
+       return 0;
+}
+
+int
+rte_eal_hugepage_init(void)
+{
+       return internal_config.legacy_mem ?
+                       eal_legacy_hugepage_init() :
+                       eal_hugepage_init();
+}
+
+int
+rte_eal_hugepage_attach(void)
+{
+       return internal_config.legacy_mem ?
+                       eal_legacy_hugepage_attach() :
+                       eal_hugepage_attach();
+}
+
+int
+rte_eal_using_phys_addrs(void)
+{
+       return phys_addrs_available;
+}
+
+static int __rte_unused
+memseg_primary_init_32(void)
+{
+       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+       int active_sockets, hpi_idx, msl_idx = 0;
+       unsigned int socket_id, i;
+       struct rte_memseg_list *msl;
+       uint64_t extra_mem_per_socket, total_extra_mem, total_requested_mem;
+       uint64_t max_mem;
+
+       /* no-huge does not need this at all */
+       if (internal_config.no_hugetlbfs)
+               return 0;
+
+       /* this is a giant hack, but desperate times call for desperate
+        * measures. in legacy 32-bit mode, we cannot preallocate VA space,
+        * because having upwards of 2 gigabytes of VA space already mapped will
+        * interfere with our ability to map and sort hugepages.
+        *
+        * therefore, in legacy 32-bit mode, we will be initializing memseg
+        * lists much later - in eal_memory.c, right after we unmap all the
+        * unneeded pages. this will not affect secondary processes, as those
+        * should be able to mmap the space without (too many) problems.
+        */
+       if (internal_config.legacy_mem)
+               return 0;
+
+       /* 32-bit mode is a very special case. we cannot know in advance where
+        * the user will want to allocate their memory, so we have to do some
+        * heuristics.
+        */
+       active_sockets = 0;
+       total_requested_mem = 0;
+       if (internal_config.force_sockets)
+               for (i = 0; i < rte_socket_count(); i++) {
+                       uint64_t mem;
+
+                       socket_id = rte_socket_id_by_idx(i);
+                       mem = internal_config.socket_mem[socket_id];
+
+                       if (mem == 0)
+                               continue;
+
+                       active_sockets++;
+                       total_requested_mem += mem;
+               }
+       else
+               total_requested_mem = internal_config.memory;
+
+       max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
+       if (total_requested_mem > max_mem) {
+               RTE_LOG(ERR, EAL, "Invalid parameters: 32-bit process can at most use %uM of memory\n",
+                               (unsigned int)(max_mem >> 20));
+               return -1;
+       }
+       total_extra_mem = max_mem - total_requested_mem;
+       extra_mem_per_socket = active_sockets == 0 ? total_extra_mem :
+                       total_extra_mem / active_sockets;
+
+       /* the allocation logic is a little bit convoluted, but here's how it
+        * works, in a nutshell:
+        *  - if the user hasn't specified which sockets to allocate memory on
+        *    via --socket-mem, we allocate all of our memory on the master
+        *    lcore's socket.
+        *  - if the user has specified sockets to allocate memory on, there may
+        *    be some "unused" memory left (e.g. if the requested --socket-mem
+        *    amounts do not add up to the 2 gigabyte limit), so we add it to
+        *    all sockets that are in use equally.
+        *
+        * page sizes are sorted by size in descending order, so we can safely
+        * assume that we dispense with bigger page sizes first.
+        */
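+       /*
+        * Illustrative example (numbers are hypothetical, not defaults): with
+        * a 2G ceiling and --socket-mem=512,256, active_sockets is 2,
+        * total_requested_mem is 768M, total_extra_mem is 1280M, and each of
+        * the two active sockets gets an extra 640M of VA space on top of its
+        * explicit request.
+        */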
+
+       /* create memseg lists */
+       for (i = 0; i < rte_socket_count(); i++) {
+               int hp_sizes = (int) internal_config.num_hugepage_sizes;
+               uint64_t max_socket_mem, cur_socket_mem;
+               unsigned int master_lcore_socket;
+               struct rte_config *cfg = rte_eal_get_configuration();
+               bool skip;
+
+               socket_id = rte_socket_id_by_idx(i);
+
+#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
+               if (socket_id > 0)
+                       break;
+#endif
+
+               /* if we didn't specifically request memory on this socket */
+               skip = active_sockets != 0 &&
+                               internal_config.socket_mem[socket_id] == 0;
+               /* ...or if we didn't specifically request memory on *any*
+                * socket, and this is not the master lcore's socket
+                */
+               master_lcore_socket = rte_lcore_to_socket_id(cfg->master_lcore);
+               skip |= active_sockets == 0 && socket_id != master_lcore_socket;
+
+               if (skip) {
+                       RTE_LOG(DEBUG, EAL, "Will not preallocate memory on socket %u\n",
+                                       socket_id);
+                       continue;
+               }
+
+               /* max amount of memory on this socket */
+               max_socket_mem = (active_sockets != 0 ?
+                                       internal_config.socket_mem[socket_id] :
+                                       internal_config.memory) +
+                                       extra_mem_per_socket;
+               cur_socket_mem = 0;
+
+               for (hpi_idx = 0; hpi_idx < hp_sizes; hpi_idx++) {
+                       uint64_t max_pagesz_mem, cur_pagesz_mem = 0;
+                       uint64_t hugepage_sz;
+                       struct hugepage_info *hpi;
+                       int type_msl_idx, max_segs, total_segs = 0;
+
+                       hpi = &internal_config.hugepage_info[hpi_idx];
+                       hugepage_sz = hpi->hugepage_sz;
+
+                       /* check if pages are actually available */
+                       if (hpi->num_pages[socket_id] == 0)
+                               continue;
+
+                       max_segs = RTE_MAX_MEMSEG_PER_TYPE;
+                       max_pagesz_mem = max_socket_mem - cur_socket_mem;
+
+                       /* make it multiple of page size */
+                       max_pagesz_mem = RTE_ALIGN_FLOOR(max_pagesz_mem,
+                                       hugepage_sz);
+
+                       RTE_LOG(DEBUG, EAL, "Attempting to preallocate "
+                                       "%" PRIu64 "M on socket %i\n",
+                                       max_pagesz_mem >> 20, socket_id);
+
+                       type_msl_idx = 0;
+                       while (cur_pagesz_mem < max_pagesz_mem &&
+                                       total_segs < max_segs) {
+                               uint64_t cur_mem;
+                               unsigned int n_segs;
+
+                               if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
+                                       RTE_LOG(ERR, EAL,
+                                               "No more space in memseg lists, please increase %s\n",
+                                               RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
+                                       return -1;
+                               }
+
+                               msl = &mcfg->memsegs[msl_idx];
+
+                               cur_mem = get_mem_amount(hugepage_sz,
+                                               max_pagesz_mem);
+                               n_segs = cur_mem / hugepage_sz;
+
+                               if (alloc_memseg_list(msl, hugepage_sz, n_segs,
+                                               socket_id, type_msl_idx)) {
+                                       /* failing to allocate a memseg list is
+                                        * a serious error.
+                                        */
+                                       RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n");
+                                       return -1;
+                               }
+
+                               if (alloc_va_space(msl)) {
+                                       /* if we couldn't allocate VA space, we
+                                        * can try with smaller page sizes.
+                                        */
+                                       RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list, retrying with different page size\n");
+                                       /* deallocate memseg list */
+                                       if (free_memseg_list(msl))
+                                               return -1;
+                                       break;
+                               }
+
+                               total_segs += msl->memseg_arr.len;
+                               cur_pagesz_mem = total_segs * hugepage_sz;
+                               type_msl_idx++;
+                               msl_idx++;
+                       }
+                       cur_socket_mem += cur_pagesz_mem;
+               }
+               if (cur_socket_mem == 0) {
+                       RTE_LOG(ERR, EAL, "Cannot allocate VA space on socket %u\n",
+                               socket_id);
+                       return -1;
+               }
+       }
+
+       return 0;
+}
+
+static int __rte_unused
+memseg_primary_init(void)
+{
+       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+       struct memtype {
+               uint64_t page_sz;
+               int socket_id;
+       } *memtypes = NULL;
+       int i, hpi_idx, msl_idx, ret = -1; /* fail unless told to succeed */
+       struct rte_memseg_list *msl;
+       uint64_t max_mem, max_mem_per_type;
+       unsigned int max_seglists_per_type;
+       unsigned int n_memtypes, cur_type;
+
+       /* no-huge does not need this at all */
+       if (internal_config.no_hugetlbfs)
+               return 0;
+
+       /*
+        * figuring out amount of memory we're going to have is a long and very
+        * involved process. the basic element we're operating with is a memory
+        * type, defined as a combination of NUMA node ID and page size (so that
+        * e.g. 2 sockets with 2 page sizes yield 4 memory types in total).
+        *
+        * deciding amount of memory going towards each memory type is a
+        * balancing act between maximum segments per type, maximum memory per
+        * type, and number of detected NUMA nodes. the goal is to make sure
+        * each memory type gets at least one memseg list.
+        *
+        * the total amount of memory is limited by RTE_MAX_MEM_MB value.
+        *
+        * the total amount of memory per type is limited by either
+        * RTE_MAX_MEM_MB_PER_TYPE, or by RTE_MAX_MEM_MB divided by the number
+        * of detected NUMA nodes. additionally, maximum number of segments per
+        * type is also limited by RTE_MAX_MEMSEG_PER_TYPE. this is because for
+        * smaller page sizes, it can take hundreds of thousands of segments to
+        * reach the above specified per-type memory limits.
+        *
+        * additionally, each type may have multiple memseg lists associated
+        * with it, each limited by either RTE_MAX_MEM_MB_PER_LIST for bigger
+        * page sizes, or RTE_MAX_MEMSEG_PER_LIST segments for smaller ones.
+        *
+        * the number of memseg lists per type is decided based on the above
+        * limits, and also taking number of detected NUMA nodes, to make sure
+        * that we don't run out of memseg lists before we populate all NUMA
+        * nodes with memory.
+        *
+        * we do this in three stages. first, we collect the number of types.
+        * then, we figure out memory constraints and populate the list of
+        * would-be memseg lists. then, we go ahead and allocate the memseg
+        * lists.
+        */
+
+       /* create space for mem types */
+       n_memtypes = internal_config.num_hugepage_sizes * rte_socket_count();
+       memtypes = calloc(n_memtypes, sizeof(*memtypes));
+       if (memtypes == NULL) {
+               RTE_LOG(ERR, EAL, "Cannot allocate space for memory types\n");
+               return -1;
+       }
+
+       /* populate mem types */
+       cur_type = 0;
+       for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes;
+                       hpi_idx++) {
+               struct hugepage_info *hpi;
+               uint64_t hugepage_sz;
+
+               hpi = &internal_config.hugepage_info[hpi_idx];
+               hugepage_sz = hpi->hugepage_sz;
+
+               for (i = 0; i < (int) rte_socket_count(); i++, cur_type++) {
+                       int socket_id = rte_socket_id_by_idx(i);
+
+#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
+                       if (socket_id > 0)
+                               break;
+#endif
+                       memtypes[cur_type].page_sz = hugepage_sz;
+                       memtypes[cur_type].socket_id = socket_id;
+
+                       RTE_LOG(DEBUG, EAL, "Detected memory type: "
+                               "socket_id:%u hugepage_sz:%" PRIu64 "\n",
+                               socket_id, hugepage_sz);
+               }
+       }
+       /* number of memtypes could have been lower due to no NUMA support */
+       n_memtypes = cur_type;
+
+       /* set up limits for types */
+       max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
+       max_mem_per_type = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20,
+                       max_mem / n_memtypes);
+       /*
+        * limit maximum number of segment lists per type to ensure there's
+        * space for memseg lists for all NUMA nodes with all page sizes
+        */
+       max_seglists_per_type = RTE_MAX_MEMSEG_LISTS / n_memtypes;
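+       /*
+        * Illustrative example (the RTE_MAX_MEMSEG_LISTS value is
+        * hypothetical): with 4 memory types and RTE_MAX_MEMSEG_LISTS set to
+        * 64, each type may take up at most 16 memseg lists, and its memory is
+        * capped at the smaller of RTE_MAX_MEM_MB_PER_TYPE and a quarter of
+        * RTE_MAX_MEM_MB.
+        */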
+
+       if (max_seglists_per_type == 0) {
+               RTE_LOG(ERR, EAL, "Cannot accommodate all memory types, please increase %s\n",
+                       RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
+               goto out;
+       }
+
+       /* go through all mem types and create segment lists */
+       msl_idx = 0;
+       for (cur_type = 0; cur_type < n_memtypes; cur_type++) {
+               unsigned int cur_seglist, n_seglists, n_segs;
+               unsigned int max_segs_per_type, max_segs_per_list;
+               struct memtype *type = &memtypes[cur_type];
+               uint64_t max_mem_per_list, pagesz;
+               int socket_id;
+
+               pagesz = type->page_sz;
+               socket_id = type->socket_id;
+
+               /*
+                * we need to create segment lists for this type. we must take
+                * into account the following things:
+                *
+                * 1. total amount of memory we can use for this memory type
+                * 2. total amount of memory per memseg list allowed
+                * 3. number of segments needed to fit the amount of memory
+                * 4. number of segments allowed per type
+                * 5. number of segments allowed per memseg list
+                * 6. number of memseg lists we are allowed to take up
+                */
+
+               /* calculate how many segments we will need in total */
+               max_segs_per_type = max_mem_per_type / pagesz;
+               /* limit number of segments to maximum allowed per type */
+               max_segs_per_type = RTE_MIN(max_segs_per_type,
+                               (unsigned int)RTE_MAX_MEMSEG_PER_TYPE);
+               /* limit number of segments to maximum allowed per list */
+               max_segs_per_list = RTE_MIN(max_segs_per_type,
+                               (unsigned int)RTE_MAX_MEMSEG_PER_LIST);
+
+               /* calculate how much memory we can have per segment list */
+               max_mem_per_list = RTE_MIN(max_segs_per_list * pagesz,
+                               (uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20);
+
+               /* calculate how many segments each segment list will have */
+               n_segs = RTE_MIN(max_segs_per_list, max_mem_per_list / pagesz);
+
+               /* calculate how many segment lists we can have */
+               n_seglists = RTE_MIN(max_segs_per_type / n_segs,
+                               max_mem_per_type / max_mem_per_list);
+
+               /* limit number of segment lists according to our maximum */
+               n_seglists = RTE_MIN(n_seglists, max_seglists_per_type);
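+               /*
+                * Illustrative example (limits are hypothetical): with 1G
+                * pages, a 128G per-type memory limit, 8192 segments per list
+                * and a 32G per-list memory limit, max_segs_per_type and
+                * max_segs_per_list are both 128, max_mem_per_list is 32G,
+                * n_segs is 32 and n_seglists is 4, subject to the
+                * max_seglists_per_type cap just applied.
+                */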
+
+               RTE_LOG(DEBUG, EAL, "Creating %i segment lists: "
+                               "n_segs:%i socket_id:%i hugepage_sz:%" PRIu64 "\n",
+                       n_seglists, n_segs, socket_id, pagesz);
+
+               /* create all segment lists */
+               for (cur_seglist = 0; cur_seglist < n_seglists; cur_seglist++) {
+                       if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
+                               RTE_LOG(ERR, EAL,
+                                       "No more space in memseg lists, please increase %s\n",
+                                       RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
+                               goto out;
+                       }
+                       msl = &mcfg->memsegs[msl_idx++];
+
+                       if (alloc_memseg_list(msl, pagesz, n_segs,
+                                       socket_id, cur_seglist))
+                               goto out;
+
+                       if (alloc_va_space(msl)) {
+                               RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
+                               goto out;
+                       }
+               }
+       }
+       /* we're successful */
+       ret = 0;
+out:
+       free(memtypes);
+       return ret;
+}
+
+static int
+memseg_secondary_init(void)
+{
+       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+       int msl_idx = 0;
+       struct rte_memseg_list *msl;
+
+       for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+
+               msl = &mcfg->memsegs[msl_idx];
+
+               /* skip empty memseg lists */
+               if (msl->memseg_arr.len == 0)
+                       continue;
+
+               if (rte_fbarray_attach(&msl->memseg_arr)) {
+                       RTE_LOG(ERR, EAL, "Cannot attach to primary process memseg lists\n");
+                       return -1;
+               }
+
+               /* preallocate VA space */
+               if (alloc_va_space(msl)) {
+                       RTE_LOG(ERR, EAL, "Cannot preallocate VA space for hugepage memory\n");
+                       return -1;
+               }
+       }
+
+       return 0;
+}
+
+int
+rte_eal_memseg_init(void)
+{
+       /* increase rlimit to maximum */
+       struct rlimit lim;
+
+       if (getrlimit(RLIMIT_NOFILE, &lim) == 0) {
+               /* set limit to maximum */
+               lim.rlim_cur = lim.rlim_max;
+
+               if (setrlimit(RLIMIT_NOFILE, &lim) < 0) {
+                       RTE_LOG(DEBUG, EAL, "Setting maximum number of open files failed: %s\n",
+                                       strerror(errno));
+               } else {
+                       RTE_LOG(DEBUG, EAL, "Setting maximum number of open files to %"
+                                       PRIu64 "\n",
+                                       (uint64_t)lim.rlim_cur);
+               }
+       } else {
+               RTE_LOG(ERR, EAL, "Cannot get current resource limits\n");
+       }
+
+       return rte_eal_process_type() == RTE_PROC_PRIMARY ?
+#ifndef RTE_ARCH_64
+                       memseg_primary_init_32() :
+#else
+                       memseg_primary_init() :
+#endif
+                       memseg_secondary_init();
+}
diff --git a/lib/librte_eal/linux/eal/eal_thread.c b/lib/librte_eal/linux/eal/eal_thread.c
new file mode 100644 (file)
index 0000000..379773b
--- /dev/null
@@ -0,0 +1,188 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <sched.h>
+#include <sys/queue.h>
+#include <sys/syscall.h>
+
+#include <rte_debug.h>
+#include <rte_atomic.h>
+#include <rte_launch.h>
+#include <rte_log.h>
+#include <rte_memory.h>
+#include <rte_per_lcore.h>
+#include <rte_eal.h>
+#include <rte_lcore.h>
+
+#include "eal_private.h"
+#include "eal_thread.h"
+
+RTE_DEFINE_PER_LCORE(unsigned, _lcore_id) = LCORE_ID_ANY;
+RTE_DEFINE_PER_LCORE(unsigned, _socket_id) = (unsigned)SOCKET_ID_ANY;
+RTE_DEFINE_PER_LCORE(rte_cpuset_t, _cpuset);
+
+/*
+ * Send a message to a slave lcore identified by slave_id to call a
+ * function f with argument arg. Once the execution is done, the
+ * remote lcore switches to the FINISHED state.
+ */
+int
+rte_eal_remote_launch(int (*f)(void *), void *arg, unsigned slave_id)
+{
+       int n;
+       char c = 0;
+       int m2s = lcore_config[slave_id].pipe_master2slave[1];
+       int s2m = lcore_config[slave_id].pipe_slave2master[0];
+
+       if (lcore_config[slave_id].state != WAIT)
+               return -EBUSY;
+
+       lcore_config[slave_id].f = f;
+       lcore_config[slave_id].arg = arg;
+
+       /* send message */
+       n = 0;
+       while (n == 0 || (n < 0 && errno == EINTR))
+               n = write(m2s, &c, 1);
+       if (n < 0)
+               rte_panic("cannot write on configuration pipe\n");
+
+       /* wait ack */
+       do {
+               n = read(s2m, &c, 1);
+       } while (n < 0 && errno == EINTR);
+
+       if (n <= 0)
+               rte_panic("cannot read on configuration pipe\n");
+
+       return 0;
+}
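+
+/*
+ * Typical usage from application code (an illustrative sketch; my_worker and
+ * my_arg are placeholder names, not part of this file):
+ *
+ *     rte_eal_remote_launch(my_worker, my_arg, slave_lcore_id);
+ *     ...
+ *     ret = rte_eal_wait_lcore(slave_lcore_id);
+ */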
+
+/* set affinity for current EAL thread */
+static int
+eal_thread_set_affinity(void)
+{
+       unsigned lcore_id = rte_lcore_id();
+
+       /* acquire system unique id  */
+       rte_gettid();
+
+       /* update EAL thread core affinity */
+       return rte_thread_set_affinity(&lcore_config[lcore_id].cpuset);
+}
+
+void eal_thread_init_master(unsigned lcore_id)
+{
+       /* set the lcore ID in per-lcore memory area */
+       RTE_PER_LCORE(_lcore_id) = lcore_id;
+
+       /* set CPU affinity */
+       if (eal_thread_set_affinity() < 0)
+               rte_panic("cannot set affinity\n");
+}
+
+/* main loop of threads */
+__attribute__((noreturn)) void *
+eal_thread_loop(__attribute__((unused)) void *arg)
+{
+       char c;
+       int n, ret;
+       unsigned lcore_id;
+       pthread_t thread_id;
+       int m2s, s2m;
+       char cpuset[RTE_CPU_AFFINITY_STR_LEN];
+
+       thread_id = pthread_self();
+
+       /* retrieve our lcore_id from the configuration structure */
+       RTE_LCORE_FOREACH_SLAVE(lcore_id) {
+               if (thread_id == lcore_config[lcore_id].thread_id)
+                       break;
+       }
+       if (lcore_id == RTE_MAX_LCORE)
+               rte_panic("cannot retrieve lcore id\n");
+
+       m2s = lcore_config[lcore_id].pipe_master2slave[0];
+       s2m = lcore_config[lcore_id].pipe_slave2master[1];
+
+       /* set the lcore ID in per-lcore memory area */
+       RTE_PER_LCORE(_lcore_id) = lcore_id;
+
+       /* set CPU affinity */
+       if (eal_thread_set_affinity() < 0)
+               rte_panic("cannot set affinity\n");
+
+       ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset));
+
+       RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%zx;cpuset=[%s%s])\n",
+               lcore_id, (uintptr_t)thread_id, cpuset, ret == 0 ? "" : "...");
+
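+       /*
+        * Command protocol, in short: the master writes a single byte on the
+        * master-to-slave pipe, this thread acks with one byte on the
+        * slave-to-master pipe, then runs the registered function and stores
+        * its return value and final state in lcore_config.
+        */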
+       /* read on our pipe to get commands */
+       while (1) {
+               void *fct_arg;
+
+               /* wait command */
+               do {
+                       n = read(m2s, &c, 1);
+               } while (n < 0 && errno == EINTR);
+
+               if (n <= 0)
+                       rte_panic("cannot read on configuration pipe\n");
+
+               lcore_config[lcore_id].state = RUNNING;
+
+               /* send ack */
+               n = 0;
+               while (n == 0 || (n < 0 && errno == EINTR))
+                       n = write(s2m, &c, 1);
+               if (n < 0)
+                       rte_panic("cannot write on configuration pipe\n");
+
+               if (lcore_config[lcore_id].f == NULL)
+                       rte_panic("NULL function pointer\n");
+
+               /* call the function and store the return value */
+               fct_arg = lcore_config[lcore_id].arg;
+               ret = lcore_config[lcore_id].f(fct_arg);
+               lcore_config[lcore_id].ret = ret;
+               rte_wmb();
+
+               /* when a service core returns, it should go directly to WAIT
+                * state, because the application will not lcore_wait() for it.
+                */
+               if (lcore_config[lcore_id].core_role == ROLE_SERVICE)
+                       lcore_config[lcore_id].state = WAIT;
+               else
+                       lcore_config[lcore_id].state = FINISHED;
+       }
+
+       /* never reached */
+       /* pthread_exit(NULL); */
+       /* return NULL; */
+}
+
+/* return the TID of the calling thread, obtained via the gettid() syscall */
+int rte_sys_gettid(void)
+{
+       return (int)syscall(SYS_gettid);
+}
+
+int rte_thread_setname(pthread_t id, const char *name)
+{
+       int ret = ENOSYS;
+#if defined(__GLIBC__) && defined(__GLIBC_PREREQ)
+#if __GLIBC_PREREQ(2, 12)
+       ret = pthread_setname_np(id, name);
+#endif
+#endif
+       RTE_SET_USED(id);
+       RTE_SET_USED(name);
+       return -ret;
+}
diff --git a/lib/librte_eal/linux/eal/eal_timer.c b/lib/librte_eal/linux/eal/eal_timer.c
new file mode 100644 (file)
index 0000000..bc8f051
--- /dev/null
@@ -0,0 +1,266 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation.
+ * Copyright(c) 2012-2013 6WIND S.A.
+ */
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <sys/mman.h>
+#include <sys/queue.h>
+#include <pthread.h>
+#include <errno.h>
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_cycles.h>
+#include <rte_lcore.h>
+#include <rte_memory.h>
+#include <rte_eal.h>
+#include <rte_debug.h>
+
+#include "eal_private.h"
+#include "eal_internal_cfg.h"
+
+enum timer_source eal_timer_source = EAL_TIMER_HPET;
+
+#ifdef RTE_LIBEAL_USE_HPET
+
+#define DEV_HPET "/dev/hpet"
+
+/* Maximum number of counters. */
+#define HPET_TIMER_NUM 3
+
+/* General capabilities register */
+#define CLK_PERIOD_SHIFT     32 /* Clock period shift. */
+#define CLK_PERIOD_MASK      0xffffffff00000000ULL /* Clock period mask. */
+
+/**
+ * HPET timer registers. From the Intel IA-PC HPET (High Precision Event
+ * Timers) Specification.
+ */
+struct eal_hpet_regs {
+       /* Memory-mapped, software visible registers */
+       uint64_t capabilities;      /**< RO General Capabilities Register. */
+       uint64_t reserved0;         /**< Reserved for future use. */
+       uint64_t config;            /**< RW General Configuration Register. */
+       uint64_t reserved1;         /**< Reserved for future use. */
+       uint64_t isr;               /**< RW Clear General Interrupt Status. */
+       uint64_t reserved2[25];     /**< Reserved for future use. */
+       union {
+               uint64_t counter;   /**< RW Main Counter Value Register. */
+               struct {
+                       uint32_t counter_l; /**< RW Main Counter Low. */
+                       uint32_t counter_h; /**< RW Main Counter High. */
+               };
+       };
+       uint64_t reserved3;         /**< Reserved for future use. */
+       struct {
+               uint64_t config;    /**< RW Timer Config and Capability Reg. */
+               uint64_t comp;      /**< RW Timer Comparator Value Register. */
+               uint64_t fsb;       /**< RW FSB Interrupt Route Register. */
+               uint64_t reserved4; /**< Reserved for future use. */
+       } timers[HPET_TIMER_NUM]; /**< Set of HPET timers. */
+};
+
+/* Mmap'd hpet registers */
+static volatile struct eal_hpet_regs *eal_hpet = NULL;
+
+/* Period at which the HPET counter increments in
+ * femtoseconds (10^-15 seconds). */
+static uint32_t eal_hpet_resolution_fs = 0;
+
+/* Frequency of the HPET counter in Hz */
+static uint64_t eal_hpet_resolution_hz = 0;
+
+/* Incremented 4 times during one full wrap of the 32-bit HPET counter */
+static uint32_t eal_hpet_msb;
+
+static pthread_t msb_inc_thread_id;
+
+/*
+ * This function runs on a dedicated thread to update a global variable
+ * holding the MSB of the HPET counter (unfortunately, we need this
+ * because the HPET counter is 32 bits by default under Linux).
+ */
+static void *
+hpet_msb_inc(__attribute__((unused)) void *arg)
+{
+       uint32_t t;
+
+       while (1) {
+               t = (eal_hpet->counter_l >> 30);
+               if (t != (eal_hpet_msb & 3))
+                       eal_hpet_msb++;
+               sleep(10);
+       }
+       return NULL;
+}
+
+uint64_t
+rte_get_hpet_hz(void)
+{
+       if(internal_config.no_hpet)
+               rte_panic("Error, HPET called, but no HPET present\n");
+
+       return eal_hpet_resolution_hz;
+}
+
+uint64_t
+rte_get_hpet_cycles(void)
+{
+       uint32_t t, msb;
+       uint64_t ret;
+
+       if(internal_config.no_hpet)
+               rte_panic("Error, HPET called, but no HPET present\n");
+
+       t = eal_hpet->counter_l;
+       msb = eal_hpet_msb;
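+       /*
+        * eal_hpet_msb advances roughly once per quarter of the 32-bit counter
+        * range (see hpet_msb_inc()), i.e. 4 times per full wrap. The
+        * expression below compensates for the current quarter, so dividing by
+        * 4 yields the number of completed 32-bit wraps, which becomes the
+        * upper half of the returned 64-bit value.
+        */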
+       ret = (msb + 2 - (t >> 30)) / 4;
+       ret <<= 32;
+       ret += t;
+       return ret;
+}
+
+#endif
+
+#ifdef RTE_LIBEAL_USE_HPET
+/*
+ * Open and mmap /dev/hpet (high precision event timer) that will
+ * provide our time reference.
+ */
+int
+rte_eal_hpet_init(int make_default)
+{
+       int fd, ret;
+
+       if (internal_config.no_hpet) {
+               RTE_LOG(NOTICE, EAL, "HPET is disabled\n");
+               return -1;
+       }
+
+       fd = open(DEV_HPET, O_RDONLY);
+       if (fd < 0) {
+               RTE_LOG(ERR, EAL, "ERROR: Cannot open "DEV_HPET": %s!\n",
+                       strerror(errno));
+               internal_config.no_hpet = 1;
+               return -1;
+       }
+       eal_hpet = mmap(NULL, 1024, PROT_READ, MAP_SHARED, fd, 0);
+       if (eal_hpet == MAP_FAILED) {
+               RTE_LOG(ERR, EAL, "ERROR: Cannot mmap "DEV_HPET"!\n"
+                               "Please enable CONFIG_HPET_MMAP in your kernel configuration "
+                               "to allow HPET support.\n"
+                               "To run without using HPET, set CONFIG_RTE_LIBEAL_USE_HPET=n "
+                               "in your build configuration or use '--no-hpet' EAL flag.\n");
+               close(fd);
+               internal_config.no_hpet = 1;
+               return -1;
+       }
+       close(fd);
+
+       eal_hpet_resolution_fs = (uint32_t)((eal_hpet->capabilities &
+                                       CLK_PERIOD_MASK) >>
+                                       CLK_PERIOD_SHIFT);
+
+       eal_hpet_resolution_hz = (1000ULL*1000ULL*1000ULL*1000ULL*1000ULL) /
+               (uint64_t)eal_hpet_resolution_fs;
+
+       RTE_LOG(INFO, EAL, "HPET frequency is ~%"PRIu64" kHz\n",
+                       eal_hpet_resolution_hz/1000);
+
+       eal_hpet_msb = (eal_hpet->counter_l >> 30);
+
+       /* create a thread that will increment a global variable for
+        * msb (hpet is 32 bits by default under linux) */
+       ret = rte_ctrl_thread_create(&msb_inc_thread_id, "hpet-msb-inc", NULL,
+                                    hpet_msb_inc, NULL);
+       if (ret != 0) {
+               RTE_LOG(ERR, EAL, "ERROR: Cannot create HPET timer thread!\n");
+               internal_config.no_hpet = 1;
+               return -1;
+       }
+
+       if (make_default)
+               eal_timer_source = EAL_TIMER_HPET;
+       return 0;
+}
+#endif
+
+static void
+check_tsc_flags(void)
+{
+       char line[512];
+       FILE *stream;
+
+       stream = fopen("/proc/cpuinfo", "r");
+       if (!stream) {
+               RTE_LOG(WARNING, EAL, "WARNING: Unable to open /proc/cpuinfo\n");
+               return;
+       }
+
+       while (fgets(line, sizeof line, stream)) {
+               char *constant_tsc;
+               char *nonstop_tsc;
+
+               if (strncmp(line, "flags", 5) != 0)
+                       continue;
+
+               constant_tsc = strstr(line, "constant_tsc");
+               nonstop_tsc = strstr(line, "nonstop_tsc");
+               if (!constant_tsc || !nonstop_tsc)
+                       RTE_LOG(WARNING, EAL,
+                               "WARNING: cpu flags "
+                               "constant_tsc=%s "
+                               "nonstop_tsc=%s "
+                               "-> using unreliable clock cycles !\n",
+                               constant_tsc ? "yes":"no",
+                               nonstop_tsc ? "yes":"no");
+               break;
+       }
+
+       fclose(stream);
+}
+
+uint64_t
+get_tsc_freq(void)
+{
+#ifdef CLOCK_MONOTONIC_RAW
+#define NS_PER_SEC 1E9
+
+       struct timespec sleeptime = {.tv_nsec = NS_PER_SEC / 10 }; /* 1/10 second */
+
+       struct timespec t_start, t_end;
+       uint64_t tsc_hz;
+
+       if (clock_gettime(CLOCK_MONOTONIC_RAW, &t_start) == 0) {
+               uint64_t ns, end, start = rte_rdtsc();
+               nanosleep(&sleeptime, NULL);
+               clock_gettime(CLOCK_MONOTONIC_RAW, &t_end);
+               end = rte_rdtsc();
+               ns = ((t_end.tv_sec - t_start.tv_sec) * NS_PER_SEC);
+               ns += (t_end.tv_nsec - t_start.tv_nsec);
+
+               double secs = (double)ns/NS_PER_SEC;
+               tsc_hz = (uint64_t)((end - start)/secs);
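+               /*
+                * e.g. if the TSC advanced by ~250,000,000 ticks during the
+                * ~0.1 second sleep, tsc_hz evaluates to roughly 2.5 GHz
+                * (illustrative numbers).
+                */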
+               return tsc_hz;
+       }
+#endif
+       return 0;
+}
+
+int
+rte_eal_timer_init(void)
+{
+
+       eal_timer_source = EAL_TIMER_TSC;
+
+       set_tsc_freq();
+       check_tsc_flags();
+       return 0;
+}
diff --git a/lib/librte_eal/linux/eal/eal_vfio.c b/lib/librte_eal/linux/eal/eal_vfio.c
new file mode 100644 (file)
index 0000000..c821e83
--- /dev/null
@@ -0,0 +1,2049 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2018 Intel Corporation
+ */
+
+#include <inttypes.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+
+#include <rte_errno.h>
+#include <rte_log.h>
+#include <rte_memory.h>
+#include <rte_eal_memconfig.h>
+#include <rte_vfio.h>
+
+#include "eal_filesystem.h"
+#include "eal_vfio.h"
+#include "eal_private.h"
+
+#ifdef VFIO_PRESENT
+
+#define VFIO_MEM_EVENT_CLB_NAME "vfio_mem_event_clb"
+
+/* hot plug/unplug of VFIO groups may cause all DMA maps to be dropped. we can
+ * recreate the mappings for DPDK segments, but we cannot do so for memory that
+ * was registered by the user themselves, so we need to store the user mappings
+ * somewhere, to recreate them later.
+ */
+#define VFIO_MAX_USER_MEM_MAPS 256
+struct user_mem_map {
+       uint64_t addr;
+       uint64_t iova;
+       uint64_t len;
+};
+
+struct user_mem_maps {
+       rte_spinlock_recursive_t lock;
+       int n_maps;
+       struct user_mem_map maps[VFIO_MAX_USER_MEM_MAPS];
+};
+
+struct vfio_config {
+       int vfio_enabled;
+       int vfio_container_fd;
+       int vfio_active_groups;
+       const struct vfio_iommu_type *vfio_iommu_type;
+       struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
+       struct user_mem_maps mem_maps;
+};
+
+/* per-process VFIO config */
+static struct vfio_config vfio_cfgs[VFIO_MAX_CONTAINERS];
+static struct vfio_config *default_vfio_cfg = &vfio_cfgs[0];
+
+static int vfio_type1_dma_map(int);
+static int vfio_type1_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
+static int vfio_spapr_dma_map(int);
+static int vfio_spapr_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
+static int vfio_noiommu_dma_map(int);
+static int vfio_noiommu_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
+static int vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr,
+               uint64_t iova, uint64_t len, int do_map);
+
+/* IOMMU types we support */
+static const struct vfio_iommu_type iommu_types[] = {
+       /* x86 IOMMU, otherwise known as type 1 */
+       {
+               .type_id = RTE_VFIO_TYPE1,
+               .name = "Type 1",
+               .dma_map_func = &vfio_type1_dma_map,
+               .dma_user_map_func = &vfio_type1_dma_mem_map
+       },
+       /* ppc64 IOMMU, otherwise known as spapr */
+       {
+               .type_id = RTE_VFIO_SPAPR,
+               .name = "sPAPR",
+               .dma_map_func = &vfio_spapr_dma_map,
+               .dma_user_map_func = &vfio_spapr_dma_mem_map
+       },
+       /* IOMMU-less mode */
+       {
+               .type_id = RTE_VFIO_NOIOMMU,
+               .name = "No-IOMMU",
+               .dma_map_func = &vfio_noiommu_dma_map,
+               .dma_user_map_func = &vfio_noiommu_dma_mem_map
+       },
+};
+
+static int
+is_null_map(const struct user_mem_map *map)
+{
+       return map->addr == 0 && map->iova == 0 && map->len == 0;
+}
+
+/* we may need to merge user mem maps together in case of user mapping/unmapping
+ * chunks of memory, so we'll need a comparator function to sort segments.
+ */
+static int
+user_mem_map_cmp(const void *a, const void *b)
+{
+       const struct user_mem_map *umm_a = a;
+       const struct user_mem_map *umm_b = b;
+
+       /* move null entries to end */
+       if (is_null_map(umm_a))
+               return 1;
+       if (is_null_map(umm_b))
+               return -1;
+
+       /* sort by iova first */
+       if (umm_a->iova < umm_b->iova)
+               return -1;
+       if (umm_a->iova > umm_b->iova)
+               return 1;
+
+       if (umm_a->addr < umm_b->addr)
+               return -1;
+       if (umm_a->addr > umm_b->addr)
+               return 1;
+
+       if (umm_a->len < umm_b->len)
+               return -1;
+       if (umm_a->len > umm_b->len)
+               return 1;
+
+       return 0;
+}
+
+/* adjust a user map entry. this may result in shortening the existing map, or
+ * in splitting the existing map in two pieces.
+ */
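+/*
+ * For example (illustrative addresses), removing [0x2000, 0x3000) from a map
+ * covering [0x1000, 0x5000) shrinks the original entry to [0x1000, 0x2000)
+ * and populates 'end' with [0x3000, 0x5000); the middle chunk is the region
+ * being removed.
+ */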
+static void
+adjust_map(struct user_mem_map *src, struct user_mem_map *end,
+               uint64_t remove_va_start, uint64_t remove_len)
+{
+       /* if va start is same as start address, we're simply moving start */
+       if (remove_va_start == src->addr) {
+               src->addr += remove_len;
+               src->iova += remove_len;
+               src->len -= remove_len;
+       } else if (remove_va_start + remove_len == src->addr + src->len) {
+               /* we're shrinking mapping from the end */
+               src->len -= remove_len;
+       } else {
+               /* we're blowing a hole in the middle */
+               struct user_mem_map tmp;
+               uint64_t total_len = src->len;
+
+               /* adjust source segment length */
+               src->len = remove_va_start - src->addr;
+
+               /* create temporary segment in the middle */
+               tmp.addr = src->addr + src->len;
+               tmp.iova = src->iova + src->len;
+               tmp.len = remove_len;
+
+               /* populate end segment - this one we will be keeping */
+               end->addr = tmp.addr + tmp.len;
+               end->iova = tmp.iova + tmp.len;
+               end->len = total_len - src->len - tmp.len;
+       }
+}
+
+/* try merging two maps into one, return 1 if succeeded */
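+/*
+ * For example (illustrative values), [va 0x1000, iova 0x1000, len 0x1000)
+ * followed by [va 0x2000, iova 0x2000, len 0x1000) collapses into a single
+ * [va 0x1000, iova 0x1000, len 0x2000) entry, and the right-hand entry is
+ * zeroed out.
+ */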
+static int
+merge_map(struct user_mem_map *left, struct user_mem_map *right)
+{
+       if (left->addr + left->len != right->addr)
+               return 0;
+       if (left->iova + left->len != right->iova)
+               return 0;
+
+       left->len += right->len;
+
+       memset(right, 0, sizeof(*right));
+
+       return 1;
+}
+
+static struct user_mem_map *
+find_user_mem_map(struct user_mem_maps *user_mem_maps, uint64_t addr,
+               uint64_t iova, uint64_t len)
+{
+       uint64_t va_end = addr + len;
+       uint64_t iova_end = iova + len;
+       int i;
+
+       for (i = 0; i < user_mem_maps->n_maps; i++) {
+               struct user_mem_map *map = &user_mem_maps->maps[i];
+               uint64_t map_va_end = map->addr + map->len;
+               uint64_t map_iova_end = map->iova + map->len;
+
+               /* check start VA */
+               if (addr < map->addr || addr >= map_va_end)
+                       continue;
+               /* check if VA end is within boundaries */
+               if (va_end <= map->addr || va_end > map_va_end)
+                       continue;
+
+               /* check start IOVA */
+               if (iova < map->iova || iova >= map_iova_end)
+                       continue;
+               /* check if IOVA end is within boundaries */
+               if (iova_end <= map->iova || iova_end > map_iova_end)
+                       continue;
+
+               /* we've found our map */
+               return map;
+       }
+       return NULL;
+}
+
+/* this will sort all user maps, and merge/compact any adjacent maps */
+static void
+compact_user_maps(struct user_mem_maps *user_mem_maps)
+{
+       int i, n_merged, cur_idx;
+
+       qsort(user_mem_maps->maps, user_mem_maps->n_maps,
+                       sizeof(user_mem_maps->maps[0]), user_mem_map_cmp);
+
+       /* we'll go over the list backwards when merging */
+       n_merged = 0;
+       for (i = user_mem_maps->n_maps - 2; i >= 0; i--) {
+               struct user_mem_map *l, *r;
+
+               l = &user_mem_maps->maps[i];
+               r = &user_mem_maps->maps[i + 1];
+
+               if (is_null_map(l) || is_null_map(r))
+                       continue;
+
+               if (merge_map(l, r))
+                       n_merged++;
+       }
+
+       /* the entries are still sorted, but now they have holes in them, so
+        * walk through the list and remove the holes
+        */
+       if (n_merged > 0) {
+               cur_idx = 0;
+               for (i = 0; i < user_mem_maps->n_maps; i++) {
+                       if (!is_null_map(&user_mem_maps->maps[i])) {
+                               struct user_mem_map *src, *dst;
+
+                               src = &user_mem_maps->maps[i];
+                               dst = &user_mem_maps->maps[cur_idx++];
+
+                               if (src != dst) {
+                                       memcpy(dst, src, sizeof(*src));
+                                       memset(src, 0, sizeof(*src));
+                               }
+                       }
+               }
+               user_mem_maps->n_maps = cur_idx;
+       }
+}
+
+static int
+vfio_open_group_fd(int iommu_group_num)
+{
+       int vfio_group_fd;
+       char filename[PATH_MAX];
+       struct rte_mp_msg mp_req, *mp_rep;
+       struct rte_mp_reply mp_reply;
+       struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+       struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
+
+       /* if primary, try to open the group */
+       if (internal_config.process_type == RTE_PROC_PRIMARY) {
+               /* try regular group format */
+               snprintf(filename, sizeof(filename),
+                                VFIO_GROUP_FMT, iommu_group_num);
+               vfio_group_fd = open(filename, O_RDWR);
+               if (vfio_group_fd < 0) {
+                       /* if file not found, it's not an error */
+                       if (errno != ENOENT) {
+                               RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename,
+                                               strerror(errno));
+                               return -1;
+                       }
+
+                       /* special case: try no-IOMMU path as well */
+                       snprintf(filename, sizeof(filename),
+                                       VFIO_NOIOMMU_GROUP_FMT,
+                                       iommu_group_num);
+                       vfio_group_fd = open(filename, O_RDWR);
+                       if (vfio_group_fd < 0) {
+                               if (errno != ENOENT) {
+                                       RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename,
+                                                       strerror(errno));
+                                       return -1;
+                               }
+                               return 0;
+                       }
+                       /* noiommu group found */
+               }
+
+               return vfio_group_fd;
+       }
+       /* if we're in a secondary process, request group fd from the primary
+        * process via mp channel.
+        */
+       p->req = SOCKET_REQ_GROUP;
+       p->group_num = iommu_group_num;
+       strcpy(mp_req.name, EAL_VFIO_MP);
+       mp_req.len_param = sizeof(*p);
+       mp_req.num_fds = 0;
+
+       vfio_group_fd = -1;
+       if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
+           mp_reply.nb_received == 1) {
+               mp_rep = &mp_reply.msgs[0];
+               p = (struct vfio_mp_param *)mp_rep->param;
+               if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+                       vfio_group_fd = mp_rep->fds[0];
+               } else if (p->result == SOCKET_NO_FD) {
+                       RTE_LOG(ERR, EAL, "  bad VFIO group fd\n");
+                       vfio_group_fd = 0;
+               }
+               free(mp_reply.msgs);
+       }
+
+       if (vfio_group_fd < 0)
+               RTE_LOG(ERR, EAL, "  cannot request group fd\n");
+       return vfio_group_fd;
+}
+
+static struct vfio_config *
+get_vfio_cfg_by_group_num(int iommu_group_num)
+{
+       struct vfio_config *vfio_cfg;
+       int i, j;
+
+       for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
+               vfio_cfg = &vfio_cfgs[i];
+               for (j = 0; j < VFIO_MAX_GROUPS; j++) {
+                       if (vfio_cfg->vfio_groups[j].group_num ==
+                                       iommu_group_num)
+                               return vfio_cfg;
+               }
+       }
+
+       return NULL;
+}
+
+static int
+vfio_get_group_fd(struct vfio_config *vfio_cfg,
+               int iommu_group_num)
+{
+       int i;
+       int vfio_group_fd;
+       struct vfio_group *cur_grp;
+
+       /* check if we already have the group descriptor open */
+       for (i = 0; i < VFIO_MAX_GROUPS; i++)
+               if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num)
+                       return vfio_cfg->vfio_groups[i].fd;
+
+       /* Let's first check whether there is room for a new group */
+       if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) {
+               RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
+               return -1;
+       }
+
+       /* Now let's get a free index for the new group */
+       for (i = 0; i < VFIO_MAX_GROUPS; i++)
+               if (vfio_cfg->vfio_groups[i].group_num == -1) {
+                       cur_grp = &vfio_cfg->vfio_groups[i];
+                       break;
+               }
+
+       /* This should not happen */
+       if (i == VFIO_MAX_GROUPS) {
+               RTE_LOG(ERR, EAL, "No VFIO group free slot found\n");
+               return -1;
+       }
+
+       vfio_group_fd = vfio_open_group_fd(iommu_group_num);
+       if (vfio_group_fd < 0) {
+               RTE_LOG(ERR, EAL, "Failed to open group %d\n", iommu_group_num);
+               return -1;
+       }
+
+       cur_grp->group_num = iommu_group_num;
+       cur_grp->fd = vfio_group_fd;
+       vfio_cfg->vfio_active_groups++;
+
+       return vfio_group_fd;
+}
+
+static struct vfio_config *
+get_vfio_cfg_by_group_fd(int vfio_group_fd)
+{
+       struct vfio_config *vfio_cfg;
+       int i, j;
+
+       for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
+               vfio_cfg = &vfio_cfgs[i];
+               for (j = 0; j < VFIO_MAX_GROUPS; j++)
+                       if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
+                               return vfio_cfg;
+       }
+
+       return NULL;
+}
+
+static struct vfio_config *
+get_vfio_cfg_by_container_fd(int container_fd)
+{
+       int i;
+
+       for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
+               if (vfio_cfgs[i].vfio_container_fd == container_fd)
+                       return &vfio_cfgs[i];
+       }
+
+       return NULL;
+}
+
+int
+rte_vfio_get_group_fd(int iommu_group_num)
+{
+       struct vfio_config *vfio_cfg;
+
+       /* get the vfio_config it belongs to */
+       vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
+       vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
+
+       return vfio_get_group_fd(vfio_cfg, iommu_group_num);
+}
+
+static int
+get_vfio_group_idx(int vfio_group_fd)
+{
+       struct vfio_config *vfio_cfg;
+       int i, j;
+
+       for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
+               vfio_cfg = &vfio_cfgs[i];
+               for (j = 0; j < VFIO_MAX_GROUPS; j++)
+                       if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
+                               return j;
+       }
+
+       return -1;
+}
+
+static void
+vfio_group_device_get(int vfio_group_fd)
+{
+       struct vfio_config *vfio_cfg;
+       int i;
+
+       vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
+       if (vfio_cfg == NULL) {
+               RTE_LOG(ERR, EAL, "  invalid group fd!\n");
+               return;
+       }
+
+       i = get_vfio_group_idx(vfio_group_fd);
+       if (i < 0 || i > (VFIO_MAX_GROUPS - 1))
+               RTE_LOG(ERR, EAL, "  wrong vfio_group index (%d)\n", i);
+       else
+               vfio_cfg->vfio_groups[i].devices++;
+}
+
+static void
+vfio_group_device_put(int vfio_group_fd)
+{
+       struct vfio_config *vfio_cfg;
+       int i;
+
+       vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
+       if (vfio_cfg == NULL) {
+               RTE_LOG(ERR, EAL, "  invalid group fd!\n");
+               return;
+       }
+
+       i = get_vfio_group_idx(vfio_group_fd);
+       if (i < 0 || i > (VFIO_MAX_GROUPS - 1))
+               RTE_LOG(ERR, EAL, "  wrong vfio_group index (%d)\n", i);
+       else
+               vfio_cfg->vfio_groups[i].devices--;
+}
+
+static int
+vfio_group_device_count(int vfio_group_fd)
+{
+       struct vfio_config *vfio_cfg;
+       int i;
+
+       vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
+       if (vfio_cfg == NULL) {
+               RTE_LOG(ERR, EAL, "  invalid group fd!\n");
+               return -1;
+       }
+
+       i = get_vfio_group_idx(vfio_group_fd);
+       if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) {
+               RTE_LOG(ERR, EAL, "  wrong vfio_group index (%d)\n", i);
+               return -1;
+       }
+
+       return vfio_cfg->vfio_groups[i].devices;
+}
+
+static void
+vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len,
+               void *arg __rte_unused)
+{
+       struct rte_memseg_list *msl;
+       struct rte_memseg *ms;
+       size_t cur_len = 0;
+
+       msl = rte_mem_virt2memseg_list(addr);
+
+       /* for IOVA as VA mode, no need to care for IOVA addresses */
+       if (rte_eal_iova_mode() == RTE_IOVA_VA && msl->external == 0) {
+               uint64_t vfio_va = (uint64_t)(uintptr_t)addr;
+               if (type == RTE_MEM_EVENT_ALLOC)
+                       vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va,
+                                       len, 1);
+               else
+                       vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va,
+                                       len, 0);
+               return;
+       }
+
+       /* memsegs are contiguous in memory */
+       ms = rte_mem_virt2memseg(addr, msl);
+       while (cur_len < len) {
+               /* some memory segments may have invalid IOVA */
+               if (ms->iova == RTE_BAD_IOVA) {
+                       RTE_LOG(DEBUG, EAL, "Memory segment at %p has bad IOVA, skipping\n",
+                                       ms->addr);
+                       goto next;
+               }
+               if (type == RTE_MEM_EVENT_ALLOC)
+                       vfio_dma_mem_map(default_vfio_cfg, ms->addr_64,
+                                       ms->iova, ms->len, 1);
+               else
+                       vfio_dma_mem_map(default_vfio_cfg, ms->addr_64,
+                                       ms->iova, ms->len, 0);
+next:
+               cur_len += ms->len;
+               ++ms;
+       }
+}
+
+static int
+vfio_sync_default_container(void)
+{
+       struct rte_mp_msg mp_req, *mp_rep;
+       struct rte_mp_reply mp_reply;
+       struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+       struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
+       int iommu_type_id;
+       unsigned int i;
+
+       /* cannot be called from primary */
+       if (rte_eal_process_type() != RTE_PROC_SECONDARY)
+               return -1;
+
+       /* default container fd should have been opened in rte_vfio_enable() */
+       if (!default_vfio_cfg->vfio_enabled ||
+                       default_vfio_cfg->vfio_container_fd < 0) {
+               RTE_LOG(ERR, EAL, "VFIO support is not initialized\n");
+               return -1;
+       }
+
+       /* find default container's IOMMU type */
+       p->req = SOCKET_REQ_IOMMU_TYPE;
+       strcpy(mp_req.name, EAL_VFIO_MP);
+       mp_req.len_param = sizeof(*p);
+       mp_req.num_fds = 0;
+
+       iommu_type_id = -1;
+       if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
+                       mp_reply.nb_received == 1) {
+               mp_rep = &mp_reply.msgs[0];
+               p = (struct vfio_mp_param *)mp_rep->param;
+               if (p->result == SOCKET_OK)
+                       iommu_type_id = p->iommu_type_id;
+               free(mp_reply.msgs);
+       }
+       if (iommu_type_id < 0) {
+               RTE_LOG(ERR, EAL, "Could not get IOMMU type for default container\n");
+               return -1;
+       }
+
+       /* we now have an fd for the default container, as well as its IOMMU
+        * type. now, set up the default VFIO container config to match.
+        */
+       for (i = 0; i < RTE_DIM(iommu_types); i++) {
+               const struct vfio_iommu_type *t = &iommu_types[i];
+               if (t->type_id != iommu_type_id)
+                       continue;
+
+               /* we found our IOMMU type */
+               default_vfio_cfg->vfio_iommu_type = t;
+
+               return 0;
+       }
+       RTE_LOG(ERR, EAL, "Could not find IOMMU type id (%i)\n",
+                       iommu_type_id);
+       return -1;
+}
+
+int
+rte_vfio_clear_group(int vfio_group_fd)
+{
+       int i;
+       struct vfio_config *vfio_cfg;
+
+       vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
+       if (vfio_cfg == NULL) {
+               RTE_LOG(ERR, EAL, "  invalid group fd!\n");
+               return -1;
+       }
+
+       i = get_vfio_group_idx(vfio_group_fd);
+       if (i < 0)
+               return -1;
+       vfio_cfg->vfio_groups[i].group_num = -1;
+       vfio_cfg->vfio_groups[i].fd = -1;
+       vfio_cfg->vfio_groups[i].devices = 0;
+       vfio_cfg->vfio_active_groups--;
+
+       return 0;
+}
+
+int
+rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
+               int *vfio_dev_fd, struct vfio_device_info *device_info)
+{
+       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+       rte_rwlock_t *mem_lock = &mcfg->memory_hotplug_lock;
+       struct vfio_group_status group_status = {
+                       .argsz = sizeof(group_status)
+       };
+       struct vfio_config *vfio_cfg;
+       struct user_mem_maps *user_mem_maps;
+       int vfio_container_fd;
+       int vfio_group_fd;
+       int iommu_group_num;
+       int i, ret;
+
+       /* get group number */
+       ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num);
+       if (ret == 0) {
+               RTE_LOG(WARNING, EAL, "  %s not managed by VFIO driver, skipping\n",
+                       dev_addr);
+               return 1;
+       }
+
+       /* if negative, something failed */
+       if (ret < 0)
+               return -1;
+
+       /* get the actual group fd */
+       vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num);
+       if (vfio_group_fd < 0)
+               return -1;
+
+       /* if group_fd == 0, that means the device isn't managed by VFIO */
+       if (vfio_group_fd == 0) {
+               RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n",
+                               dev_addr);
+               return 1;
+       }
+
+       /*
+        * check if the group is viable. a group is viable when all of its
+        * devices are either bound to VFIO or not bound to anything.
+        */
+       ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status);
+       if (ret) {
+               RTE_LOG(ERR, EAL, "  %s cannot get group status, "
+                               "error %i (%s)\n", dev_addr, errno, strerror(errno));
+               close(vfio_group_fd);
+               rte_vfio_clear_group(vfio_group_fd);
+               return -1;
+       } else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
+               RTE_LOG(ERR, EAL, "  %s VFIO group is not viable!\n", dev_addr);
+               close(vfio_group_fd);
+               rte_vfio_clear_group(vfio_group_fd);
+               return -1;
+       }
+
+       /* get the vfio_config it belongs to */
+       vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
+       vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
+       vfio_container_fd = vfio_cfg->vfio_container_fd;
+       user_mem_maps = &vfio_cfg->mem_maps;
+
+       /* check if group does not have a container yet */
+       if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) {
+
+               /* add group to a container */
+               ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER,
+                               &vfio_container_fd);
+               if (ret) {
+                       RTE_LOG(ERR, EAL, "  %s cannot add VFIO group to container, "
+                                       "error %i (%s)\n", dev_addr, errno, strerror(errno));
+                       close(vfio_group_fd);
+                       rte_vfio_clear_group(vfio_group_fd);
+                       return -1;
+               }
+
+               /*
+                * pick an IOMMU type and set up DMA mappings for container
+                *
+                * needs to be done only once, only when first group is
+                * assigned to a container and only in primary process.
+                * Note this can happen several times with the hotplug
+                * functionality.
+                */
+               if (internal_config.process_type == RTE_PROC_PRIMARY &&
+                               vfio_cfg->vfio_active_groups == 1 &&
+                               vfio_group_device_count(vfio_group_fd) == 0) {
+                       const struct vfio_iommu_type *t;
+
+                       /* select an IOMMU type which we will be using */
+                       t = vfio_set_iommu_type(vfio_container_fd);
+                       if (!t) {
+                               RTE_LOG(ERR, EAL,
+                                       "  %s failed to select IOMMU type\n",
+                                       dev_addr);
+                               close(vfio_group_fd);
+                               rte_vfio_clear_group(vfio_group_fd);
+                               return -1;
+                       }
+                       /* lock memory hotplug before mapping and release it
+                        * after registering callback, to prevent races
+                        */
+                       rte_rwlock_read_lock(mem_lock);
+                       if (vfio_cfg == default_vfio_cfg)
+                               ret = t->dma_map_func(vfio_container_fd);
+                       else
+                               ret = 0;
+                       if (ret) {
+                               RTE_LOG(ERR, EAL,
+                                       "  %s DMA remapping failed, error %i (%s)\n",
+                                       dev_addr, errno, strerror(errno));
+                               close(vfio_group_fd);
+                               rte_vfio_clear_group(vfio_group_fd);
+                               rte_rwlock_read_unlock(mem_lock);
+                               return -1;
+                       }
+
+                       vfio_cfg->vfio_iommu_type = t;
+
+                       /* re-map all user-mapped segments */
+                       rte_spinlock_recursive_lock(&user_mem_maps->lock);
+
+                       /* this IOMMU type may not support DMA mapping, but
+                        * if we have mappings in the list - that means we have
+                        * previously mapped something successfully, so we can
+                        * be sure that DMA mapping is supported.
+                        */
+                       for (i = 0; i < user_mem_maps->n_maps; i++) {
+                               struct user_mem_map *map;
+                               map = &user_mem_maps->maps[i];
+
+                               ret = t->dma_user_map_func(
+                                               vfio_container_fd,
+                                               map->addr, map->iova, map->len,
+                                               1);
+                               if (ret) {
+                                       RTE_LOG(ERR, EAL, "Couldn't map user memory for DMA: "
+                                                       "va: 0x%" PRIx64 " "
+                                                       "iova: 0x%" PRIx64 " "
+                                                       "len: 0x%" PRIu64 "\n",
+                                                       map->addr, map->iova,
+                                                       map->len);
+                                       rte_spinlock_recursive_unlock(
+                                                       &user_mem_maps->lock);
+                                       rte_rwlock_read_unlock(mem_lock);
+                                       return -1;
+                               }
+                       }
+                       rte_spinlock_recursive_unlock(&user_mem_maps->lock);
+
+                       /* register callback for mem events */
+                       if (vfio_cfg == default_vfio_cfg)
+                               ret = rte_mem_event_callback_register(
+                                       VFIO_MEM_EVENT_CLB_NAME,
+                                       vfio_mem_event_callback, NULL);
+                       else
+                               ret = 0;
+                       /* unlock memory hotplug */
+                       rte_rwlock_read_unlock(mem_lock);
+
+                       if (ret && rte_errno != ENOTSUP) {
+                               RTE_LOG(ERR, EAL, "Could not install memory event callback for VFIO\n");
+                               return -1;
+                       }
+                       if (ret)
+                               RTE_LOG(DEBUG, EAL, "Memory event callbacks not supported\n");
+                       else
+                               RTE_LOG(DEBUG, EAL, "Installed memory event callback for VFIO\n");
+               }
+       } else if (rte_eal_process_type() != RTE_PROC_PRIMARY &&
+                       vfio_cfg == default_vfio_cfg &&
+                       vfio_cfg->vfio_iommu_type == NULL) {
+               /* if we're not a primary process, we do not set up the VFIO
+                * container because it's already been set up by the primary
+                * process. instead, we simply ask the primary about VFIO type
+                * we are using, and set the VFIO config up appropriately.
+                */
+               ret = vfio_sync_default_container();
+               if (ret < 0) {
+                       RTE_LOG(ERR, EAL, "Could not sync default VFIO container\n");
+                       close(vfio_group_fd);
+                       rte_vfio_clear_group(vfio_group_fd);
+                       return -1;
+               }
+               /* we have successfully initialized VFIO, notify user */
+               const struct vfio_iommu_type *t =
+                               default_vfio_cfg->vfio_iommu_type;
+               RTE_LOG(NOTICE, EAL, "  using IOMMU type %d (%s)\n",
+                               t->type_id, t->name);
+       }
+
+       /* get a file descriptor for the device */
+       *vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr);
+       if (*vfio_dev_fd < 0) {
+               /* if we cannot get a device fd, this implies a problem with
+                * the VFIO group, or with the container not having an IOMMU
+                * configured.
+                */
+
+               RTE_LOG(WARNING, EAL, "Getting a vfio_dev_fd for %s failed\n",
+                               dev_addr);
+               close(vfio_group_fd);
+               rte_vfio_clear_group(vfio_group_fd);
+               return -1;
+       }
+
+       /* test and setup the device */
+       ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info);
+       if (ret) {
+               RTE_LOG(ERR, EAL, "  %s cannot get device info, "
+                               "error %i (%s)\n", dev_addr, errno,
+                               strerror(errno));
+               close(*vfio_dev_fd);
+               close(vfio_group_fd);
+               rte_vfio_clear_group(vfio_group_fd);
+               return -1;
+       }
+       vfio_group_device_get(vfio_group_fd);
+
+       return 0;
+}
+
+int
+rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
+                   int vfio_dev_fd)
+{
+       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+       rte_rwlock_t *mem_lock = &mcfg->memory_hotplug_lock;
+       struct vfio_group_status group_status = {
+                       .argsz = sizeof(group_status)
+       };
+       struct vfio_config *vfio_cfg;
+       int vfio_group_fd;
+       int iommu_group_num;
+       int ret;
+
+       /* we don't want any DMA mapping requests to arrive while we're
+        * detaching the VFIO device, because this might be the last device and
+        * we might need to unregister the callback.
+        */
+       rte_rwlock_read_lock(mem_lock);
+
+       /* get group number */
+       ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num);
+       if (ret <= 0) {
+               RTE_LOG(WARNING, EAL, "  %s not managed by VFIO driver\n",
+                       dev_addr);
+               /* This is an error at this point. */
+               ret = -1;
+               goto out;
+       }
+
+       /* get the actual group fd */
+       vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num);
+       if (vfio_group_fd <= 0) {
+               RTE_LOG(INFO, EAL, "rte_vfio_get_group_fd failed for %s\n",
+                                  dev_addr);
+               ret = -1;
+               goto out;
+       }
+
+       /* get the vfio_config it belongs to */
+       vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
+       vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
+
+       /* At this point we have an active group. Closing it will detach the
+        * group from its container. If this is the last active group, the VFIO
+        * kernel code will unset the container and tear down the IOMMU
+        * mappings.
+        */
+
+       /* Closing a device */
+       if (close(vfio_dev_fd) < 0) {
+               RTE_LOG(INFO, EAL, "Error when closing vfio_dev_fd for %s\n",
+                                  dev_addr);
+               ret = -1;
+               goto out;
+       }
+
+       /* A VFIO group can have several devices attached. The group should
+        * only be closed once no devices remain attached to it.
+        */
+       vfio_group_device_put(vfio_group_fd);
+       if (!vfio_group_device_count(vfio_group_fd)) {
+
+               if (close(vfio_group_fd) < 0) {
+                       RTE_LOG(INFO, EAL, "Error when closing vfio_group_fd for %s\n",
+                               dev_addr);
+                       ret = -1;
+                       goto out;
+               }
+
+               if (rte_vfio_clear_group(vfio_group_fd) < 0) {
+                       RTE_LOG(INFO, EAL, "Error when clearing group for %s\n",
+                                          dev_addr);
+                       ret = -1;
+                       goto out;
+               }
+       }
+
+       /* if there are no active device groups, unregister the callback to
+        * avoid spurious attempts to map/unmap memory from VFIO.
+        */
+       if (vfio_cfg == default_vfio_cfg && vfio_cfg->vfio_active_groups == 0 &&
+                       rte_eal_process_type() != RTE_PROC_SECONDARY)
+               rte_mem_event_callback_unregister(VFIO_MEM_EVENT_CLB_NAME,
+                               NULL);
+
+       /* success */
+       ret = 0;
+
+out:
+       rte_rwlock_read_unlock(mem_lock);
+       return ret;
+}
+
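+/*
+ * Example usage of rte_vfio_setup_device()/rte_vfio_release_device() above
+ * (a minimal sketch; the sysfs base and device address are illustrative and
+ * error handling is abbreviated):
+ *
+ *        struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
+ *        int dev_fd;
+ *        int ret = rte_vfio_setup_device("/sys/bus/pci/devices",
+ *                        "0000:01:00.0", &dev_fd, &dev_info);
+ *        if (ret == 0) {
+ *                ... use dev_fd, e.g. ioctl(dev_fd, VFIO_DEVICE_GET_REGION_INFO) ...
+ *                rte_vfio_release_device("/sys/bus/pci/devices",
+ *                                "0000:01:00.0", dev_fd);
+ *        } else if (ret > 0) {
+ *                ... device not managed by VFIO, fall back to another driver ...
+ *        }
+ */
+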
+int
+rte_vfio_enable(const char *modname)
+{
+       /* initialize group list */
+       int i, j;
+       int vfio_available;
+
+       rte_spinlock_recursive_t lock = RTE_SPINLOCK_RECURSIVE_INITIALIZER;
+
+       for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
+               vfio_cfgs[i].vfio_container_fd = -1;
+               vfio_cfgs[i].vfio_active_groups = 0;
+               vfio_cfgs[i].vfio_iommu_type = NULL;
+               vfio_cfgs[i].mem_maps.lock = lock;
+
+               for (j = 0; j < VFIO_MAX_GROUPS; j++) {
+                       vfio_cfgs[i].vfio_groups[j].fd = -1;
+                       vfio_cfgs[i].vfio_groups[j].group_num = -1;
+                       vfio_cfgs[i].vfio_groups[j].devices = 0;
+               }
+       }
+
+       /* inform the user that we are probing for VFIO */
+       RTE_LOG(INFO, EAL, "Probing VFIO support...\n");
+
+       /* check if vfio module is loaded */
+       vfio_available = rte_eal_check_module(modname);
+
+       /* return error directly */
+       if (vfio_available == -1) {
+               RTE_LOG(INFO, EAL, "Could not get loaded module details!\n");
+               return -1;
+       }
+
+       /* return 0 if VFIO modules not loaded */
+       if (vfio_available == 0) {
+               RTE_LOG(DEBUG, EAL, "VFIO modules not loaded, "
+                       "skipping VFIO support...\n");
+               return 0;
+       }
+
+       if (internal_config.process_type == RTE_PROC_PRIMARY) {
+               /* open a new container */
+               default_vfio_cfg->vfio_container_fd =
+                               rte_vfio_get_container_fd();
+       } else {
+               /* get the default container from the primary process */
+               default_vfio_cfg->vfio_container_fd =
+                               vfio_get_default_container_fd();
+       }
+
+       /* check if we have VFIO driver enabled */
+       if (default_vfio_cfg->vfio_container_fd != -1) {
+               RTE_LOG(NOTICE, EAL, "VFIO support initialized\n");
+               default_vfio_cfg->vfio_enabled = 1;
+       } else {
+               RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n");
+       }
+
+       return 0;
+}
+
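+/*
+ * Example usage (a minimal sketch; the module name passed here is
+ * illustrative, return values as implemented above):
+ *
+ *        if (rte_vfio_enable("vfio") == 0 && rte_vfio_is_enabled("vfio"))
+ *                ... VFIO may be used for device setup and DMA mapping ...
+ */
+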
+int
+rte_vfio_is_enabled(const char *modname)
+{
+       const int mod_available = rte_eal_check_module(modname) > 0;
+       return default_vfio_cfg->vfio_enabled && mod_available;
+}
+
+int
+vfio_get_default_container_fd(void)
+{
+       struct rte_mp_msg mp_req, *mp_rep;
+       struct rte_mp_reply mp_reply;
+       struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+       struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
+
+       if (default_vfio_cfg->vfio_enabled)
+               return default_vfio_cfg->vfio_container_fd;
+
+       if (internal_config.process_type == RTE_PROC_PRIMARY) {
+               /* if we were a secondary process we would request the
+                * container fd from the primary, but we are the primary
+                * process, so just exit here
+                */
+               return -1;
+       }
+
+       p->req = SOCKET_REQ_DEFAULT_CONTAINER;
+       strcpy(mp_req.name, EAL_VFIO_MP);
+       mp_req.len_param = sizeof(*p);
+       mp_req.num_fds = 0;
+
+       if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
+           mp_reply.nb_received == 1) {
+               mp_rep = &mp_reply.msgs[0];
+               p = (struct vfio_mp_param *)mp_rep->param;
+               if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+                       free(mp_reply.msgs);
+                       return mp_rep->fds[0];
+               }
+               free(mp_reply.msgs);
+       }
+
+       RTE_LOG(ERR, EAL, "  cannot request default container fd\n");
+       return -1;
+}
+
+int
+vfio_get_iommu_type(void)
+{
+       if (default_vfio_cfg->vfio_iommu_type == NULL)
+               return -1;
+
+       return default_vfio_cfg->vfio_iommu_type->type_id;
+}
+
+const struct vfio_iommu_type *
+vfio_set_iommu_type(int vfio_container_fd)
+{
+       unsigned idx;
+       for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
+               const struct vfio_iommu_type *t = &iommu_types[idx];
+
+               int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
+                               t->type_id);
+               if (!ret) {
+                       RTE_LOG(NOTICE, EAL, "  using IOMMU type %d (%s)\n",
+                                       t->type_id, t->name);
+                       return t;
+               }
+               /* not an error, there may be more supported IOMMU types */
+               RTE_LOG(DEBUG, EAL, "  set IOMMU type %d (%s) failed, "
+                               "error %i (%s)\n", t->type_id, t->name, errno,
+                               strerror(errno));
+       }
+       /* if we didn't find a suitable IOMMU type, fail */
+       return NULL;
+}
+
+int
+vfio_has_supported_extensions(int vfio_container_fd)
+{
+       int ret;
+       unsigned idx, n_extensions = 0;
+       for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
+               const struct vfio_iommu_type *t = &iommu_types[idx];
+
+               ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION,
+                               t->type_id);
+               if (ret < 0) {
+                       RTE_LOG(ERR, EAL, "  could not get IOMMU type, "
+                               "error %i (%s)\n", errno,
+                               strerror(errno));
+                       close(vfio_container_fd);
+                       return -1;
+               } else if (ret == 1) {
+                       /* we found a supported extension */
+                       n_extensions++;
+               }
+               RTE_LOG(DEBUG, EAL, "  IOMMU type %d (%s) is %s\n",
+                               t->type_id, t->name,
+                               ret ? "supported" : "not supported");
+       }
+
+       /* if we didn't find any supported IOMMU types, fail */
+       if (!n_extensions) {
+               close(vfio_container_fd);
+               return -1;
+       }
+
+       return 0;
+}
+
+int
+rte_vfio_get_container_fd(void)
+{
+       int ret, vfio_container_fd;
+       struct rte_mp_msg mp_req, *mp_rep;
+       struct rte_mp_reply mp_reply;
+       struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
+       struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
+
+
+       /* if we're in a primary process, try to open the container */
+       if (internal_config.process_type == RTE_PROC_PRIMARY) {
+               vfio_container_fd = open(VFIO_CONTAINER_PATH, O_RDWR);
+               if (vfio_container_fd < 0) {
+                       RTE_LOG(ERR, EAL, "  cannot open VFIO container, "
+                                       "error %i (%s)\n", errno, strerror(errno));
+                       return -1;
+               }
+
+               /* check VFIO API version */
+               ret = ioctl(vfio_container_fd, VFIO_GET_API_VERSION);
+               if (ret != VFIO_API_VERSION) {
+                       if (ret < 0)
+                               RTE_LOG(ERR, EAL, "  could not get VFIO API version, "
+                                               "error %i (%s)\n", errno, strerror(errno));
+                       else
+                               RTE_LOG(ERR, EAL, "  unsupported VFIO API version!\n");
+                       close(vfio_container_fd);
+                       return -1;
+               }
+
+               ret = vfio_has_supported_extensions(vfio_container_fd);
+               if (ret) {
+                       RTE_LOG(ERR, EAL, "  no supported IOMMU "
+                                       "extensions found!\n");
+                       return -1;
+               }
+
+               return vfio_container_fd;
+       }
+       /*
+        * if we're in a secondary process, request container fd from the
+        * primary process via mp channel
+        */
+       p->req = SOCKET_REQ_CONTAINER;
+       strcpy(mp_req.name, EAL_VFIO_MP);
+       mp_req.len_param = sizeof(*p);
+       mp_req.num_fds = 0;
+
+       vfio_container_fd = -1;
+       if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
+           mp_reply.nb_received == 1) {
+               mp_rep = &mp_reply.msgs[0];
+               p = (struct vfio_mp_param *)mp_rep->param;
+               if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
+                       vfio_container_fd = mp_rep->fds[0];
+                       free(mp_reply.msgs);
+                       return vfio_container_fd;
+               }
+               free(mp_reply.msgs);
+       }
+
+       RTE_LOG(ERR, EAL, "  cannot request container fd\n");
+       return -1;
+}
+
+int
+rte_vfio_get_group_num(const char *sysfs_base,
+               const char *dev_addr, int *iommu_group_num)
+{
+       char linkname[PATH_MAX];
+       char filename[PATH_MAX];
+       char *tok[16], *group_tok, *end;
+       int ret;
+
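+       /* e.g. for dev_addr "0000:01:00.0" under sysfs_base
+        * "/sys/bus/pci/devices", the iommu_group symlink typically points at
+        * something like ".../kernel/iommu_groups/42", in which case 42 is
+        * returned through iommu_group_num (address and number illustrative).
+        */
+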
+       memset(linkname, 0, sizeof(linkname));
+       memset(filename, 0, sizeof(filename));
+
+       /* try to find out IOMMU group for this device */
+       snprintf(linkname, sizeof(linkname),
+                        "%s/%s/iommu_group", sysfs_base, dev_addr);
+
+       ret = readlink(linkname, filename, sizeof(filename));
+
+       /* if the link doesn't exist, no VFIO for us */
+       if (ret < 0)
+               return 0;
+
+       ret = rte_strsplit(filename, sizeof(filename),
+                       tok, RTE_DIM(tok), '/');
+
+       if (ret <= 0) {
+               RTE_LOG(ERR, EAL, "  %s cannot get IOMMU group\n", dev_addr);
+               return -1;
+       }
+
+       /* IOMMU group is always the last token */
+       errno = 0;
+       group_tok = tok[ret - 1];
+       end = group_tok;
+       *iommu_group_num = strtol(group_tok, &end, 10);
+       if ((end != group_tok && *end != '\0') || errno != 0) {
+               RTE_LOG(ERR, EAL, "  %s error parsing IOMMU number!\n", dev_addr);
+               return -1;
+       }
+
+       return 1;
+}
+
+static int
+type1_map(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
+               void *arg)
+{
+       int *vfio_container_fd = arg;
+
+       if (msl->external)
+               return 0;
+
+       return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova,
+                       ms->len, 1);
+}
+
+static int
+vfio_type1_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
+               uint64_t len, int do_map)
+{
+       struct vfio_iommu_type1_dma_map dma_map;
+       struct vfio_iommu_type1_dma_unmap dma_unmap;
+       int ret;
+
+       if (do_map != 0) {
+               memset(&dma_map, 0, sizeof(dma_map));
+               dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+               dma_map.vaddr = vaddr;
+               dma_map.size = len;
+               dma_map.iova = iova;
+               dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
+                               VFIO_DMA_MAP_FLAG_WRITE;
+
+               ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+               if (ret) {
+                       RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, error %i (%s)\n",
+                               errno, strerror(errno));
+                               return -1;
+               }
+       } else {
+               memset(&dma_unmap, 0, sizeof(dma_unmap));
+               dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
+               dma_unmap.size = len;
+               dma_unmap.iova = iova;
+
+               ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
+                               &dma_unmap);
+               if (ret) {
+                       RTE_LOG(ERR, EAL, "  cannot clear DMA remapping, error %i (%s)\n",
+                                       errno, strerror(errno));
+                       return -1;
+               }
+       }
+
+       return 0;
+}
+
+static int
+vfio_type1_dma_map(int vfio_container_fd)
+{
+       return rte_memseg_walk(type1_map, &vfio_container_fd);
+}
+
+static int
+vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
+               uint64_t len, int do_map)
+{
+       struct vfio_iommu_type1_dma_map dma_map;
+       struct vfio_iommu_type1_dma_unmap dma_unmap;
+       int ret;
+       struct vfio_iommu_spapr_register_memory reg = {
+               .argsz = sizeof(reg),
+               .flags = 0
+       };
+       reg.vaddr = (uintptr_t) vaddr;
+       reg.size = len;
+
+       if (do_map != 0) {
+               ret = ioctl(vfio_container_fd,
+                               VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
+               if (ret) {
+                       RTE_LOG(ERR, EAL, "  cannot register vaddr for IOMMU, "
+                               "error %i (%s)\n", errno, strerror(errno));
+                       return -1;
+               }
+
+               memset(&dma_map, 0, sizeof(dma_map));
+               dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+               dma_map.vaddr = vaddr;
+               dma_map.size = len;
+               dma_map.iova = iova;
+               dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
+                               VFIO_DMA_MAP_FLAG_WRITE;
+
+               ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+               if (ret) {
+                       RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, error %i (%s)\n",
+                               errno, strerror(errno));
+                               return -1;
+               }
+
+       } else {
+               ret = ioctl(vfio_container_fd,
+                               VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
+               if (ret) {
+                       RTE_LOG(ERR, EAL, "  cannot unregister vaddr for IOMMU, error %i (%s)\n",
+                                       errno, strerror(errno));
+                       return -1;
+               }
+
+               memset(&dma_unmap, 0, sizeof(dma_unmap));
+               dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
+               dma_unmap.size = len;
+               dma_unmap.iova = iova;
+
+               ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
+                               &dma_unmap);
+               if (ret) {
+                       RTE_LOG(ERR, EAL, "  cannot clear DMA remapping, error %i (%s)\n",
+                                       errno, strerror(errno));
+                       return -1;
+               }
+       }
+
+       return 0;
+}
+
+static int
+vfio_spapr_map_walk(const struct rte_memseg_list *msl,
+               const struct rte_memseg *ms, void *arg)
+{
+       int *vfio_container_fd = arg;
+
+       if (msl->external)
+               return 0;
+
+       return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova,
+                       ms->len, 1);
+}
+
+struct spapr_walk_param {
+       uint64_t window_size;
+       uint64_t hugepage_sz;
+};
+static int
+vfio_spapr_window_size_walk(const struct rte_memseg_list *msl,
+               const struct rte_memseg *ms, void *arg)
+{
+       struct spapr_walk_param *param = arg;
+       uint64_t max = ms->iova + ms->len;
+
+       if (msl->external)
+               return 0;
+
+       if (max > param->window_size) {
+               param->hugepage_sz = ms->hugepage_sz;
+               param->window_size = max;
+       }
+
+       return 0;
+}
+
+static int
+vfio_spapr_create_new_dma_window(int vfio_container_fd,
+               struct vfio_iommu_spapr_tce_create *create) {
+       struct vfio_iommu_spapr_tce_remove remove = {
+               .argsz = sizeof(remove),
+       };
+       struct vfio_iommu_spapr_tce_info info = {
+               .argsz = sizeof(info),
+       };
+       int ret;
+
+       /* query spapr iommu info */
+       ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
+       if (ret) {
+               RTE_LOG(ERR, EAL, "  cannot get iommu info, "
+                               "error %i (%s)\n", errno, strerror(errno));
+               return -1;
+       }
+
+       /* remove the default 32-bit DMA window */
+       remove.start_addr = info.dma32_window_start;
+       ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
+       if (ret) {
+               RTE_LOG(ERR, EAL, "  cannot remove default DMA window, "
+                               "error %i (%s)\n", errno, strerror(errno));
+               return -1;
+       }
+
+       /* create new DMA window */
+       ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, create);
+       if (ret) {
+               RTE_LOG(ERR, EAL, "  cannot create new DMA window, "
+                               "error %i (%s)\n", errno, strerror(errno));
+               return -1;
+       }
+
+       if (create->start_addr != 0) {
+               RTE_LOG(ERR, EAL, "  DMA window start address != 0\n");
+               return -1;
+       }
+
+       return 0;
+}
+
+static int
+vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
+               uint64_t len, int do_map)
+{
+       struct spapr_walk_param param;
+       struct vfio_iommu_spapr_tce_create create = {
+               .argsz = sizeof(create),
+       };
+       struct vfio_config *vfio_cfg;
+       struct user_mem_maps *user_mem_maps;
+       int i, ret = 0;
+
+       vfio_cfg = get_vfio_cfg_by_container_fd(vfio_container_fd);
+       if (vfio_cfg == NULL) {
+               RTE_LOG(ERR, EAL, "  invalid container fd!\n");
+               return -1;
+       }
+
+       user_mem_maps = &vfio_cfg->mem_maps;
+       rte_spinlock_recursive_lock(&user_mem_maps->lock);
+
+       /* check if window size needs to be adjusted */
+       memset(&param, 0, sizeof(param));
+
+       /* we're inside a callback so use thread-unsafe version */
+       if (rte_memseg_walk_thread_unsafe(vfio_spapr_window_size_walk,
+                               &param) < 0) {
+               RTE_LOG(ERR, EAL, "Could not get window size\n");
+               ret = -1;
+               goto out;
+       }
+
+       /* also check user maps */
+       for (i = 0; i < user_mem_maps->n_maps; i++) {
+               uint64_t max = user_mem_maps->maps[i].iova +
+                               user_mem_maps->maps[i].len;
+               param.window_size = RTE_MAX(param.window_size, max);
+       }
+
+       /* sPAPR requires window size to be a power of 2 */
+       create.window_size = rte_align64pow2(param.window_size);
+       create.page_shift = __builtin_ctzll(param.hugepage_sz);
+       create.levels = 1;
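+       /* for instance (illustrative numbers): with 2 MB hugepages and memory
+        * ending at 3 GB, hugepage_sz is 0x200000 so page_shift becomes 21,
+        * and rte_align64pow2() rounds the 3 GB window size up to 4 GB.
+        */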
+
+       if (do_map) {
+               void *addr;
+               /* re-create window and remap the entire memory */
+               if (iova > create.window_size) {
+                       if (vfio_spapr_create_new_dma_window(vfio_container_fd,
+                                       &create) < 0) {
+                               RTE_LOG(ERR, EAL, "Could not create new DMA window\n");
+                               ret = -1;
+                               goto out;
+                       }
+                       /* we're inside a callback, so use thread-unsafe version
+                        */
+                       if (rte_memseg_walk_thread_unsafe(vfio_spapr_map_walk,
+                                       &vfio_container_fd) < 0) {
+                               RTE_LOG(ERR, EAL, "Could not recreate DMA maps\n");
+                               ret = -1;
+                               goto out;
+                       }
+                       /* remap all user maps */
+                       for (i = 0; i < user_mem_maps->n_maps; i++) {
+                               struct user_mem_map *map =
+                                               &user_mem_maps->maps[i];
+                               if (vfio_spapr_dma_do_map(vfio_container_fd,
+                                               map->addr, map->iova, map->len,
+                                               1)) {
+                                       RTE_LOG(ERR, EAL, "Could not recreate user DMA maps\n");
+                                       ret = -1;
+                                       goto out;
+                               }
+                       }
+               }
+
+               /* now that we've remapped all of the memory that was present
+                * before, map the segment that we were requested to map.
+                *
+                * however, if we were called by the callback, the memory we
+                * were called with was already in the memseg list, so previous
+                * mapping should've mapped that segment already.
+                *
+                * virt2memseg_list is a relatively cheap check, so use that. if
+                * memory is within any memseg list, it's a memseg, so it's
+                * already mapped.
+                */
+               addr = (void *)(uintptr_t)vaddr;
+               if (rte_mem_virt2memseg_list(addr) == NULL &&
+                               vfio_spapr_dma_do_map(vfio_container_fd,
+                                       vaddr, iova, len, 1) < 0) {
+                       RTE_LOG(ERR, EAL, "Could not map segment\n");
+                       ret = -1;
+                       goto out;
+               }
+       } else {
+               /* for unmap, check if iova is within the DMA window */
+               if (iova > create.window_size) {
+                       RTE_LOG(ERR, EAL, "iova beyond DMA window for unmap\n");
+                       ret = -1;
+                       goto out;
+               }
+
+               vfio_spapr_dma_do_map(vfio_container_fd, vaddr, iova, len, 0);
+       }
+out:
+       rte_spinlock_recursive_unlock(&user_mem_maps->lock);
+       return ret;
+}
+
+static int
+vfio_spapr_dma_map(int vfio_container_fd)
+{
+       struct vfio_iommu_spapr_tce_create create = {
+               .argsz = sizeof(create),
+       };
+       struct spapr_walk_param param;
+
+       memset(&param, 0, sizeof(param));
+
+       /* create DMA window from 0 to max(phys_addr + len) */
+       rte_memseg_walk(vfio_spapr_window_size_walk, &param);
+
+       /* sPAPR requires window size to be a power of 2 */
+       create.window_size = rte_align64pow2(param.window_size);
+       create.page_shift = __builtin_ctzll(param.hugepage_sz);
+       create.levels = 1;
+
+       if (vfio_spapr_create_new_dma_window(vfio_container_fd, &create) < 0) {
+               RTE_LOG(ERR, EAL, "Could not create new DMA window\n");
+               return -1;
+       }
+
+       /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
+       if (rte_memseg_walk(vfio_spapr_map_walk, &vfio_container_fd) < 0)
+               return -1;
+
+       return 0;
+}
+
+static int
+vfio_noiommu_dma_map(int __rte_unused vfio_container_fd)
+{
+       /* No-IOMMU mode does not need DMA mapping */
+       return 0;
+}
+
+static int
+vfio_noiommu_dma_mem_map(int __rte_unused vfio_container_fd,
+                        uint64_t __rte_unused vaddr,
+                        uint64_t __rte_unused iova, uint64_t __rte_unused len,
+                        int __rte_unused do_map)
+{
+       /* No-IOMMU mode does not need DMA mapping */
+       return 0;
+}
+
+static int
+vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
+               uint64_t len, int do_map)
+{
+       const struct vfio_iommu_type *t = vfio_cfg->vfio_iommu_type;
+
+       if (!t) {
+               RTE_LOG(ERR, EAL, "  VFIO support not initialized\n");
+               rte_errno = ENODEV;
+               return -1;
+       }
+
+       if (!t->dma_user_map_func) {
+               RTE_LOG(ERR, EAL,
+                       "  VFIO custom DMA region mapping not supported by IOMMU %s\n",
+                       t->name);
+               rte_errno = ENOTSUP;
+               return -1;
+       }
+
+       return t->dma_user_map_func(vfio_cfg->vfio_container_fd, vaddr, iova,
+                       len, do_map);
+}
+
+static int
+container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
+               uint64_t len)
+{
+       struct user_mem_map *new_map;
+       struct user_mem_maps *user_mem_maps;
+       int ret = 0;
+
+       user_mem_maps = &vfio_cfg->mem_maps;
+       rte_spinlock_recursive_lock(&user_mem_maps->lock);
+       if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
+               RTE_LOG(ERR, EAL, "No more space for user mem maps\n");
+               rte_errno = ENOMEM;
+               ret = -1;
+               goto out;
+       }
+       /* map the entry */
+       if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 1)) {
+               /* technically, this will fail if there are currently no devices
+                * plugged in, even though the mapping might have succeeded had
+                * a device been added later. however, since we cannot verify
+                * whether this is a valid mapping without having a device
+                * attached, consider it unsupported, because we can't just
+                * store any old mapping and pollute the list of active
+                * mappings.
+                */
+               RTE_LOG(ERR, EAL, "Couldn't map new region for DMA\n");
+               ret = -1;
+               goto out;
+       }
+       /* create new user mem map entry */
+       new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
+       new_map->addr = vaddr;
+       new_map->iova = iova;
+       new_map->len = len;
+
+       compact_user_maps(user_mem_maps);
+out:
+       rte_spinlock_recursive_unlock(&user_mem_maps->lock);
+       return ret;
+}
+
+static int
+container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
+               uint64_t len)
+{
+       struct user_mem_map *map, *new_map = NULL;
+       struct user_mem_maps *user_mem_maps;
+       int ret = 0;
+
+       user_mem_maps = &vfio_cfg->mem_maps;
+       rte_spinlock_recursive_lock(&user_mem_maps->lock);
+
+       /* find our mapping */
+       map = find_user_mem_map(user_mem_maps, vaddr, iova, len);
+       if (!map) {
+               RTE_LOG(ERR, EAL, "Couldn't find previously mapped region\n");
+               rte_errno = EINVAL;
+               ret = -1;
+               goto out;
+       }
+       if (map->addr != vaddr || map->iova != iova || map->len != len) {
+               /* we're partially unmapping a previously mapped region, so we
+                * need to split entry into two.
+                */
+               if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
+                       RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n");
+                       rte_errno = ENOMEM;
+                       ret = -1;
+                       goto out;
+               }
+               new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
+       }
+
+       /* unmap the entry */
+       if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 0)) {
+               /* there may not be any devices plugged in, so unmapping will
+                * fail with ENODEV/ENOTSUP rte_errno values, but that doesn't
+                * stop us from removing the mapping, as the assumption is we
+                * won't be needing this memory any more and thus will want to
+                * prevent it from being remapped again on hotplug. so, only
+                * fail if we indeed failed to unmap (e.g. if the mapping was
+                * within our mapped range but had invalid alignment).
+                */
+               if (rte_errno != ENODEV && rte_errno != ENOTSUP) {
+                       RTE_LOG(ERR, EAL, "Couldn't unmap region for DMA\n");
+                       ret = -1;
+                       goto out;
+               } else {
+                       RTE_LOG(DEBUG, EAL, "DMA unmapping failed, but removing mappings anyway\n");
+               }
+       }
+       /* remove map from the list of active mappings */
+       if (new_map != NULL) {
+               adjust_map(map, new_map, vaddr, len);
+
+               /* if we've created a new map by splitting, sort everything */
+               if (!is_null_map(new_map)) {
+                       compact_user_maps(user_mem_maps);
+               } else {
+                       /* we've created a new mapping, but it was unused */
+                       user_mem_maps->n_maps--;
+               }
+       } else {
+               memset(map, 0, sizeof(*map));
+               compact_user_maps(user_mem_maps);
+               user_mem_maps->n_maps--;
+       }
+
+out:
+       rte_spinlock_recursive_unlock(&user_mem_maps->lock);
+       return ret;
+}
+
+int
+rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len)
+{
+       if (len == 0) {
+               rte_errno = EINVAL;
+               return -1;
+       }
+
+       return container_dma_map(default_vfio_cfg, vaddr, iova, len);
+}
+
+int
+rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len)
+{
+       if (len == 0) {
+               rte_errno = EINVAL;
+               return -1;
+       }
+
+       return container_dma_unmap(default_vfio_cfg, vaddr, iova, len);
+}
+
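+/*
+ * Example usage of rte_vfio_dma_map()/rte_vfio_dma_unmap() above (a minimal
+ * sketch; the region is assumed to be IOVA-contiguous and the values are
+ * illustrative):
+ *
+ *        void *va = ...;       externally allocated, page-aligned memory
+ *        uint64_t iova = ...;  its IOVA (physical address in RTE_IOVA_PA mode)
+ *        uint64_t len = 2 * 1024 * 1024;
+ *
+ *        if (rte_vfio_dma_map((uint64_t)(uintptr_t)va, iova, len) == 0) {
+ *                ... devices in the default container may DMA to this region ...
+ *                rte_vfio_dma_unmap((uint64_t)(uintptr_t)va, iova, len);
+ *        }
+ */
+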
+int
+rte_vfio_noiommu_is_enabled(void)
+{
+       int fd;
+       ssize_t cnt;
+       char c;
+
+       fd = open(VFIO_NOIOMMU_MODE, O_RDONLY);
+       if (fd < 0) {
+               if (errno != ENOENT) {
+                       RTE_LOG(ERR, EAL, "  cannot open vfio noiommu file %i (%s)\n",
+                                       errno, strerror(errno));
+                       return -1;
+               }
+               /*
+                * otherwise the file does not exist,
+                * i.e. noiommu is not enabled
+                */
+               return 0;
+       }
+
+       cnt = read(fd, &c, 1);
+       close(fd);
+       if (cnt != 1) {
+               RTE_LOG(ERR, EAL, "  unable to read from vfio noiommu "
+                               "file %i (%s)\n", errno, strerror(errno));
+               return -1;
+       }
+
+       return c == 'Y';
+}
+
+int
+rte_vfio_container_create(void)
+{
+       int i;
+
+       /* Find an empty slot to store new vfio config */
+       for (i = 1; i < VFIO_MAX_CONTAINERS; i++) {
+               if (vfio_cfgs[i].vfio_container_fd == -1)
+                       break;
+       }
+
+       if (i == VFIO_MAX_CONTAINERS) {
+               RTE_LOG(ERR, EAL, "exceeded max VFIO container limit\n");
+               return -1;
+       }
+
+       vfio_cfgs[i].vfio_container_fd = rte_vfio_get_container_fd();
+       if (vfio_cfgs[i].vfio_container_fd < 0) {
+               RTE_LOG(NOTICE, EAL, "failed to create a new container\n");
+               return -1;
+       }
+
+       return vfio_cfgs[i].vfio_container_fd;
+}
+
+int __rte_experimental
+rte_vfio_container_destroy(int container_fd)
+{
+       struct vfio_config *vfio_cfg;
+       int i;
+
+       vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
+       if (vfio_cfg == NULL) {
+               RTE_LOG(ERR, EAL, "Invalid container fd\n");
+               return -1;
+       }
+
+       for (i = 0; i < VFIO_MAX_GROUPS; i++)
+               if (vfio_cfg->vfio_groups[i].group_num != -1)
+                       rte_vfio_container_group_unbind(container_fd,
+                               vfio_cfg->vfio_groups[i].group_num);
+
+       close(container_fd);
+       vfio_cfg->vfio_container_fd = -1;
+       vfio_cfg->vfio_active_groups = 0;
+       vfio_cfg->vfio_iommu_type = NULL;
+
+       return 0;
+}
+
+int
+rte_vfio_container_group_bind(int container_fd, int iommu_group_num)
+{
+       struct vfio_config *vfio_cfg;
+
+       vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
+       if (vfio_cfg == NULL) {
+               RTE_LOG(ERR, EAL, "Invalid container fd\n");
+               return -1;
+       }
+
+       return vfio_get_group_fd(vfio_cfg, iommu_group_num);
+}
+
+int
+rte_vfio_container_group_unbind(int container_fd, int iommu_group_num)
+{
+       struct vfio_config *vfio_cfg;
+       struct vfio_group *cur_grp = NULL;
+       int i;
+
+       vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
+       if (vfio_cfg == NULL) {
+               RTE_LOG(ERR, EAL, "Invalid container fd\n");
+               return -1;
+       }
+
+       for (i = 0; i < VFIO_MAX_GROUPS; i++) {
+               if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num) {
+                       cur_grp = &vfio_cfg->vfio_groups[i];
+                       break;
+               }
+       }
+
+       /* This should not happen */
+       if (i == VFIO_MAX_GROUPS || cur_grp == NULL) {
+               RTE_LOG(ERR, EAL, "Specified group number not found\n");
+               return -1;
+       }
+
+       if (cur_grp->fd >= 0 && close(cur_grp->fd) < 0) {
+               RTE_LOG(ERR, EAL, "Error when closing vfio_group_fd for"
+                       " iommu_group_num %d\n", iommu_group_num);
+               return -1;
+       }
+       cur_grp->group_num = -1;
+       cur_grp->fd = -1;
+       cur_grp->devices = 0;
+       vfio_cfg->vfio_active_groups--;
+
+       return 0;
+}
+
+int
+rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova,
+               uint64_t len)
+{
+       struct vfio_config *vfio_cfg;
+
+       if (len == 0) {
+               rte_errno = EINVAL;
+               return -1;
+       }
+
+       vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
+       if (vfio_cfg == NULL) {
+               RTE_LOG(ERR, EAL, "Invalid container fd\n");
+               return -1;
+       }
+
+       return container_dma_map(vfio_cfg, vaddr, iova, len);
+}
+
+int
+rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova,
+               uint64_t len)
+{
+       struct vfio_config *vfio_cfg;
+
+       if (len == 0) {
+               rte_errno = EINVAL;
+               return -1;
+       }
+
+       vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
+       if (vfio_cfg == NULL) {
+               RTE_LOG(ERR, EAL, "Invalid container fd\n");
+               return -1;
+       }
+
+       return container_dma_unmap(vfio_cfg, vaddr, iova, len);
+}
+
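+/*
+ * Example of the container API flow above (a minimal sketch; the group
+ * number, addresses and length are illustrative and error handling is
+ * abbreviated):
+ *
+ *        int container_fd = rte_vfio_container_create();
+ *        int group_fd = rte_vfio_container_group_bind(container_fd, 42);
+ *        ... set up the device, e.g. via rte_vfio_setup_device() ...
+ *        rte_vfio_container_dma_map(container_fd, vaddr, iova, len);
+ *        ...
+ *        rte_vfio_container_dma_unmap(container_fd, vaddr, iova, len);
+ *        rte_vfio_container_group_unbind(container_fd, 42);
+ *        rte_vfio_container_destroy(container_fd);
+ */
+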
+#else
+
+int
+rte_vfio_dma_map(uint64_t __rte_unused vaddr, __rte_unused uint64_t iova,
+                 __rte_unused uint64_t len)
+{
+       return -1;
+}
+
+int
+rte_vfio_dma_unmap(uint64_t __rte_unused vaddr, uint64_t __rte_unused iova,
+                   __rte_unused uint64_t len)
+{
+       return -1;
+}
+
+int
+rte_vfio_setup_device(__rte_unused const char *sysfs_base,
+               __rte_unused const char *dev_addr,
+               __rte_unused int *vfio_dev_fd,
+               __rte_unused struct vfio_device_info *device_info)
+{
+       return -1;
+}
+
+int
+rte_vfio_release_device(__rte_unused const char *sysfs_base,
+               __rte_unused const char *dev_addr, __rte_unused int fd)
+{
+       return -1;
+}
+
+int
+rte_vfio_enable(__rte_unused const char *modname)
+{
+       return -1;
+}
+
+int
+rte_vfio_is_enabled(__rte_unused const char *modname)
+{
+       return -1;
+}
+
+int
+rte_vfio_noiommu_is_enabled(void)
+{
+       return -1;
+}
+
+int
+rte_vfio_clear_group(__rte_unused int vfio_group_fd)
+{
+       return -1;
+}
+
+int
+rte_vfio_get_group_num(__rte_unused const char *sysfs_base,
+               __rte_unused const char *dev_addr,
+               __rte_unused int *iommu_group_num)
+{
+       return -1;
+}
+
+int
+rte_vfio_get_container_fd(void)
+{
+       return -1;
+}
+
+int
+rte_vfio_get_group_fd(__rte_unused int iommu_group_num)
+{
+       return -1;
+}
+
+int
+rte_vfio_container_create(void)
+{
+       return -1;
+}
+
+int
+rte_vfio_container_destroy(__rte_unused int container_fd)
+{
+       return -1;
+}
+
+int
+rte_vfio_container_group_bind(__rte_unused int container_fd,
+               __rte_unused int iommu_group_num)
+{
+       return -1;
+}
+
+int
+rte_vfio_container_group_unbind(__rte_unused int container_fd,
+               __rte_unused int iommu_group_num)
+{
+       return -1;
+}
+
+int
+rte_vfio_container_dma_map(__rte_unused int container_fd,
+               __rte_unused uint64_t vaddr,
+               __rte_unused uint64_t iova,
+               __rte_unused uint64_t len)
+{
+       return -1;
+}
+
+int
+rte_vfio_container_dma_unmap(__rte_unused int container_fd,
+               __rte_unused uint64_t vaddr,
+               __rte_unused uint64_t iova,
+               __rte_unused uint64_t len)
+{
+       return -1;
+}
+
+#endif /* VFIO_PRESENT */
diff --git a/lib/librte_eal/linux/eal/eal_vfio.h b/lib/librte_eal/linux/eal/eal_vfio.h
new file mode 100644 (file)
index 0000000..cb2d35f
--- /dev/null
@@ -0,0 +1,158 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#ifndef EAL_VFIO_H_
+#define EAL_VFIO_H_
+
+#include <rte_common.h>
+
+/*
+ * determine if VFIO is present on the system
+ */
+#if !defined(VFIO_PRESENT) && defined(RTE_EAL_VFIO)
+#include <linux/version.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
+#define VFIO_PRESENT
+#else
+#pragma message("VFIO configured but not supported by this kernel, disabling.")
+#endif /* kernel version >= 3.6.0 */
+#endif /* RTE_EAL_VFIO */
+
+#ifdef VFIO_PRESENT
+
+#include <stdint.h>
+#include <linux/vfio.h>
+
+#define RTE_VFIO_TYPE1 VFIO_TYPE1_IOMMU
+
+#ifndef VFIO_SPAPR_TCE_v2_IOMMU
+#define RTE_VFIO_SPAPR 7
+#define VFIO_IOMMU_SPAPR_REGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 17)
+#define VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 18)
+#define VFIO_IOMMU_SPAPR_TCE_CREATE _IO(VFIO_TYPE, VFIO_BASE + 19)
+#define VFIO_IOMMU_SPAPR_TCE_REMOVE _IO(VFIO_TYPE, VFIO_BASE + 20)
+
+struct vfio_iommu_spapr_register_memory {
+       uint32_t argsz;
+       uint32_t flags;
+       uint64_t vaddr;
+       uint64_t size;
+};
+
+struct vfio_iommu_spapr_tce_create {
+       uint32_t argsz;
+       uint32_t flags;
+       /* in */
+       uint32_t page_shift;
+       uint32_t __resv1;
+       uint64_t window_size;
+       uint32_t levels;
+       uint32_t __resv2;
+       /* out */
+       uint64_t start_addr;
+};
+
+struct vfio_iommu_spapr_tce_remove {
+       uint32_t argsz;
+       uint32_t flags;
+       /* in */
+       uint64_t start_addr;
+};
+
+struct vfio_iommu_spapr_tce_ddw_info {
+       uint64_t pgsizes;
+       uint32_t max_dynamic_windows_supported;
+       uint32_t levels;
+};
+
+/* SPAPR_v2 is not present, but SPAPR might be */
+#ifndef VFIO_SPAPR_TCE_IOMMU
+#define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
+
+struct vfio_iommu_spapr_tce_info {
+       uint32_t argsz;
+       uint32_t flags;
+       uint32_t dma32_window_start;
+       uint32_t dma32_window_size;
+       struct vfio_iommu_spapr_tce_ddw_info ddw;
+};
+#endif /* VFIO_SPAPR_TCE_IOMMU */
+
+#else /* VFIO_SPAPR_TCE_v2_IOMMU */
+#define RTE_VFIO_SPAPR VFIO_SPAPR_TCE_v2_IOMMU
+#endif
+
+#define VFIO_MAX_GROUPS RTE_MAX_VFIO_GROUPS
+#define VFIO_MAX_CONTAINERS RTE_MAX_VFIO_CONTAINERS
+
+/*
+ * we don't need to store device fd's anywhere since they can be obtained from
+ * the group fd via an ioctl() call.
+ */
+struct vfio_group {
+       int group_num;
+       int fd;
+       int devices;
+};
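+
+/* Illustration (not part of this header): a device fd can be looked up from
+ * the group fd on demand with the standard VFIO ioctl, e.g.
+ *     dev_fd = ioctl(group_fd, VFIO_GROUP_GET_DEVICE_FD, "0000:01:00.0");
+ * where the PCI address string is a hypothetical example.
+ */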
+
+/* DMA mapping function prototype.
+ * Takes VFIO container fd as a parameter.
+ * Returns 0 on success, -1 on error.
+ */
+typedef int (*vfio_dma_func_t)(int);
+
+/* Custom memory region DMA mapping function prototype.
+ * Takes VFIO container fd, virtual address, physical address, length and
+ * operation type (0 to unmap, 1 to map) as parameters.
+ * Returns 0 on success, -1 on error.
+ */
+typedef int (*vfio_dma_user_func_t)(int fd, uint64_t vaddr, uint64_t iova,
+               uint64_t len, int do_map);
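+
+/* Sketch (assumption, not code from this patch): for the type1 IOMMU such a
+ * function reduces to the VFIO_IOMMU_MAP_DMA / VFIO_IOMMU_UNMAP_DMA ioctls,
+ * roughly:
+ *     struct vfio_iommu_type1_dma_map dma = { .argsz = sizeof(dma),
+ *             .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
+ *             .vaddr = vaddr, .iova = iova, .size = len };
+ *     ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &dma);
+ */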
+
+struct vfio_iommu_type {
+       int type_id;
+       const char *name;
+       vfio_dma_user_func_t dma_user_map_func;
+       vfio_dma_func_t dma_map_func;
+};
+
+/* get the vfio container that devices are bound to by default */
+int vfio_get_default_container_fd(void);
+
+/* pick IOMMU type. returns a pointer to vfio_iommu_type or NULL for error */
+const struct vfio_iommu_type *
+vfio_set_iommu_type(int vfio_container_fd);
+
+int
+vfio_get_iommu_type(void);
+
+/* check if we have any supported extensions */
+int
+vfio_has_supported_extensions(int vfio_container_fd);
+
+int vfio_mp_sync_setup(void);
+
+#define EAL_VFIO_MP "eal_vfio_mp_sync"
+
+#define SOCKET_REQ_CONTAINER 0x100
+#define SOCKET_REQ_GROUP 0x200
+#define SOCKET_REQ_DEFAULT_CONTAINER 0x400
+#define SOCKET_REQ_IOMMU_TYPE 0x800
+#define SOCKET_OK 0x0
+#define SOCKET_NO_FD 0x1
+#define SOCKET_ERR 0xFF
+
+struct vfio_mp_param {
+       int req;
+       int result;
+       RTE_STD_C11
+       union {
+               int group_num;
+               int iommu_type_id;
+       };
+};
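+
+/* Example flow (informal): a secondary process fills a vfio_mp_param with,
+ * say, .req = SOCKET_REQ_DEFAULT_CONTAINER, sends it to the EAL_VFIO_MP
+ * action over the rte_mp_* IPC channel, and on result == SOCKET_OK reads the
+ * container fd from the reply's fds[0].
+ */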
+
+#endif /* VFIO_PRESENT */
+
+#endif /* EAL_VFIO_H_ */
diff --git a/lib/librte_eal/linux/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linux/eal/eal_vfio_mp_sync.c
new file mode 100644 (file)
index 0000000..2a47f29
--- /dev/null
@@ -0,0 +1,119 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2018 Intel Corporation
+ */
+
+#include <unistd.h>
+#include <string.h>
+
+#include <rte_compat.h>
+#include <rte_log.h>
+#include <rte_vfio.h>
+#include <rte_eal.h>
+
+#include "eal_vfio.h"
+
+/**
+ * @file
+ * VFIO socket for communication between primary and secondary processes.
+ *
+ * This file is only compiled if CONFIG_RTE_EAL_VFIO is set to "y".
+ */
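+
+/* Note: despite the "socket" wording above, requests are exchanged over the
+ * generic rte_mp_* IPC channel registered below in vfio_mp_sync_setup().
+ */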
+
+#ifdef VFIO_PRESENT
+
+static int
+vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer)
+{
+       int fd = -1;
+       int ret;
+       struct rte_mp_msg reply;
+       struct vfio_mp_param *r = (struct vfio_mp_param *)reply.param;
+       const struct vfio_mp_param *m =
+               (const struct vfio_mp_param *)msg->param;
+
+       if (msg->len_param != sizeof(*m)) {
+               RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
+               return -1;
+       }
+
+       memset(&reply, 0, sizeof(reply));
+
+       switch (m->req) {
+       case SOCKET_REQ_GROUP:
+               r->req = SOCKET_REQ_GROUP;
+               r->group_num = m->group_num;
+               fd = rte_vfio_get_group_fd(m->group_num);
+               if (fd < 0)
+                       r->result = SOCKET_ERR;
+               else if (fd == 0)
+                       /* if VFIO group exists but isn't bound to VFIO driver */
+                       r->result = SOCKET_NO_FD;
+               else {
+                       /* if group exists and is bound to VFIO driver */
+                       r->result = SOCKET_OK;
+                       reply.num_fds = 1;
+                       reply.fds[0] = fd;
+               }
+               break;
+       case SOCKET_REQ_CONTAINER:
+               r->req = SOCKET_REQ_CONTAINER;
+               fd = rte_vfio_get_container_fd();
+               if (fd < 0)
+                       r->result = SOCKET_ERR;
+               else {
+                       r->result = SOCKET_OK;
+                       reply.num_fds = 1;
+                       reply.fds[0] = fd;
+               }
+               break;
+       case SOCKET_REQ_DEFAULT_CONTAINER:
+               r->req = SOCKET_REQ_DEFAULT_CONTAINER;
+               fd = vfio_get_default_container_fd();
+               if (fd < 0)
+                       r->result = SOCKET_ERR;
+               else {
+                       r->result = SOCKET_OK;
+                       reply.num_fds = 1;
+                       reply.fds[0] = fd;
+               }
+               break;
+       case SOCKET_REQ_IOMMU_TYPE:
+       {
+               int iommu_type_id;
+
+               r->req = SOCKET_REQ_IOMMU_TYPE;
+
+               iommu_type_id = vfio_get_iommu_type();
+
+               if (iommu_type_id < 0)
+                       r->result = SOCKET_ERR;
+               else {
+                       r->iommu_type_id = iommu_type_id;
+                       r->result = SOCKET_OK;
+               }
+               break;
+       }
+       default:
+               RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
+               return -1;
+       }
+
+       strcpy(reply.name, EAL_VFIO_MP);
+       reply.len_param = sizeof(*r);
+
+       ret = rte_mp_reply(&reply, peer);
+       if (m->req == SOCKET_REQ_CONTAINER && fd >= 0)
+               close(fd);
+       return ret;
+}
+
+int
+vfio_mp_sync_setup(void)
+{
+       if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+               return rte_mp_action_register(EAL_VFIO_MP, vfio_mp_primary);
+
+       return 0;
+}
+
+#endif
diff --git a/lib/librte_eal/linux/eal/include/exec-env/rte_kni_common.h b/lib/librte_eal/linux/eal/include/exec-env/rte_kni_common.h
new file mode 100644 (file)
index 0000000..5afa087
--- /dev/null
@@ -0,0 +1,139 @@
+/* SPDX-License-Identifier: (BSD-3-Clause OR LGPL-2.1) */
+/*
+ * Copyright(c) 2007-2014 Intel Corporation.
+ */
+
+#ifndef _RTE_KNI_COMMON_H_
+#define _RTE_KNI_COMMON_H_
+
+#ifdef __KERNEL__
+#include <linux/if.h>
+#include <asm/barrier.h>
+#define RTE_STD_C11
+#else
+#include <rte_common.h>
+#include <rte_config.h>
+#endif
+
+/**
+ * KNI name is part of memzone name.
+ */
+#define RTE_KNI_NAMESIZE 32
+
+#define RTE_CACHE_LINE_MIN_SIZE 64
+
+/*
+ * Request id.
+ */
+enum rte_kni_req_id {
+       RTE_KNI_REQ_UNKNOWN = 0,
+       RTE_KNI_REQ_CHANGE_MTU,
+       RTE_KNI_REQ_CFG_NETWORK_IF,
+       RTE_KNI_REQ_CHANGE_MAC_ADDR,
+       RTE_KNI_REQ_CHANGE_PROMISC,
+       RTE_KNI_REQ_MAX,
+};
+
+/*
+ * Structure for KNI request.
+ */
+struct rte_kni_request {
+       uint32_t req_id;             /**< Request id */
+       RTE_STD_C11
+       union {
+               uint32_t new_mtu;    /**< New MTU */
+               uint8_t if_up;       /**< 1: interface up, 0: interface down */
+               uint8_t mac_addr[6]; /**< MAC address for interface */
+               uint8_t promiscusity;/**< 1: promisc mode enable, 0: disable */
+       };
+       int32_t result;               /**< Result for processing request */
+} __attribute__((__packed__));
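+
+/* For example, a change-MTU request sets req_id = RTE_KNI_REQ_CHANGE_MTU and
+ * new_mtu; the side handling the request reports its status back in result.
+ */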
+
+/*
+ * FIFO struct mapped in shared memory. It describes a circular buffer FIFO.
+ * Write and read should wrap around. The FIFO is empty when write == read.
+ * Writing should never overwrite the read position.
+ */
+struct rte_kni_fifo {
+#ifdef RTE_USE_C11_MEM_MODEL
+       unsigned write;              /**< Next position to be written*/
+       unsigned read;               /**< Next position to be read */
+#else
+       volatile unsigned write;     /**< Next position to be written*/
+       volatile unsigned read;      /**< Next position to be read */
+#endif
+       unsigned len;                /**< Circular buffer length */
+       unsigned elem_size;          /**< Pointer size - for 32/64 bit OS */
+       void *volatile buffer[];     /**< The buffer contains mbuf pointers */
+};
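+
+/* Informal note: one slot is always left unused so that write never catches
+ * up with read, hence a FIFO of length len holds at most len - 1 entries;
+ * with len a power of two, free slots = (read - write - 1) & (len - 1).
+ */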
+
+/*
+ * The kernel image of the rte_mbuf struct, with only the relevant fields.
+ * Padding is necessary to ensure the offsets of these fields match those of
+ * the userspace struct rte_mbuf.
+ */
+struct rte_kni_mbuf {
+       void *buf_addr __attribute__((__aligned__(RTE_CACHE_LINE_SIZE)));
+       uint64_t buf_physaddr;
+       uint16_t data_off;      /**< Start address of data in segment buffer. */
+       char pad1[2];
+       uint16_t nb_segs;       /**< Number of segments. */
+       char pad4[2];
+       uint64_t ol_flags;      /**< Offload features. */
+       char pad2[4];
+       uint32_t pkt_len;       /**< Total pkt len: sum of all segment data_len. */
+       uint16_t data_len;      /**< Amount of data in segment buffer. */
+
+       /* fields on second cache line */
+       char pad3[8] __attribute__((__aligned__(RTE_CACHE_LINE_MIN_SIZE)));
+       void *pool;
+       void *next;
+};
+
+/*
+ * Struct used to create a KNI device. Passed to the kernel in an ioctl call.
+ */
+
+struct rte_kni_device_info {
+       char name[RTE_KNI_NAMESIZE];  /**< Network device name for KNI */
+
+       phys_addr_t tx_phys;
+       phys_addr_t rx_phys;
+       phys_addr_t alloc_phys;
+       phys_addr_t free_phys;
+
+       /* Used by Ethtool */
+       phys_addr_t req_phys;
+       phys_addr_t resp_phys;
+       phys_addr_t sync_phys;
+       void * sync_va;
+
+       /* mbuf mempool */
+       void * mbuf_va;
+       phys_addr_t mbuf_phys;
+
+       /* PCI info */
+       uint16_t vendor_id;           /**< Vendor ID or PCI_ANY_ID. */
+       uint16_t device_id;           /**< Device ID or PCI_ANY_ID. */
+       uint8_t bus;                  /**< Device bus */
+       uint8_t devid;                /**< Device ID */
+       uint8_t function;             /**< Device function. */
+
+       uint16_t group_id;            /**< Group ID */
+       uint32_t core_id;             /**< core ID to bind for kernel thread */
+
+       __extension__
+       uint8_t force_bind : 1;       /**< Flag for kernel thread binding */
+
+       /* mbuf size */
+       unsigned mbuf_size;
+       unsigned int mtu;
+       char mac_addr[6];
+};
+
+#define KNI_DEVICE "kni"
+
+#define RTE_KNI_IOCTL_TEST    _IOWR(0, 1, int)
+#define RTE_KNI_IOCTL_CREATE  _IOWR(0, 2, struct rte_kni_device_info)
+#define RTE_KNI_IOCTL_RELEASE _IOWR(0, 3, struct rte_kni_device_info)
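+
+/* Usage sketch (assumption about the userspace side, not defined here):
+ *     fd = open("/dev/" KNI_DEVICE, O_RDWR);
+ *     ioctl(fd, RTE_KNI_IOCTL_CREATE, &dev_info);
+ * where dev_info is a filled-in struct rte_kni_device_info.
+ */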
+
+#endif /* _RTE_KNI_COMMON_H_ */
diff --git a/lib/librte_eal/linux/eal/meson.build b/lib/librte_eal/linux/eal/meson.build
new file mode 100644 (file)
index 0000000..7e68b2c
--- /dev/null
@@ -0,0 +1,29 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2017 Intel Corporation
+
+eal_inc += include_directories('include')
+install_subdir('include/exec-env', install_dir: get_option('includedir'))
+
+env_objs = []
+env_headers = []
+env_sources = files('eal_alarm.c',
+               'eal_cpuflags.c',
+               'eal_debug.c',
+               'eal_hugepage_info.c',
+               'eal_interrupts.c',
+               'eal_memalloc.c',
+               'eal_lcore.c',
+               'eal_log.c',
+               'eal_thread.c',
+               'eal_timer.c',
+               'eal_vfio.c',
+               'eal_vfio_mp_sync.c',
+               'eal.c',
+               'eal_memory.c',
+               'eal_dev.c',
+)
+
+deps += ['kvargs']
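+
+# Note: NUMA-aware hugepage support is only switched on when libnuma was
+# found at configure time, mirroring CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES in
+# the legacy make build.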
+if has_libnuma == 1
+       dpdk_conf.set10('RTE_EAL_NUMA_AWARE_HUGEPAGES', true)
+endif
diff --git a/lib/librte_eal/linuxapp/Makefile b/lib/librte_eal/linuxapp/Makefile
deleted file mode 100644 (file)
index a0fffa9..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
-# SPDX-License-Identifier: BSD-3-Clause
-# Copyright(c) 2010-2014 Intel Corporation
-
-include $(RTE_SDK)/mk/rte.vars.mk
-
-DIRS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal
-DEPDIRS-kni := eal
-
-CFLAGS += -DALLOW_EXPERIMENTAL_API
-
-include $(RTE_SDK)/mk/rte.subdir.mk
diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
deleted file mode 100644 (file)
index 51deb57..0000000
+++ /dev/null
@@ -1,101 +0,0 @@
-# SPDX-License-Identifier: BSD-3-Clause
-# Copyright(c) 2010-2016 Intel Corporation
-
-include $(RTE_SDK)/mk/rte.vars.mk
-
-LIB = librte_eal.a
-
-ARCH_DIR ?= $(RTE_ARCH)
-
-EXPORT_MAP := ../../rte_eal_version.map
-VPATH += $(RTE_SDK)/lib/librte_eal/common/arch/$(ARCH_DIR)
-
-LIBABIVER := 9
-
-VPATH += $(RTE_SDK)/lib/librte_eal/common
-
-CFLAGS += -DALLOW_EXPERIMENTAL_API
-CFLAGS += -I$(SRCDIR)/include
-CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common
-CFLAGS += -I$(RTE_SDK)/lib/librte_eal/common/include
-CFLAGS += $(WERROR_FLAGS) -O3
-
-LDLIBS += -ldl
-LDLIBS += -lpthread
-LDLIBS += -lgcc_s
-LDLIBS += -lrt
-LDLIBS += -lrte_kvargs
-ifeq ($(CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES),y)
-LDLIBS += -lnuma
-endif
-
-# specific to linuxapp exec-env
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) := eal.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_cpuflags.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_hugepage_info.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_memory.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_thread.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_log.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_vfio.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_vfio_mp_sync.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_memalloc.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_debug.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_lcore.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_timer.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_interrupts.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_alarm.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_dev.c
-
-# from common dir
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_lcore.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_timer.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memzone.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_log.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_launch.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memalloc.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_memory.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_tailqs.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_errno.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_cpuflags.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_hypervisor.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_string_fns.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_hexdump.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_devargs.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_class.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_bus.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_dev.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_options.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_thread.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_proc.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_fbarray.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_uuid.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_malloc.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += hotplug_mp.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_elem.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_heap.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_mp.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_keepalive.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_option.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_service.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_reciprocal.c
-
-# from arch dir
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_cpuflags.c
-SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_hypervisor.c
-SRCS-$(CONFIG_RTE_ARCH_X86) += rte_spinlock.c
-SRCS-y += rte_cycles.c
-
-CFLAGS_eal_common_cpuflags.o := $(CPUFLAGS_LIST)
-
-# workaround for a gcc bug with noreturn attribute
-# http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603
-ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y)
-CFLAGS_eal_thread.o += -Wno-return-type
-endif
-
-INC := rte_kni_common.h
-
-SYMLINK-$(CONFIG_RTE_EXEC_ENV_LINUXAPP)-include/exec-env := \
-       $(addprefix include/exec-env/,$(INC))
-
-include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
deleted file mode 100644 (file)
index 13f4016..0000000
+++ /dev/null
@@ -1,1336 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2018 Intel Corporation.
- * Copyright(c) 2012-2014 6WIND S.A.
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <string.h>
-#include <stdarg.h>
-#include <unistd.h>
-#include <pthread.h>
-#include <syslog.h>
-#include <getopt.h>
-#include <sys/file.h>
-#include <dirent.h>
-#include <fcntl.h>
-#include <fnmatch.h>
-#include <stddef.h>
-#include <errno.h>
-#include <limits.h>
-#include <sys/mman.h>
-#include <sys/queue.h>
-#include <sys/stat.h>
-#if defined(RTE_ARCH_X86)
-#include <sys/io.h>
-#endif
-
-#include <rte_compat.h>
-#include <rte_common.h>
-#include <rte_debug.h>
-#include <rte_memory.h>
-#include <rte_launch.h>
-#include <rte_eal.h>
-#include <rte_eal_memconfig.h>
-#include <rte_errno.h>
-#include <rte_per_lcore.h>
-#include <rte_lcore.h>
-#include <rte_service_component.h>
-#include <rte_log.h>
-#include <rte_random.h>
-#include <rte_cycles.h>
-#include <rte_string_fns.h>
-#include <rte_cpuflags.h>
-#include <rte_interrupts.h>
-#include <rte_bus.h>
-#include <rte_dev.h>
-#include <rte_devargs.h>
-#include <rte_version.h>
-#include <rte_atomic.h>
-#include <malloc_heap.h>
-#include <rte_vfio.h>
-#include <rte_option.h>
-
-#include "eal_private.h"
-#include "eal_thread.h"
-#include "eal_internal_cfg.h"
-#include "eal_filesystem.h"
-#include "eal_hugepages.h"
-#include "eal_options.h"
-#include "eal_vfio.h"
-
-#define MEMSIZE_IF_NO_HUGE_PAGE (64ULL * 1024ULL * 1024ULL)
-
-#define SOCKET_MEM_STRLEN (RTE_MAX_NUMA_NODES * 10)
-
-/* Allow the application to print its usage message too if set */
-static rte_usage_hook_t        rte_application_usage_hook = NULL;
-
-/* early configuration structure, when memory config is not mmapped */
-static struct rte_mem_config early_mem_config;
-
-/* define fd variable here, because file needs to be kept open for the
- * duration of the program, as we hold a write lock on it in the primary proc */
-static int mem_cfg_fd = -1;
-
-static struct flock wr_lock = {
-               .l_type = F_WRLCK,
-               .l_whence = SEEK_SET,
-               .l_start = offsetof(struct rte_mem_config, memsegs),
-               .l_len = sizeof(early_mem_config.memsegs),
-};
-
-/* Address of global and public configuration */
-static struct rte_config rte_config = {
-               .mem_config = &early_mem_config,
-};
-
-/* internal configuration (per-core) */
-struct lcore_config lcore_config[RTE_MAX_LCORE];
-
-/* internal configuration */
-struct internal_config internal_config;
-
-/* used by rte_rdtsc() */
-int rte_cycles_vmware_tsc_map;
-
-/* platform-specific runtime dir */
-static char runtime_dir[PATH_MAX];
-
-static const char *default_runtime_dir = "/var/run";
-
-int
-eal_create_runtime_dir(void)
-{
-       const char *directory = default_runtime_dir;
-       const char *xdg_runtime_dir = getenv("XDG_RUNTIME_DIR");
-       const char *fallback = "/tmp";
-       char tmp[PATH_MAX];
-       int ret;
-
-       if (getuid() != 0) {
-               /* try XDG path first, fall back to /tmp */
-               if (xdg_runtime_dir != NULL)
-                       directory = xdg_runtime_dir;
-               else
-                       directory = fallback;
-       }
-       /* create DPDK subdirectory under runtime dir */
-       ret = snprintf(tmp, sizeof(tmp), "%s/dpdk", directory);
-       if (ret < 0 || ret == sizeof(tmp)) {
-               RTE_LOG(ERR, EAL, "Error creating DPDK runtime path name\n");
-               return -1;
-       }
-
-       /* create prefix-specific subdirectory under DPDK runtime dir */
-       ret = snprintf(runtime_dir, sizeof(runtime_dir), "%s/%s",
-                       tmp, eal_get_hugefile_prefix());
-       if (ret < 0 || ret == sizeof(runtime_dir)) {
-               RTE_LOG(ERR, EAL, "Error creating prefix-specific runtime path name\n");
-               return -1;
-       }
-
-       /* create the path if it doesn't exist. no "mkdir -p" here, so do it
-        * step by step.
-        */
-       ret = mkdir(tmp, 0700);
-       if (ret < 0 && errno != EEXIST) {
-               RTE_LOG(ERR, EAL, "Error creating '%s': %s\n",
-                       tmp, strerror(errno));
-               return -1;
-       }
-
-       ret = mkdir(runtime_dir, 0700);
-       if (ret < 0 && errno != EEXIST) {
-               RTE_LOG(ERR, EAL, "Error creating '%s': %s\n",
-                       runtime_dir, strerror(errno));
-               return -1;
-       }
-
-       return 0;
-}
-
-int
-eal_clean_runtime_dir(void)
-{
-       DIR *dir;
-       struct dirent *dirent;
-       int dir_fd, fd, lck_result;
-       static const char * const filters[] = {
-               "fbarray_*",
-               "mp_socket_*"
-       };
-
-       /* open directory */
-       dir = opendir(runtime_dir);
-       if (!dir) {
-               RTE_LOG(ERR, EAL, "Unable to open runtime directory %s\n",
-                               runtime_dir);
-               goto error;
-       }
-       dir_fd = dirfd(dir);
-
-       /* lock the directory before doing anything, to avoid races */
-       if (flock(dir_fd, LOCK_EX) < 0) {
-               RTE_LOG(ERR, EAL, "Unable to lock runtime directory %s\n",
-                       runtime_dir);
-               goto error;
-       }
-
-       dirent = readdir(dir);
-       if (!dirent) {
-               RTE_LOG(ERR, EAL, "Unable to read runtime directory %s\n",
-                               runtime_dir);
-               goto error;
-       }
-
-       while (dirent != NULL) {
-               unsigned int f_idx;
-               bool skip = true;
-
-               /* skip files that don't match the patterns */
-               for (f_idx = 0; f_idx < RTE_DIM(filters); f_idx++) {
-                       const char *filter = filters[f_idx];
-
-                       if (fnmatch(filter, dirent->d_name, 0) == 0) {
-                               skip = false;
-                               break;
-                       }
-               }
-               if (skip) {
-                       dirent = readdir(dir);
-                       continue;
-               }
-
-               /* try and lock the file */
-               fd = openat(dir_fd, dirent->d_name, O_RDONLY);
-
-               /* skip to next file */
-               if (fd == -1) {
-                       dirent = readdir(dir);
-                       continue;
-               }
-
-               /* non-blocking lock */
-               lck_result = flock(fd, LOCK_EX | LOCK_NB);
-
-               /* if lock succeeds, remove the file */
-               if (lck_result != -1)
-                       unlinkat(dir_fd, dirent->d_name, 0);
-               close(fd);
-               dirent = readdir(dir);
-       }
-
-       /* closedir closes dir_fd and drops the lock */
-       closedir(dir);
-       return 0;
-
-error:
-       if (dir)
-               closedir(dir);
-
-       RTE_LOG(ERR, EAL, "Error while clearing runtime dir: %s\n",
-               strerror(errno));
-
-       return -1;
-}
-
-const char *
-rte_eal_get_runtime_dir(void)
-{
-       return runtime_dir;
-}
-
-/* Return user provided mbuf pool ops name */
-const char *
-rte_eal_mbuf_user_pool_ops(void)
-{
-       return internal_config.user_mbuf_pool_ops_name;
-}
-
-/* Return a pointer to the configuration structure */
-struct rte_config *
-rte_eal_get_configuration(void)
-{
-       return &rte_config;
-}
-
-enum rte_iova_mode
-rte_eal_iova_mode(void)
-{
-       return rte_eal_get_configuration()->iova_mode;
-}
-
-/* parse a sysfs (or other) file containing one integer value */
-int
-eal_parse_sysfs_value(const char *filename, unsigned long *val)
-{
-       FILE *f;
-       char buf[BUFSIZ];
-       char *end = NULL;
-
-       if ((f = fopen(filename, "r")) == NULL) {
-               RTE_LOG(ERR, EAL, "%s(): cannot open sysfs value %s\n",
-                       __func__, filename);
-               return -1;
-       }
-
-       if (fgets(buf, sizeof(buf), f) == NULL) {
-               RTE_LOG(ERR, EAL, "%s(): cannot read sysfs value %s\n",
-                       __func__, filename);
-               fclose(f);
-               return -1;
-       }
-       *val = strtoul(buf, &end, 0);
-       if ((buf[0] == '\0') || (end == NULL) || (*end != '\n')) {
-               RTE_LOG(ERR, EAL, "%s(): cannot parse sysfs value %s\n",
-                               __func__, filename);
-               fclose(f);
-               return -1;
-       }
-       fclose(f);
-       return 0;
-}
-
-
-/* create memory configuration in shared/mmap memory. Take out
- * a write lock on the memsegs, so we can auto-detect primary/secondary.
- * This means we never close the file while running (auto-close on exit).
- * We also don't lock the whole file, so that in future we can use read-locks
- * on other parts, e.g. memzones, to detect if there are running secondary
- * processes. */
-static void
-rte_eal_config_create(void)
-{
-       void *rte_mem_cfg_addr;
-       int retval;
-
-       const char *pathname = eal_runtime_config_path();
-
-       if (internal_config.no_shconf)
-               return;
-
-       /* map the config before hugepage address so that we don't waste a page */
-       if (internal_config.base_virtaddr != 0)
-               rte_mem_cfg_addr = (void *)
-                       RTE_ALIGN_FLOOR(internal_config.base_virtaddr -
-                       sizeof(struct rte_mem_config), sysconf(_SC_PAGE_SIZE));
-       else
-               rte_mem_cfg_addr = NULL;
-
-       if (mem_cfg_fd < 0){
-               mem_cfg_fd = open(pathname, O_RDWR | O_CREAT, 0660);
-               if (mem_cfg_fd < 0)
-                       rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
-       }
-
-       retval = ftruncate(mem_cfg_fd, sizeof(*rte_config.mem_config));
-       if (retval < 0){
-               close(mem_cfg_fd);
-               rte_panic("Cannot resize '%s' for rte_mem_config\n", pathname);
-       }
-
-       retval = fcntl(mem_cfg_fd, F_SETLK, &wr_lock);
-       if (retval < 0){
-               close(mem_cfg_fd);
-               rte_exit(EXIT_FAILURE, "Cannot create lock on '%s'. Is another primary "
-                               "process running?\n", pathname);
-       }
-
-       rte_mem_cfg_addr = mmap(rte_mem_cfg_addr, sizeof(*rte_config.mem_config),
-                               PROT_READ | PROT_WRITE, MAP_SHARED, mem_cfg_fd, 0);
-
-       if (rte_mem_cfg_addr == MAP_FAILED){
-               rte_panic("Cannot mmap memory for rte_config\n");
-       }
-       memcpy(rte_mem_cfg_addr, &early_mem_config, sizeof(early_mem_config));
-       rte_config.mem_config = rte_mem_cfg_addr;
-
-       /* store address of the config in the config itself so that secondary
-        * processes could later map the config into this exact location */
-       rte_config.mem_config->mem_cfg_addr = (uintptr_t) rte_mem_cfg_addr;
-
-       rte_config.mem_config->dma_maskbits = 0;
-
-}
-
-/* attach to an existing shared memory config */
-static void
-rte_eal_config_attach(void)
-{
-       struct rte_mem_config *mem_config;
-
-       const char *pathname = eal_runtime_config_path();
-
-       if (internal_config.no_shconf)
-               return;
-
-       if (mem_cfg_fd < 0){
-               mem_cfg_fd = open(pathname, O_RDWR);
-               if (mem_cfg_fd < 0)
-                       rte_panic("Cannot open '%s' for rte_mem_config\n", pathname);
-       }
-
-       /* map it as read-only first */
-       mem_config = (struct rte_mem_config *) mmap(NULL, sizeof(*mem_config),
-                       PROT_READ, MAP_SHARED, mem_cfg_fd, 0);
-       if (mem_config == MAP_FAILED)
-               rte_panic("Cannot mmap memory for rte_config! error %i (%s)\n",
-                         errno, strerror(errno));
-
-       rte_config.mem_config = mem_config;
-}
-
-/* reattach the shared config at exact memory location primary process has it */
-static void
-rte_eal_config_reattach(void)
-{
-       struct rte_mem_config *mem_config;
-       void *rte_mem_cfg_addr;
-
-       if (internal_config.no_shconf)
-               return;
-
-       /* save the address primary process has mapped shared config to */
-       rte_mem_cfg_addr = (void *) (uintptr_t) rte_config.mem_config->mem_cfg_addr;
-
-       /* unmap original config */
-       munmap(rte_config.mem_config, sizeof(struct rte_mem_config));
-
-       /* remap the config at proper address */
-       mem_config = (struct rte_mem_config *) mmap(rte_mem_cfg_addr,
-                       sizeof(*mem_config), PROT_READ | PROT_WRITE, MAP_SHARED,
-                       mem_cfg_fd, 0);
-       if (mem_config == MAP_FAILED || mem_config != rte_mem_cfg_addr) {
-               if (mem_config != MAP_FAILED)
-                       /* errno is stale, don't use */
-                       rte_panic("Cannot mmap memory for rte_config at [%p], got [%p]"
-                                 " - please use '--base-virtaddr' option\n",
-                                 rte_mem_cfg_addr, mem_config);
-               else
-                       rte_panic("Cannot mmap memory for rte_config! error %i (%s)\n",
-                                 errno, strerror(errno));
-       }
-       close(mem_cfg_fd);
-
-       rte_config.mem_config = mem_config;
-}
-
-/* Detect if we are a primary or a secondary process */
-enum rte_proc_type_t
-eal_proc_type_detect(void)
-{
-       enum rte_proc_type_t ptype = RTE_PROC_PRIMARY;
-       const char *pathname = eal_runtime_config_path();
-
-       /* if there is no shared config, there can be no secondary processes */
-       if (!internal_config.no_shconf) {
-               /* if we can open the file but not get a write-lock we are a
-                * secondary process. NOTE: if we get a file handle back, we
-                * keep that open and don't close it to prevent a race condition
-                * between multiple opens.
-                */
-               if (((mem_cfg_fd = open(pathname, O_RDWR)) >= 0) &&
-                               (fcntl(mem_cfg_fd, F_SETLK, &wr_lock) < 0))
-                       ptype = RTE_PROC_SECONDARY;
-       }
-
-       RTE_LOG(INFO, EAL, "Auto-detected process type: %s\n",
-                       ptype == RTE_PROC_PRIMARY ? "PRIMARY" : "SECONDARY");
-
-       return ptype;
-}
-
-/* copies data from internal config to shared config */
-static void
-eal_update_mem_config(void)
-{
-       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
-       mcfg->legacy_mem = internal_config.legacy_mem;
-       mcfg->single_file_segments = internal_config.single_file_segments;
-}
-
-/* copies data from shared config to internal config */
-static void
-eal_update_internal_config(void)
-{
-       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
-       internal_config.legacy_mem = mcfg->legacy_mem;
-       internal_config.single_file_segments = mcfg->single_file_segments;
-}
-
-/* Sets up rte_config structure with the pointer to shared memory config.*/
-static void
-rte_config_init(void)
-{
-       rte_config.process_type = internal_config.process_type;
-
-       switch (rte_config.process_type){
-       case RTE_PROC_PRIMARY:
-               rte_eal_config_create();
-               eal_update_mem_config();
-               break;
-       case RTE_PROC_SECONDARY:
-               rte_eal_config_attach();
-               rte_eal_mcfg_wait_complete(rte_config.mem_config);
-               rte_eal_config_reattach();
-               eal_update_internal_config();
-               break;
-       case RTE_PROC_AUTO:
-       case RTE_PROC_INVALID:
-               rte_panic("Invalid process type\n");
-       }
-}
-
-/* Unlocks hugepage directories that were locked by eal_hugepage_info_init */
-static void
-eal_hugedirs_unlock(void)
-{
-       int i;
-
-       for (i = 0; i < MAX_HUGEPAGE_SIZES; i++)
-       {
-               /* skip uninitialized */
-               if (internal_config.hugepage_info[i].lock_descriptor < 0)
-                       continue;
-               /* unlock hugepage file */
-               flock(internal_config.hugepage_info[i].lock_descriptor, LOCK_UN);
-               close(internal_config.hugepage_info[i].lock_descriptor);
-               /* reset the field */
-               internal_config.hugepage_info[i].lock_descriptor = -1;
-       }
-}
-
-/* display usage */
-static void
-eal_usage(const char *prgname)
-{
-       printf("\nUsage: %s ", prgname);
-       eal_common_usage();
-       printf("EAL Linux options:\n"
-              "  --"OPT_SOCKET_MEM"        Memory to allocate on sockets (comma separated values)\n"
-              "  --"OPT_SOCKET_LIMIT"      Limit memory allocation on sockets (comma separated values)\n"
-              "  --"OPT_HUGE_DIR"          Directory where hugetlbfs is mounted\n"
-              "  --"OPT_FILE_PREFIX"       Prefix for hugepage filenames\n"
-              "  --"OPT_BASE_VIRTADDR"     Base virtual address\n"
-              "  --"OPT_CREATE_UIO_DEV"    Create /dev/uioX (usually done by hotplug)\n"
-              "  --"OPT_VFIO_INTR"         Interrupt mode for VFIO (legacy|msi|msix)\n"
-              "  --"OPT_LEGACY_MEM"        Legacy memory mode (no dynamic allocation, contiguous segments)\n"
-              "  --"OPT_SINGLE_FILE_SEGMENTS" Put all hugepage memory in single files\n"
-              "  --"OPT_MATCH_ALLOCATIONS" Free hugepages exactly as allocated\n"
-              "\n");
-       /* Allow the application to print its usage message too if hook is set */
-       if ( rte_application_usage_hook ) {
-               printf("===== Application Usage =====\n\n");
-               rte_application_usage_hook(prgname);
-       }
-}
-
-/* Set a per-application usage message */
-rte_usage_hook_t
-rte_set_application_usage_hook( rte_usage_hook_t usage_func )
-{
-       rte_usage_hook_t        old_func;
-
-       /* Will be NULL on the first call to denote the last usage routine. */
-       old_func                                        = rte_application_usage_hook;
-       rte_application_usage_hook      = usage_func;
-
-       return old_func;
-}
-
-static int
-eal_parse_socket_arg(char *strval, volatile uint64_t *socket_arg)
-{
-       char * arg[RTE_MAX_NUMA_NODES];
-       char *end;
-       int arg_num, i, len;
-       uint64_t total_mem = 0;
-
-       len = strnlen(strval, SOCKET_MEM_STRLEN);
-       if (len == SOCKET_MEM_STRLEN) {
-               RTE_LOG(ERR, EAL, "--socket-mem is too long\n");
-               return -1;
-       }
-
-       /* all other error cases will be caught later */
-       if (!isdigit(strval[len-1]))
-               return -1;
-
-       /* split the optarg into separate socket values */
-       arg_num = rte_strsplit(strval, len,
-                       arg, RTE_MAX_NUMA_NODES, ',');
-
-       /* if split failed, or 0 arguments */
-       if (arg_num <= 0)
-               return -1;
-
-       /* parse each defined socket option */
-       errno = 0;
-       for (i = 0; i < arg_num; i++) {
-               uint64_t val;
-               end = NULL;
-               val = strtoull(arg[i], &end, 10);
-
-               /* check for invalid input */
-               if ((errno != 0)  ||
-                               (arg[i][0] == '\0') || (end == NULL) || (*end != '\0'))
-                       return -1;
-               val <<= 20;
-               total_mem += val;
-               socket_arg[i] = val;
-       }
-
-       return 0;
-}
-
-static int
-eal_parse_base_virtaddr(const char *arg)
-{
-       char *end;
-       uint64_t addr;
-
-       errno = 0;
-       addr = strtoull(arg, &end, 16);
-
-       /* check for errors */
-       if ((errno != 0) || (arg[0] == '\0') || end == NULL || (*end != '\0'))
-               return -1;
-
-       /* make sure we don't exceed 32-bit boundary on 32-bit target */
-#ifndef RTE_ARCH_64
-       if (addr >= UINTPTR_MAX)
-               return -1;
-#endif
-
-       /* align the addr on 16M boundary, 16MB is the minimum huge page
-        * size on IBM Power architecture. If the addr is aligned to 16MB,
-        * it can align to 2MB for x86. So this alignment can also be used
-        * on x86 */
-       internal_config.base_virtaddr =
-               RTE_PTR_ALIGN_CEIL((uintptr_t)addr, (size_t)RTE_PGSIZE_16M);
-
-       return 0;
-}
-
-static int
-eal_parse_vfio_intr(const char *mode)
-{
-       unsigned i;
-       static struct {
-               const char *name;
-               enum rte_intr_mode value;
-       } map[] = {
-               { "legacy", RTE_INTR_MODE_LEGACY },
-               { "msi", RTE_INTR_MODE_MSI },
-               { "msix", RTE_INTR_MODE_MSIX },
-       };
-
-       for (i = 0; i < RTE_DIM(map); i++) {
-               if (!strcmp(mode, map[i].name)) {
-                       internal_config.vfio_intr_mode = map[i].value;
-                       return 0;
-               }
-       }
-       return -1;
-}
-
-/* Parse the arguments for --log-level only */
-static void
-eal_log_level_parse(int argc, char **argv)
-{
-       int opt;
-       char **argvopt;
-       int option_index;
-       const int old_optind = optind;
-       const int old_optopt = optopt;
-       char * const old_optarg = optarg;
-
-       argvopt = argv;
-       optind = 1;
-
-       while ((opt = getopt_long(argc, argvopt, eal_short_options,
-                                 eal_long_options, &option_index)) != EOF) {
-
-               int ret;
-
-               /* getopt is not happy, stop right now */
-               if (opt == '?')
-                       break;
-
-               ret = (opt == OPT_LOG_LEVEL_NUM) ?
-                       eal_parse_common_option(opt, optarg, &internal_config) : 0;
-
-               /* common parser is not happy */
-               if (ret < 0)
-                       break;
-       }
-
-       /* restore getopt lib */
-       optind = old_optind;
-       optopt = old_optopt;
-       optarg = old_optarg;
-}
-
-/* Parse the argument given in the command line of the application */
-static int
-eal_parse_args(int argc, char **argv)
-{
-       int opt, ret;
-       char **argvopt;
-       int option_index;
-       char *prgname = argv[0];
-       const int old_optind = optind;
-       const int old_optopt = optopt;
-       char * const old_optarg = optarg;
-
-       argvopt = argv;
-       optind = 1;
-       opterr = 0;
-
-       while ((opt = getopt_long(argc, argvopt, eal_short_options,
-                                 eal_long_options, &option_index)) != EOF) {
-
-               /*
-                * getopt didn't recognise the option, lets parse the
-                * registered options to see if the flag is valid
-                */
-               if (opt == '?') {
-                       ret = rte_option_parse(argv[optind-1]);
-                       if (ret == 0)
-                               continue;
-
-                       eal_usage(prgname);
-                       ret = -1;
-                       goto out;
-               }
-
-               ret = eal_parse_common_option(opt, optarg, &internal_config);
-               /* common parser is not happy */
-               if (ret < 0) {
-                       eal_usage(prgname);
-                       ret = -1;
-                       goto out;
-               }
-               /* common parser handled this option */
-               if (ret == 0)
-                       continue;
-
-               switch (opt) {
-               case 'h':
-                       eal_usage(prgname);
-                       exit(EXIT_SUCCESS);
-
-               case OPT_HUGE_DIR_NUM:
-               {
-                       char *hdir = strdup(optarg);
-                       if (hdir == NULL)
-                               RTE_LOG(ERR, EAL, "Could not store hugepage directory\n");
-                       else {
-                               /* free old hugepage dir */
-                               if (internal_config.hugepage_dir != NULL)
-                                       free(internal_config.hugepage_dir);
-                               internal_config.hugepage_dir = hdir;
-                       }
-                       break;
-               }
-               case OPT_FILE_PREFIX_NUM:
-               {
-                       char *prefix = strdup(optarg);
-                       if (prefix == NULL)
-                               RTE_LOG(ERR, EAL, "Could not store file prefix\n");
-                       else {
-                               /* free old prefix */
-                               if (internal_config.hugefile_prefix != NULL)
-                                       free(internal_config.hugefile_prefix);
-                               internal_config.hugefile_prefix = prefix;
-                       }
-                       break;
-               }
-               case OPT_SOCKET_MEM_NUM:
-                       if (eal_parse_socket_arg(optarg,
-                                       internal_config.socket_mem) < 0) {
-                               RTE_LOG(ERR, EAL, "invalid parameters for --"
-                                               OPT_SOCKET_MEM "\n");
-                               eal_usage(prgname);
-                               ret = -1;
-                               goto out;
-                       }
-                       internal_config.force_sockets = 1;
-                       break;
-
-               case OPT_SOCKET_LIMIT_NUM:
-                       if (eal_parse_socket_arg(optarg,
-                                       internal_config.socket_limit) < 0) {
-                               RTE_LOG(ERR, EAL, "invalid parameters for --"
-                                               OPT_SOCKET_LIMIT "\n");
-                               eal_usage(prgname);
-                               ret = -1;
-                               goto out;
-                       }
-                       internal_config.force_socket_limits = 1;
-                       break;
-
-               case OPT_BASE_VIRTADDR_NUM:
-                       if (eal_parse_base_virtaddr(optarg) < 0) {
-                               RTE_LOG(ERR, EAL, "invalid parameter for --"
-                                               OPT_BASE_VIRTADDR "\n");
-                               eal_usage(prgname);
-                               ret = -1;
-                               goto out;
-                       }
-                       break;
-
-               case OPT_VFIO_INTR_NUM:
-                       if (eal_parse_vfio_intr(optarg) < 0) {
-                               RTE_LOG(ERR, EAL, "invalid parameters for --"
-                                               OPT_VFIO_INTR "\n");
-                               eal_usage(prgname);
-                               ret = -1;
-                               goto out;
-                       }
-                       break;
-
-               case OPT_CREATE_UIO_DEV_NUM:
-                       internal_config.create_uio_dev = 1;
-                       break;
-
-               case OPT_MBUF_POOL_OPS_NAME_NUM:
-               {
-                       char *ops_name = strdup(optarg);
-                       if (ops_name == NULL)
-                               RTE_LOG(ERR, EAL, "Could not store mbuf pool ops name\n");
-                       else {
-                               /* free old ops name */
-                               if (internal_config.user_mbuf_pool_ops_name !=
-                                               NULL)
-                                       free(internal_config.user_mbuf_pool_ops_name);
-
-                               internal_config.user_mbuf_pool_ops_name =
-                                               ops_name;
-                       }
-                       break;
-               }
-               case OPT_MATCH_ALLOCATIONS_NUM:
-                       internal_config.match_allocations = 1;
-                       break;
-
-               default:
-                       if (opt < OPT_LONG_MIN_NUM && isprint(opt)) {
-                               RTE_LOG(ERR, EAL, "Option %c is not supported "
-                                       "on Linux\n", opt);
-                       } else if (opt >= OPT_LONG_MIN_NUM &&
-                                  opt < OPT_LONG_MAX_NUM) {
-                               RTE_LOG(ERR, EAL, "Option %s is not supported "
-                                       "on Linux\n",
-                                       eal_long_options[option_index].name);
-                       } else {
-                               RTE_LOG(ERR, EAL, "Option %d is not supported "
-                                       "on Linux\n", opt);
-                       }
-                       eal_usage(prgname);
-                       ret = -1;
-                       goto out;
-               }
-       }
-
-       /* create runtime data directory */
-       if (internal_config.no_shconf == 0 &&
-                       eal_create_runtime_dir() < 0) {
-               RTE_LOG(ERR, EAL, "Cannot create runtime directory\n");
-               ret = -1;
-               goto out;
-       }
-
-       if (eal_adjust_config(&internal_config) != 0) {
-               ret = -1;
-               goto out;
-       }
-
-       /* sanity checks */
-       if (eal_check_common_options(&internal_config) != 0) {
-               eal_usage(prgname);
-               ret = -1;
-               goto out;
-       }
-
-       if (optind >= 0)
-               argv[optind-1] = prgname;
-       ret = optind-1;
-
-out:
-       /* restore getopt lib */
-       optind = old_optind;
-       optopt = old_optopt;
-       optarg = old_optarg;
-
-       return ret;
-}
-
-static int
-check_socket(const struct rte_memseg_list *msl, void *arg)
-{
-       int *socket_id = arg;
-
-       if (msl->external)
-               return 0;
-
-       return *socket_id == msl->socket_id;
-}
-
-static void
-eal_check_mem_on_local_socket(void)
-{
-       int socket_id;
-
-       socket_id = rte_lcore_to_socket_id(rte_config.master_lcore);
-
-       if (rte_memseg_list_walk(check_socket, &socket_id) == 0)
-               RTE_LOG(WARNING, EAL, "WARNING: Master core has no memory on local socket!\n");
-}
-
-static int
-sync_func(__attribute__((unused)) void *arg)
-{
-       return 0;
-}
-
-inline static void
-rte_eal_mcfg_complete(void)
-{
-       /* ALL shared mem_config related INIT DONE */
-       if (rte_config.process_type == RTE_PROC_PRIMARY)
-               rte_config.mem_config->magic = RTE_MAGIC;
-
-       internal_config.init_complete = 1;
-}
-
-/*
- * Request iopl privilege for all RPL, returns 0 on success
- * iopl() call is mostly for the i386 architecture. For other architectures,
- * return -1 to indicate IO privilege can't be changed in this way.
- */
-int
-rte_eal_iopl_init(void)
-{
-#if defined(RTE_ARCH_X86)
-       if (iopl(3) != 0)
-               return -1;
-#endif
-       return 0;
-}
-
-#ifdef VFIO_PRESENT
-static int rte_eal_vfio_setup(void)
-{
-       if (rte_vfio_enable("vfio"))
-               return -1;
-
-       return 0;
-}
-#endif
-
-static void rte_eal_init_alert(const char *msg)
-{
-       fprintf(stderr, "EAL: FATAL: %s\n", msg);
-       RTE_LOG(ERR, EAL, "%s\n", msg);
-}
-
-/* Launch threads, called at application init(). */
-int
-rte_eal_init(int argc, char **argv)
-{
-       int i, fctret, ret;
-       pthread_t thread_id;
-       static rte_atomic32_t run_once = RTE_ATOMIC32_INIT(0);
-       const char *p;
-       static char logid[PATH_MAX];
-       char cpuset[RTE_CPU_AFFINITY_STR_LEN];
-       char thread_name[RTE_MAX_THREAD_NAME_LEN];
-
-       /* checks if the machine is adequate */
-       if (!rte_cpu_is_supported()) {
-               rte_eal_init_alert("unsupported cpu type.");
-               rte_errno = ENOTSUP;
-               return -1;
-       }
-
-       if (!rte_atomic32_test_and_set(&run_once)) {
-               rte_eal_init_alert("already called initialization.");
-               rte_errno = EALREADY;
-               return -1;
-       }
-
-       p = strrchr(argv[0], '/');
-       strlcpy(logid, p ? p + 1 : argv[0], sizeof(logid));
-       thread_id = pthread_self();
-
-       eal_reset_internal_config(&internal_config);
-
-       /* set log level as early as possible */
-       eal_log_level_parse(argc, argv);
-
-       if (rte_eal_cpu_init() < 0) {
-               rte_eal_init_alert("Cannot detect lcores.");
-               rte_errno = ENOTSUP;
-               return -1;
-       }
-
-       fctret = eal_parse_args(argc, argv);
-       if (fctret < 0) {
-               rte_eal_init_alert("Invalid 'command line' arguments.");
-               rte_errno = EINVAL;
-               rte_atomic32_clear(&run_once);
-               return -1;
-       }
-
-       if (eal_plugins_init() < 0) {
-               rte_eal_init_alert("Cannot init plugins");
-               rte_errno = EINVAL;
-               rte_atomic32_clear(&run_once);
-               return -1;
-       }
-
-       if (eal_option_device_parse()) {
-               rte_errno = ENODEV;
-               rte_atomic32_clear(&run_once);
-               return -1;
-       }
-
-       rte_config_init();
-
-       if (rte_eal_intr_init() < 0) {
-               rte_eal_init_alert("Cannot init interrupt-handling thread");
-               return -1;
-       }
-
-       /* Put mp channel init before bus scan so that we can init the vdev
-        * bus through mp channel in the secondary process before the bus scan.
-        */
-       if (rte_mp_channel_init() < 0) {
-               rte_eal_init_alert("failed to init mp channel");
-               if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
-                       rte_errno = EFAULT;
-                       return -1;
-               }
-       }
-
-       /* register multi-process action callbacks for hotplug */
-       if (rte_mp_dev_hotplug_init() < 0) {
-               rte_eal_init_alert("failed to register mp callback for hotplug");
-               return -1;
-       }
-
-       if (rte_bus_scan()) {
-               rte_eal_init_alert("Cannot scan the buses for devices");
-               rte_errno = ENODEV;
-               rte_atomic32_clear(&run_once);
-               return -1;
-       }
-
-       /* if no EAL option "--iova-mode=<pa|va>", use bus IOVA scheme */
-       if (internal_config.iova_mode == RTE_IOVA_DC) {
-               /* autodetect the IOVA mapping mode (default is RTE_IOVA_PA) */
-               rte_eal_get_configuration()->iova_mode =
-                       rte_bus_get_iommu_class();
-
-               /* Workaround for KNI which requires physical address to work */
-               if (rte_eal_get_configuration()->iova_mode == RTE_IOVA_VA &&
-                               rte_eal_check_module("rte_kni") == 1) {
-                       rte_eal_get_configuration()->iova_mode = RTE_IOVA_PA;
-                       RTE_LOG(WARNING, EAL,
-                               "Some devices want IOVA as VA but PA will be used because "
-                               "the KNI module is inserted\n");
-               }
-       } else {
-               rte_eal_get_configuration()->iova_mode =
-                       internal_config.iova_mode;
-       }
-
-       if (internal_config.no_hugetlbfs == 0) {
-               /* rte_config isn't initialized yet */
-               ret = internal_config.process_type == RTE_PROC_PRIMARY ?
-                               eal_hugepage_info_init() :
-                               eal_hugepage_info_read();
-               if (ret < 0) {
-                       rte_eal_init_alert("Cannot get hugepage information.");
-                       rte_errno = EACCES;
-                       rte_atomic32_clear(&run_once);
-                       return -1;
-               }
-       }
-
-       if (internal_config.memory == 0 && internal_config.force_sockets == 0) {
-               if (internal_config.no_hugetlbfs)
-                       internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE;
-       }
-
-       if (internal_config.vmware_tsc_map == 1) {
-#ifdef RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT
-               rte_cycles_vmware_tsc_map = 1;
-               RTE_LOG (DEBUG, EAL, "Using VMWARE TSC MAP, "
-                               "you must have monitor_control.pseudo_perfctr = TRUE\n");
-#else
-               RTE_LOG (WARNING, EAL, "Ignoring --vmware-tsc-map because "
-                               "RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT is not set\n");
-#endif
-       }
-
-       rte_srand(rte_rdtsc());
-
-       if (rte_eal_log_init(logid, internal_config.syslog_facility) < 0) {
-               rte_eal_init_alert("Cannot init logging.");
-               rte_errno = ENOMEM;
-               rte_atomic32_clear(&run_once);
-               return -1;
-       }
-
-#ifdef VFIO_PRESENT
-       if (rte_eal_vfio_setup() < 0) {
-               rte_eal_init_alert("Cannot init VFIO");
-               rte_errno = EAGAIN;
-               rte_atomic32_clear(&run_once);
-               return -1;
-       }
-#endif
-       /* in secondary processes, memory init may allocate additional fbarrays
-        * not present in primary processes, so to avoid any potential issues,
-        * initialize memzones first.
-        */
-       if (rte_eal_memzone_init() < 0) {
-               rte_eal_init_alert("Cannot init memzone");
-               rte_errno = ENODEV;
-               return -1;
-       }
-
-       if (rte_eal_memory_init() < 0) {
-               rte_eal_init_alert("Cannot init memory");
-               rte_errno = ENOMEM;
-               return -1;
-       }
-
-       /* the directories are locked during eal_hugepage_info_init */
-       eal_hugedirs_unlock();
-
-       if (rte_eal_malloc_heap_init() < 0) {
-               rte_eal_init_alert("Cannot init malloc heap");
-               rte_errno = ENODEV;
-               return -1;
-       }
-
-       if (rte_eal_tailqs_init() < 0) {
-               rte_eal_init_alert("Cannot init tail queues for objects");
-               rte_errno = EFAULT;
-               return -1;
-       }
-
-       if (rte_eal_alarm_init() < 0) {
-               rte_eal_init_alert("Cannot init interrupt-handling thread");
-               /* rte_eal_alarm_init sets rte_errno on failure. */
-               return -1;
-       }
-
-       if (rte_eal_timer_init() < 0) {
-               rte_eal_init_alert("Cannot init HPET or TSC timers");
-               rte_errno = ENOTSUP;
-               return -1;
-       }
-
-       eal_check_mem_on_local_socket();
-
-       eal_thread_init_master(rte_config.master_lcore);
-
-       ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset));
-
-       RTE_LOG(DEBUG, EAL, "Master lcore %u is ready (tid=%zx;cpuset=[%s%s])\n",
-               rte_config.master_lcore, (uintptr_t)thread_id, cpuset,
-               ret == 0 ? "" : "...");
-
-       RTE_LCORE_FOREACH_SLAVE(i) {
-
-               /*
-                * create communication pipes between master thread
-                * and children
-                */
-               if (pipe(lcore_config[i].pipe_master2slave) < 0)
-                       rte_panic("Cannot create pipe\n");
-               if (pipe(lcore_config[i].pipe_slave2master) < 0)
-                       rte_panic("Cannot create pipe\n");
-
-               lcore_config[i].state = WAIT;
-
-               /* create a thread for each lcore */
-               ret = pthread_create(&lcore_config[i].thread_id, NULL,
-                                    eal_thread_loop, NULL);
-               if (ret != 0)
-                       rte_panic("Cannot create thread\n");
-
-               /* Set thread_name for aid in debugging. */
-               snprintf(thread_name, sizeof(thread_name),
-                       "lcore-slave-%d", i);
-               ret = rte_thread_setname(lcore_config[i].thread_id,
-                                               thread_name);
-               if (ret != 0)
-                       RTE_LOG(DEBUG, EAL,
-                               "Cannot set name for lcore thread\n");
-       }
-
-       /*
-        * Launch a dummy function on all slave lcores, so that master lcore
-        * knows they are all ready when this function returns.
-        */
-       rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER);
-       rte_eal_mp_wait_lcore();
-
-       /* initialize services so vdevs register service during bus_probe. */
-       ret = rte_service_init();
-       if (ret) {
-               rte_eal_init_alert("rte_service_init() failed");
-               rte_errno = ENOEXEC;
-               return -1;
-       }
-
-       /* Probe all the buses and devices/drivers on them */
-       if (rte_bus_probe()) {
-               rte_eal_init_alert("Cannot probe devices");
-               rte_errno = ENOTSUP;
-               return -1;
-       }
-
-#ifdef VFIO_PRESENT
-       /* Register mp action after probe() so that we got enough info */
-       if (rte_vfio_is_enabled("vfio") && vfio_mp_sync_setup() < 0)
-               return -1;
-#endif
-
-       /* initialize default service/lcore mappings and start running. Ignore
-        * -ENOTSUP, as it indicates no service coremask passed to EAL.
-        */
-       ret = rte_service_start_with_defaults();
-       if (ret < 0 && ret != -ENOTSUP) {
-               rte_errno = ENOEXEC;
-               return -1;
-       }
-
-       /*
-        * Clean up unused files in runtime directory. We do this at the end of
-        * init and not at the beginning because we want to clean stuff up
-        * whether we are primary or secondary process, but we cannot remove
-        * primary process' files because secondary should be able to run even
-        * if primary process is dead.
-        *
-        * In no_shconf mode, no runtime directory is created in the first
-        * place, so no cleanup needed.
-        */
-       if (!internal_config.no_shconf && eal_clean_runtime_dir() < 0) {
-               rte_eal_init_alert("Cannot clear runtime directory");
-               return -1;
-       }
-
-       rte_eal_mcfg_complete();
-
-       /* Call each registered callback, if enabled */
-       rte_option_init();
-
-       return fctret;
-}
-
-static int
-mark_freeable(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
-               void *arg __rte_unused)
-{
-       /* ms is const, so find this memseg */
-       struct rte_memseg *found;
-
-       if (msl->external)
-               return 0;
-
-       found = rte_mem_virt2memseg(ms->addr, msl);
-
-       found->flags &= ~RTE_MEMSEG_FLAG_DO_NOT_FREE;
-
-       return 0;
-}
-
-int __rte_experimental
-rte_eal_cleanup(void)
-{
-       /* if we're in a primary process, we need to mark hugepages as freeable
-        * so that finalization can release them back to the system.
-        */
-       if (rte_eal_process_type() == RTE_PROC_PRIMARY)
-               rte_memseg_walk(mark_freeable, NULL);
-       rte_service_finalize();
-       rte_mp_channel_cleanup();
-       eal_cleanup_config(&internal_config);
-       return 0;
-}
-
-/* get core role */
-enum rte_lcore_role_t
-rte_eal_lcore_role(unsigned lcore_id)
-{
-       return rte_config.lcore_role[lcore_id];
-}
-
-enum rte_proc_type_t
-rte_eal_process_type(void)
-{
-       return rte_config.process_type;
-}
-
-int rte_eal_has_hugepages(void)
-{
-       return !internal_config.no_hugetlbfs;
-}
-
-int rte_eal_has_pci(void)
-{
-       return !internal_config.no_pci;
-}
-
-int rte_eal_create_uio_dev(void)
-{
-       return internal_config.create_uio_dev;
-}
-
-enum rte_intr_mode
-rte_eal_vfio_intr_mode(void)
-{
-       return internal_config.vfio_intr_mode;
-}
-
-int
-rte_eal_check_module(const char *module_name)
-{
-       char sysfs_mod_name[PATH_MAX];
-       struct stat st;
-       int n;
-
-       if (module_name == NULL)
-               return -1;
-
-       /* Check if sysfs is mounted */
-       if (stat("/sys/module", &st) != 0) {
-               RTE_LOG(DEBUG, EAL, "sysfs is not mounted! error %i (%s)\n",
-                       errno, strerror(errno));
-               return -1;
-       }
-
-       /* A module might be built-in, therefore try sysfs */
-       n = snprintf(sysfs_mod_name, PATH_MAX, "/sys/module/%s", module_name);
-       if (n < 0 || n >= PATH_MAX) {
-               RTE_LOG(DEBUG, EAL, "Could not format module path\n");
-               return -1;
-       }
-
-       if (stat(sysfs_mod_name, &st) != 0) {
-               RTE_LOG(DEBUG, EAL, "Module %s not found! error %i (%s)\n",
-                       sysfs_mod_name, errno, strerror(errno));
-               return 0;
-       }
-
-       /* Module has been found */
-       return 1;
-}
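
For illustration only (not part of this patch), a minimal sketch of how an application might use rte_eal_check_module(); the module names follow their /sys/module spelling and the logging policy is an assumption:

#include <rte_eal.h>
#include <rte_log.h>

/* Illustrative helper: report which kernel modules are present. */
static void
log_kernel_modules(void)
{
        if (rte_eal_check_module("vfio") == 1)
                RTE_LOG(INFO, EAL, "vfio module is loaded or built-in\n");
        if (rte_eal_check_module("uio_pci_generic") != 1)
                RTE_LOG(INFO, EAL, "uio_pci_generic module not found\n");
}
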
diff --git a/lib/librte_eal/linuxapp/eal/eal_alarm.c b/lib/librte_eal/linuxapp/eal/eal_alarm.c
deleted file mode 100644 (file)
index 840ede7..0000000
+++ /dev/null
@@ -1,243 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
- */
-#include <stdio.h>
-#include <stdint.h>
-#include <signal.h>
-#include <errno.h>
-#include <string.h>
-#include <sys/queue.h>
-#include <sys/time.h>
-#include <sys/timerfd.h>
-
-#include <rte_memory.h>
-#include <rte_interrupts.h>
-#include <rte_alarm.h>
-#include <rte_common.h>
-#include <rte_per_lcore.h>
-#include <rte_eal.h>
-#include <rte_launch.h>
-#include <rte_lcore.h>
-#include <rte_errno.h>
-#include <rte_spinlock.h>
-#include <eal_private.h>
-
-#ifndef        TFD_NONBLOCK
-#include <fcntl.h>
-#define        TFD_NONBLOCK    O_NONBLOCK
-#endif
-
-#define NS_PER_US 1000
-#define US_PER_MS 1000
-#define MS_PER_S 1000
-#ifndef US_PER_S
-#define US_PER_S (US_PER_MS * MS_PER_S)
-#endif
-
-#ifdef CLOCK_MONOTONIC_RAW /* Defined in glibc bits/time.h */
-#define CLOCK_TYPE_ID CLOCK_MONOTONIC_RAW
-#else
-#define CLOCK_TYPE_ID CLOCK_MONOTONIC
-#endif
-
-struct alarm_entry {
-       LIST_ENTRY(alarm_entry) next;
-       struct timeval time;
-       rte_eal_alarm_callback cb_fn;
-       void *cb_arg;
-       volatile uint8_t executing;
-       volatile pthread_t executing_id;
-};
-
-static LIST_HEAD(alarm_list, alarm_entry) alarm_list = LIST_HEAD_INITIALIZER();
-static rte_spinlock_t alarm_list_lk = RTE_SPINLOCK_INITIALIZER;
-
-static struct rte_intr_handle intr_handle = {.fd = -1 };
-static int handler_registered = 0;
-static void eal_alarm_callback(void *arg);
-
-int
-rte_eal_alarm_init(void)
-{
-       intr_handle.type = RTE_INTR_HANDLE_ALARM;
-       /* create a timerfd file descriptor */
-       intr_handle.fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);
-       if (intr_handle.fd == -1)
-               goto error;
-
-       return 0;
-
-error:
-       rte_errno = errno;
-       return -1;
-}
-
-static void
-eal_alarm_callback(void *arg __rte_unused)
-{
-       struct timespec now;
-       struct alarm_entry *ap;
-
-       rte_spinlock_lock(&alarm_list_lk);
-       while ((ap = LIST_FIRST(&alarm_list)) != NULL &&
-                       clock_gettime(CLOCK_TYPE_ID, &now) == 0 &&
-                       (ap->time.tv_sec < now.tv_sec || (ap->time.tv_sec == now.tv_sec &&
-                                               (ap->time.tv_usec * NS_PER_US) <= now.tv_nsec))) {
-               ap->executing = 1;
-               ap->executing_id = pthread_self();
-               rte_spinlock_unlock(&alarm_list_lk);
-
-               ap->cb_fn(ap->cb_arg);
-
-               rte_spinlock_lock(&alarm_list_lk);
-
-               LIST_REMOVE(ap, next);
-               free(ap);
-       }
-
-       if (!LIST_EMPTY(&alarm_list)) {
-               struct itimerspec atime = { .it_interval = { 0, 0 } };
-
-               ap = LIST_FIRST(&alarm_list);
-               atime.it_value.tv_sec = ap->time.tv_sec;
-               atime.it_value.tv_nsec = ap->time.tv_usec * NS_PER_US;
-               /* perform borrow for subtraction if necessary */
-               if (now.tv_nsec > (ap->time.tv_usec * NS_PER_US))
-                       atime.it_value.tv_sec--, atime.it_value.tv_nsec += US_PER_S * NS_PER_US;
-
-               atime.it_value.tv_sec -= now.tv_sec;
-               atime.it_value.tv_nsec -= now.tv_nsec;
-               timerfd_settime(intr_handle.fd, 0, &atime, NULL);
-       }
-       rte_spinlock_unlock(&alarm_list_lk);
-}
-
-int
-rte_eal_alarm_set(uint64_t us, rte_eal_alarm_callback cb_fn, void *cb_arg)
-{
-       struct timespec now;
-       int ret = 0;
-       struct alarm_entry *ap, *new_alarm;
-
-       /* Check parameters, including that us won't cause a uint64_t overflow */
-       if (us < 1 || us > (UINT64_MAX - US_PER_S) || cb_fn == NULL)
-               return -EINVAL;
-
-       new_alarm = calloc(1, sizeof(*new_alarm));
-       if (new_alarm == NULL)
-               return -ENOMEM;
-
-       /* use current time to calculate absolute time of alarm */
-       clock_gettime(CLOCK_TYPE_ID, &now);
-
-       new_alarm->cb_fn = cb_fn;
-       new_alarm->cb_arg = cb_arg;
-       new_alarm->time.tv_usec = ((now.tv_nsec / NS_PER_US) + us) % US_PER_S;
-       new_alarm->time.tv_sec = now.tv_sec + (((now.tv_nsec / NS_PER_US) + us) / US_PER_S);
-
-       rte_spinlock_lock(&alarm_list_lk);
-       if (!handler_registered) {
-               ret |= rte_intr_callback_register(&intr_handle,
-                               eal_alarm_callback, NULL);
-               handler_registered = (ret == 0) ? 1 : 0;
-       }
-
-       if (LIST_EMPTY(&alarm_list))
-               LIST_INSERT_HEAD(&alarm_list, new_alarm, next);
-       else {
-               LIST_FOREACH(ap, &alarm_list, next) {
-                       if (ap->time.tv_sec > new_alarm->time.tv_sec ||
-                                       (ap->time.tv_sec == new_alarm->time.tv_sec &&
-                                                       ap->time.tv_usec > new_alarm->time.tv_usec)){
-                               LIST_INSERT_BEFORE(ap, new_alarm, next);
-                               break;
-                       }
-                       if (LIST_NEXT(ap, next) == NULL) {
-                               LIST_INSERT_AFTER(ap, new_alarm, next);
-                               break;
-                       }
-               }
-       }
-
-       if (LIST_FIRST(&alarm_list) == new_alarm) {
-               struct itimerspec alarm_time = {
-                       .it_interval = {0, 0},
-                       .it_value = {
-                               .tv_sec = us / US_PER_S,
-                               .tv_nsec = (us % US_PER_S) * NS_PER_US,
-                       },
-               };
-               ret |= timerfd_settime(intr_handle.fd, 0, &alarm_time, NULL);
-       }
-       rte_spinlock_unlock(&alarm_list_lk);
-
-       return ret;
-}
-
-int
-rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn, void *cb_arg)
-{
-       struct alarm_entry *ap, *ap_prev;
-       int count = 0;
-       int err = 0;
-       int executing;
-
-       if (!cb_fn) {
-               rte_errno = EINVAL;
-               return -1;
-       }
-
-       do {
-               executing = 0;
-               rte_spinlock_lock(&alarm_list_lk);
-               /* remove any matches at the start of the list */
-               while ((ap = LIST_FIRST(&alarm_list)) != NULL &&
-                               cb_fn == ap->cb_fn &&
-                               (cb_arg == (void *)-1 || cb_arg == ap->cb_arg)) {
-
-                       if (ap->executing == 0) {
-                               LIST_REMOVE(ap, next);
-                               free(ap);
-                               count++;
-                       } else {
-                               /* If called from another context, mark the alarm as executing
-                                * so the loop can spin until it finishes. Otherwise we are
-                                * trying to cancel ourselves - mark it with EINPROGRESS. */
-                               if (pthread_equal(ap->executing_id, pthread_self()) == 0)
-                                       executing++;
-                               else
-                                       err = EINPROGRESS;
-
-                               break;
-                       }
-               }
-               ap_prev = ap;
-
-               /* now go through list, removing entries not at start */
-               LIST_FOREACH(ap, &alarm_list, next) {
-                       /* this won't be true first time through */
-                       if (cb_fn == ap->cb_fn &&
-                                       (cb_arg == (void *)-1 || cb_arg == ap->cb_arg)) {
-
-                               if (ap->executing == 0) {
-                                       LIST_REMOVE(ap, next);
-                                       free(ap);
-                                       count++;
-                                       ap = ap_prev;
-                               } else if (pthread_equal(ap->executing_id, pthread_self()) == 0)
-                                       executing++;
-                               else
-                                       err = EINPROGRESS;
-                       }
-                       ap_prev = ap;
-               }
-               rte_spinlock_unlock(&alarm_list_lk);
-       } while (executing != 0);
-
-       if (count == 0 && err == 0)
-               rte_errno = ENOENT;
-       else if (err)
-               rte_errno = err;
-
-       return count;
-}
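
As context only (not part of this patch), a minimal sketch of how the alarm API implemented above is typically used from an application; the 500 ms period and the callback names are illustrative:

#include <rte_alarm.h>
#include <rte_log.h>

#define TICK_US (500 * 1000)    /* 500 ms, arbitrary for this example */

/* One-shot alarm that re-arms itself each time it fires. */
static void
stats_tick(void *arg)
{
        RTE_LOG(INFO, EAL, "stats tick, arg=%p\n", arg);
        if (rte_eal_alarm_set(TICK_US, stats_tick, arg) != 0)
                RTE_LOG(ERR, EAL, "failed to re-arm stats alarm\n");
}

static void
stats_stop(void *arg)
{
        /* cancels every pending alarm registered with this callback/arg pair */
        rte_eal_alarm_cancel(stats_tick, arg);
}
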
diff --git a/lib/librte_eal/linuxapp/eal/eal_cpuflags.c b/lib/librte_eal/linuxapp/eal/eal_cpuflags.c
deleted file mode 100644 (file)
index d38296e..0000000
+++ /dev/null
@@ -1,84 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright 2018 Red Hat, Inc.
- */
-
-#include <elf.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <string.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-#if defined(__GLIBC__) && defined(__GLIBC_PREREQ)
-#if __GLIBC_PREREQ(2, 16)
-#include <sys/auxv.h>
-#define HAS_AUXV 1
-#endif
-#endif
-
-#include <rte_cpuflags.h>
-
-#ifndef HAS_AUXV
-static unsigned long
-getauxval(unsigned long type __rte_unused)
-{
-       errno = ENOTSUP;
-       return 0;
-}
-#endif
-
-#ifdef RTE_ARCH_64
-typedef Elf64_auxv_t Internal_Elfx_auxv_t;
-#else
-typedef Elf32_auxv_t Internal_Elfx_auxv_t;
-#endif
-
-/**
- * Provides a method for retrieving values from the auxiliary vector and
- * possibly running a string comparison.
- *
- * @return Always returns a result.  When the result is 0, check errno
- * to see if an error occurred during processing.
- */
-static unsigned long
-_rte_cpu_getauxval(unsigned long type, const char *str)
-{
-       unsigned long val;
-
-       errno = 0;
-       val = getauxval(type);
-
-       if (!val && (errno == ENOTSUP || errno == ENOENT)) {
-               int auxv_fd = open("/proc/self/auxv", O_RDONLY);
-               Internal_Elfx_auxv_t auxv;
-
-               if (auxv_fd == -1)
-                       return 0;
-
-               errno = ENOENT;
-               while (read(auxv_fd, &auxv, sizeof(auxv)) == sizeof(auxv)) {
-                       if (auxv.a_type == type) {
-                               errno = 0;
-                               val = auxv.a_un.a_val;
-                               if (str)
-                                       val = strcmp((const char *)val, str);
-                               break;
-                       }
-               }
-               close(auxv_fd);
-       }
-
-       return val;
-}
-
-unsigned long
-rte_cpu_getauxval(unsigned long type)
-{
-       return _rte_cpu_getauxval(type, NULL);
-}
-
-int
-rte_cpu_strcmp_auxval(unsigned long type, const char *str)
-{
-       return _rte_cpu_getauxval(type, str);
-}
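
For illustration only (not part of this patch), a short sketch of how the auxiliary-vector helpers above can be used; the AT_PLATFORM string checked here is an assumption for a 32-bit Arm system:

#include <elf.h>                /* AT_HWCAP, AT_PLATFORM */
#include <rte_cpuflags.h>
#include <rte_log.h>

static void
log_auxv_info(void)
{
        unsigned long hwcap = rte_cpu_getauxval(AT_HWCAP);

        RTE_LOG(DEBUG, EAL, "AT_HWCAP = 0x%lx\n", hwcap);

        /* compare a string-valued auxv entry without fetching it first */
        if (rte_cpu_strcmp_auxval(AT_PLATFORM, "v7l") == 0)
                RTE_LOG(DEBUG, EAL, "running on an armv7l platform\n");
}
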
diff --git a/lib/librte_eal/linuxapp/eal/eal_debug.c b/lib/librte_eal/linuxapp/eal/eal_debug.c
deleted file mode 100644 (file)
index 5d92500..0000000
+++ /dev/null
@@ -1,92 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
- */
-
-#ifdef RTE_BACKTRACE
-#include <execinfo.h>
-#endif
-#include <stdarg.h>
-#include <signal.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdint.h>
-
-#include <rte_log.h>
-#include <rte_debug.h>
-#include <rte_common.h>
-#include <rte_eal.h>
-
-#define BACKTRACE_SIZE 256
-
-/* dump the stack of the calling core */
-void rte_dump_stack(void)
-{
-#ifdef RTE_BACKTRACE
-       void *func[BACKTRACE_SIZE];
-       char **symb = NULL;
-       int size;
-
-       size = backtrace(func, BACKTRACE_SIZE);
-       symb = backtrace_symbols(func, size);
-
-       if (symb == NULL)
-               return;
-
-       while (size > 0) {
-               rte_log(RTE_LOG_ERR, RTE_LOGTYPE_EAL,
-                       "%d: [%s]\n", size, symb[size - 1]);
-               size--;
-       }
-
-       free(symb);
-#endif /* RTE_BACKTRACE */
-}
-
-/* not implemented in this environment */
-void rte_dump_registers(void)
-{
-       return;
-}
-
-/* call abort(), it will generate a coredump if enabled */
-void __rte_panic(const char *funcname, const char *format, ...)
-{
-       va_list ap;
-
-       rte_log(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, "PANIC in %s():\n", funcname);
-       va_start(ap, format);
-       rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap);
-       va_end(ap);
-       rte_dump_stack();
-       rte_dump_registers();
-       abort();
-}
-
-/*
- * Like rte_panic this terminates the application. However, no traceback is
- * provided and no core-dump is generated.
- */
-void
-rte_exit(int exit_code, const char *format, ...)
-{
-       va_list ap;
-
-       if (exit_code != 0)
-               RTE_LOG(CRIT, EAL, "Error - exiting with code: %d\n"
-                               "  Cause: ", exit_code);
-
-       va_start(ap, format);
-       rte_vlog(RTE_LOG_CRIT, RTE_LOGTYPE_EAL, format, ap);
-       va_end(ap);
-
-#ifndef RTE_EAL_ALWAYS_PANIC_ON_ERROR
-       if (rte_eal_cleanup() != 0)
-               RTE_LOG(CRIT, EAL,
-                       "EAL could not release all resources\n");
-       exit(exit_code);
-#else
-       rte_dump_stack();
-       rte_dump_registers();
-       abort();
-#endif
-}
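
For context only (not part of this patch), a minimal sketch of how rte_exit() and rte_panic() are typically used in an application's start-up path; the two-lcore requirement is an arbitrary example:

#include <stdlib.h>
#include <rte_debug.h>
#include <rte_eal.h>
#include <rte_errno.h>
#include <rte_lcore.h>

static void
app_init(int argc, char **argv)
{
        /* rte_exit() releases EAL resources before terminating */
        if (rte_eal_init(argc, argv) < 0)
                rte_exit(EXIT_FAILURE, "EAL init failed: %s\n",
                        rte_strerror(rte_errno));

        /* rte_panic() additionally dumps the stack, then aborts */
        if (rte_lcore_count() < 2)
                rte_panic("need at least two lcores\n");
}
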
diff --git a/lib/librte_eal/linuxapp/eal/eal_dev.c b/lib/librte_eal/linuxapp/eal/eal_dev.c
deleted file mode 100644 (file)
index 2830c86..0000000
+++ /dev/null
@@ -1,396 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2018 Intel Corporation
- */
-
-#include <string.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <signal.h>
-#include <sys/socket.h>
-#include <linux/netlink.h>
-
-#include <rte_string_fns.h>
-#include <rte_log.h>
-#include <rte_compat.h>
-#include <rte_dev.h>
-#include <rte_malloc.h>
-#include <rte_interrupts.h>
-#include <rte_alarm.h>
-#include <rte_bus.h>
-#include <rte_eal.h>
-#include <rte_spinlock.h>
-#include <rte_errno.h>
-
-#include "eal_private.h"
-
-static struct rte_intr_handle intr_handle = {.fd = -1 };
-static bool monitor_started;
-static bool hotplug_handle;
-
-#define EAL_UEV_MSG_LEN 4096
-#define EAL_UEV_MSG_ELEM_LEN 128
-
-/*
- * Spinlock for device hot-unplug failure handling. Any code that accesses the
- * bus or a device during failure handling, such as handling SIGBUS on the bus
- * or handling a memory failure on a device, must take this lock. It protects
- * the bus and the device against race conditions.
- */
-static rte_spinlock_t failure_handle_lock = RTE_SPINLOCK_INITIALIZER;
-
-static struct sigaction sigbus_action_old;
-
-static int sigbus_need_recover;
-
-static void dev_uev_handler(__rte_unused void *param);
-
-/* identify the system layer which reports this event. */
-enum eal_dev_event_subsystem {
-       EAL_DEV_EVENT_SUBSYSTEM_PCI, /* PCI bus device event */
-       EAL_DEV_EVENT_SUBSYSTEM_UIO, /* UIO driver device event */
-       EAL_DEV_EVENT_SUBSYSTEM_VFIO, /* VFIO driver device event */
-       EAL_DEV_EVENT_SUBSYSTEM_MAX
-};
-
-static void
-sigbus_action_recover(void)
-{
-       if (sigbus_need_recover) {
-               sigaction(SIGBUS, &sigbus_action_old, NULL);
-               sigbus_need_recover = 0;
-       }
-}
-
-static void sigbus_handler(int signum, siginfo_t *info,
-                               void *ctx __rte_unused)
-{
-       int ret;
-
-       RTE_LOG(DEBUG, EAL, "Thread[%d] caught SIGBUS, fault address: %p\n",
-               (int)pthread_self(), info->si_addr);
-
-       rte_spinlock_lock(&failure_handle_lock);
-       ret = rte_bus_sigbus_handler(info->si_addr);
-       rte_spinlock_unlock(&failure_handle_lock);
-       if (ret == -1) {
-               rte_exit(EXIT_FAILURE,
-                        "Failed to handle SIGBUS for hot-unplug, "
-                        "(rte_errno: %s)!", strerror(rte_errno));
-       } else if (ret == 1) {
-               if ((sigbus_action_old.sa_flags & SA_SIGINFO) &&
-                   sigbus_action_old.sa_sigaction) {
-                       (*(sigbus_action_old.sa_sigaction))(signum,
-                                                           info, ctx);
-               } else if (!(sigbus_action_old.sa_flags & SA_SIGINFO) &&
-                          sigbus_action_old.sa_handler) {
-                       (*(sigbus_action_old.sa_handler))(signum);
-               } else {
-                       rte_exit(EXIT_FAILURE,
-                                "Failed to handle generic SIGBUS!");
-               }
-       }
-
-       RTE_LOG(DEBUG, EAL, "Successfully handled SIGBUS for hot-unplug!\n");
-}
-
-static int cmp_dev_name(const struct rte_device *dev,
-       const void *_name)
-{
-       const char *name = _name;
-
-       return strcmp(dev->name, name);
-}
-
-static int
-dev_uev_socket_fd_create(void)
-{
-       struct sockaddr_nl addr;
-       int ret;
-
-       intr_handle.fd = socket(PF_NETLINK, SOCK_RAW | SOCK_CLOEXEC |
-                       SOCK_NONBLOCK,
-                       NETLINK_KOBJECT_UEVENT);
-       if (intr_handle.fd < 0) {
-               RTE_LOG(ERR, EAL, "Failed to create uevent fd.\n");
-               return -1;
-       }
-
-       memset(&addr, 0, sizeof(addr));
-       addr.nl_family = AF_NETLINK;
-       addr.nl_pid = 0;
-       addr.nl_groups = 0xffffffff;
-
-       ret = bind(intr_handle.fd, (struct sockaddr *) &addr, sizeof(addr));
-       if (ret < 0) {
-               RTE_LOG(ERR, EAL, "Failed to bind uevent socket.\n");
-               goto err;
-       }
-
-       return 0;
-err:
-       close(intr_handle.fd);
-       intr_handle.fd = -1;
-       return ret;
-}
-
-static int
-dev_uev_parse(const char *buf, struct rte_dev_event *event, int length)
-{
-       char action[EAL_UEV_MSG_ELEM_LEN];
-       char subsystem[EAL_UEV_MSG_ELEM_LEN];
-       char pci_slot_name[EAL_UEV_MSG_ELEM_LEN];
-       int i = 0;
-
-       memset(action, 0, EAL_UEV_MSG_ELEM_LEN);
-       memset(subsystem, 0, EAL_UEV_MSG_ELEM_LEN);
-       memset(pci_slot_name, 0, EAL_UEV_MSG_ELEM_LEN);
-
-       while (i < length) {
-               for (; i < length; i++) {
-                       if (*buf)
-                               break;
-                       buf++;
-               }
-               /*
-                * Check device uevents coming from the kernel side; uevents
-                * from udev do not need to be checked.
-                */
-               if (!strncmp(buf, "libudev", 7)) {
-                       buf += 7;
-                       i += 7;
-                       return -1;
-               }
-               if (!strncmp(buf, "ACTION=", 7)) {
-                       buf += 7;
-                       i += 7;
-                       strlcpy(action, buf, sizeof(action));
-               } else if (!strncmp(buf, "SUBSYSTEM=", 10)) {
-                       buf += 10;
-                       i += 10;
-                       strlcpy(subsystem, buf, sizeof(subsystem));
-               } else if (!strncmp(buf, "PCI_SLOT_NAME=", 14)) {
-                       buf += 14;
-                       i += 14;
-                       strlcpy(pci_slot_name, buf, sizeof(pci_slot_name));
-                       event->devname = strdup(pci_slot_name);
-               }
-               for (; i < length; i++) {
-                       if (*buf == '\0')
-                               break;
-                       buf++;
-               }
-       }
-
-       /* parse the subsystem layer */
-       if (!strncmp(subsystem, "uio", 3))
-               event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_UIO;
-       else if (!strncmp(subsystem, "pci", 3))
-               event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_PCI;
-       else if (!strncmp(subsystem, "vfio", 4))
-               event->subsystem = EAL_DEV_EVENT_SUBSYSTEM_VFIO;
-       else
-               return -1;
-
-       /* parse the action type */
-       if (!strncmp(action, "add", 3))
-               event->type = RTE_DEV_EVENT_ADD;
-       else if (!strncmp(action, "remove", 6))
-               event->type = RTE_DEV_EVENT_REMOVE;
-       else
-               return -1;
-       return 0;
-}
-
-static void
-dev_delayed_unregister(void *param)
-{
-       rte_intr_callback_unregister(&intr_handle, dev_uev_handler, param);
-       close(intr_handle.fd);
-       intr_handle.fd = -1;
-}
-
-static void
-dev_uev_handler(__rte_unused void *param)
-{
-       struct rte_dev_event uevent;
-       int ret;
-       char buf[EAL_UEV_MSG_LEN];
-       struct rte_bus *bus;
-       struct rte_device *dev;
-       const char *busname = "";
-
-       memset(&uevent, 0, sizeof(struct rte_dev_event));
-       memset(buf, 0, EAL_UEV_MSG_LEN);
-
-       ret = recv(intr_handle.fd, buf, EAL_UEV_MSG_LEN, MSG_DONTWAIT);
-       if (ret < 0 && errno == EAGAIN)
-               return;
-       else if (ret <= 0) {
-               /* connection is closed or broken and cannot be recovered */
-               RTE_LOG(ERR, EAL, "uevent socket connection is broken.\n");
-               rte_eal_alarm_set(1, dev_delayed_unregister, NULL);
-               return;
-       }
-
-       ret = dev_uev_parse(buf, &uevent, EAL_UEV_MSG_LEN);
-       if (ret < 0) {
-               RTE_LOG(DEBUG, EAL, "Not a valid event "
-                       "that needs to be handled.\n");
-               return;
-       }
-
-       RTE_LOG(DEBUG, EAL, "receive uevent(name:%s, type:%d, subsystem:%d)\n",
-               uevent.devname, uevent.type, uevent.subsystem);
-
-       switch (uevent.subsystem) {
-       case EAL_DEV_EVENT_SUBSYSTEM_PCI:
-       case EAL_DEV_EVENT_SUBSYSTEM_UIO:
-               busname = "pci";
-               break;
-       default:
-               break;
-       }
-
-       if (uevent.devname) {
-               if (uevent.type == RTE_DEV_EVENT_REMOVE && hotplug_handle) {
-                       rte_spinlock_lock(&failure_handle_lock);
-                       bus = rte_bus_find_by_name(busname);
-                       if (bus == NULL) {
-                               RTE_LOG(ERR, EAL, "Cannot find bus (%s)\n",
-                                       busname);
-                               goto failure_handle_err;
-                       }
-
-                       dev = bus->find_device(NULL, cmp_dev_name,
-                                              uevent.devname);
-                       if (dev == NULL) {
-                               RTE_LOG(ERR, EAL, "Cannot find device (%s) on "
-                                       "bus (%s)\n", uevent.devname, busname);
-                               goto failure_handle_err;
-                       }
-
-                       ret = bus->hot_unplug_handler(dev);
-                       if (ret) {
-                               RTE_LOG(ERR, EAL, "Cannot handle hot-unplug "
-                                       "for device (%s)\n", dev->name);
-                       }
-                       rte_spinlock_unlock(&failure_handle_lock);
-               }
-               rte_dev_event_callback_process(uevent.devname, uevent.type);
-       }
-
-       return;
-
-failure_handle_err:
-       rte_spinlock_unlock(&failure_handle_lock);
-}
-
-int __rte_experimental
-rte_dev_event_monitor_start(void)
-{
-       int ret;
-
-       if (monitor_started)
-               return 0;
-
-       ret = dev_uev_socket_fd_create();
-       if (ret) {
-               RTE_LOG(ERR, EAL, "Failed to create device event fd.\n");
-               return -1;
-       }
-
-       intr_handle.type = RTE_INTR_HANDLE_DEV_EVENT;
-       ret = rte_intr_callback_register(&intr_handle, dev_uev_handler, NULL);
-
-       if (ret) {
-               RTE_LOG(ERR, EAL, "Failed to register uevent callback.\n");
-               return -1;
-       }
-
-       monitor_started = true;
-
-       return 0;
-}
-
-int __rte_experimental
-rte_dev_event_monitor_stop(void)
-{
-       int ret;
-
-       if (!monitor_started)
-               return 0;
-
-       ret = rte_intr_callback_unregister(&intr_handle, dev_uev_handler,
-                                          (void *)-1);
-       if (ret < 0) {
-               RTE_LOG(ERR, EAL, "Failed to unregister uevent callback.\n");
-               return ret;
-       }
-
-       close(intr_handle.fd);
-       intr_handle.fd = -1;
-       monitor_started = false;
-
-       return 0;
-}
-
-int
-dev_sigbus_handler_register(void)
-{
-       sigset_t mask;
-       struct sigaction action;
-
-       rte_errno = 0;
-
-       if (sigbus_need_recover)
-               return 0;
-
-       sigemptyset(&mask);
-       sigaddset(&mask, SIGBUS);
-       action.sa_flags = SA_SIGINFO;
-       action.sa_mask = mask;
-       action.sa_sigaction = sigbus_handler;
-       sigbus_need_recover = !sigaction(SIGBUS, &action, &sigbus_action_old);
-
-       return rte_errno;
-}
-
-int
-dev_sigbus_handler_unregister(void)
-{
-       rte_errno = 0;
-
-       sigbus_action_recover();
-
-       return rte_errno;
-}
-
-int __rte_experimental
-rte_dev_hotplug_handle_enable(void)
-{
-       int ret = 0;
-
-       ret = dev_sigbus_handler_register();
-       if (ret < 0)
-               RTE_LOG(ERR, EAL,
-                       "Failed to register SIGBUS handler for devices.\n");
-
-       hotplug_handle = true;
-
-       return ret;
-}
-
-int __rte_experimental
-rte_dev_hotplug_handle_disable(void)
-{
-       int ret = 0;
-
-       ret = dev_sigbus_handler_unregister();
-       if (ret < 0)
-               RTE_LOG(ERR, EAL,
-                       "Failed to unregister SIGBUS handler for devices.\n");
-
-       hotplug_handle = false;
-
-       return ret;
-}
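
As an illustrative sketch only (not part of this patch), how an application could enable the hotplug handling and uevent monitoring implemented above; these APIs are experimental in this release, so building such code may require ALLOW_EXPERIMENTAL_API, and the callback body is an assumption:

#include <rte_common.h>
#include <rte_dev.h>
#include <rte_log.h>

/* Illustrative callback: log add/remove events for any device. */
static void
on_dev_event(const char *device_name, enum rte_dev_event_type type,
                void *arg __rte_unused)
{
        RTE_LOG(INFO, EAL, "device %s %s\n", device_name,
                type == RTE_DEV_EVENT_ADD ? "added" : "removed");
}

static int
enable_hotplug_monitoring(void)
{
        if (rte_dev_hotplug_handle_enable() < 0)
                return -1;
        if (rte_dev_event_monitor_start() < 0)
                return -1;
        /* a NULL device name registers the callback for all devices */
        return rte_dev_event_callback_register(NULL, on_dev_event, NULL);
}
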
diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
deleted file mode 100644 (file)
index 0eab1cf..0000000
+++ /dev/null
@@ -1,526 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
- */
-
-#include <string.h>
-#include <sys/types.h>
-#include <sys/file.h>
-#include <dirent.h>
-#include <fcntl.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <fnmatch.h>
-#include <inttypes.h>
-#include <stdarg.h>
-#include <unistd.h>
-#include <errno.h>
-#include <sys/mman.h>
-#include <sys/queue.h>
-#include <sys/stat.h>
-
-#include <linux/mman.h> /* for hugetlb-related flags */
-
-#include <rte_memory.h>
-#include <rte_eal.h>
-#include <rte_launch.h>
-#include <rte_per_lcore.h>
-#include <rte_lcore.h>
-#include <rte_debug.h>
-#include <rte_log.h>
-#include <rte_common.h>
-#include "rte_string_fns.h"
-#include "eal_internal_cfg.h"
-#include "eal_hugepages.h"
-#include "eal_filesystem.h"
-
-static const char sys_dir_path[] = "/sys/kernel/mm/hugepages";
-static const char sys_pages_numa_dir_path[] = "/sys/devices/system/node";
-
-/*
- * Uses mmap to create a shared memory area for storage of data
- * Used in this file to store the hugepage file map on disk
- */
-static void *
-map_shared_memory(const char *filename, const size_t mem_size, int flags)
-{
-       void *retval;
-       int fd = open(filename, flags, 0666);
-       if (fd < 0)
-               return NULL;
-       if (ftruncate(fd, mem_size) < 0) {
-               close(fd);
-               return NULL;
-       }
-       retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE,
-                       MAP_SHARED, fd, 0);
-       close(fd);
-       return retval;
-}
-
-static void *
-open_shared_memory(const char *filename, const size_t mem_size)
-{
-       return map_shared_memory(filename, mem_size, O_RDWR);
-}
-
-static void *
-create_shared_memory(const char *filename, const size_t mem_size)
-{
-       return map_shared_memory(filename, mem_size, O_RDWR | O_CREAT);
-}
-
-/* this function is only called from eal_hugepage_info_init which itself
- * is only called from a primary process */
-static uint32_t
-get_num_hugepages(const char *subdir)
-{
-       char path[PATH_MAX];
-       long unsigned resv_pages, num_pages = 0;
-       const char *nr_hp_file = "free_hugepages";
-       const char *nr_rsvd_file = "resv_hugepages";
-
-       /* first, check how many reserved pages kernel reports */
-       snprintf(path, sizeof(path), "%s/%s/%s",
-                       sys_dir_path, subdir, nr_rsvd_file);
-       if (eal_parse_sysfs_value(path, &resv_pages) < 0)
-               return 0;
-
-       snprintf(path, sizeof(path), "%s/%s/%s",
-                       sys_dir_path, subdir, nr_hp_file);
-       if (eal_parse_sysfs_value(path, &num_pages) < 0)
-               return 0;
-
-       if (num_pages == 0)
-               RTE_LOG(WARNING, EAL, "No free hugepages reported in %s\n",
-                               subdir);
-
-       /* adjust num_pages */
-       if (num_pages >= resv_pages)
-               num_pages -= resv_pages;
-       else if (resv_pages)
-               num_pages = 0;
-
-       /* we want to return a uint32_t and more than this looks suspicious
-        * anyway ... */
-       if (num_pages > UINT32_MAX)
-               num_pages = UINT32_MAX;
-
-       return num_pages;
-}
-
-static uint32_t
-get_num_hugepages_on_node(const char *subdir, unsigned int socket)
-{
-       char path[PATH_MAX], socketpath[PATH_MAX];
-       DIR *socketdir;
-       unsigned long num_pages = 0;
-       const char *nr_hp_file = "free_hugepages";
-
-       snprintf(socketpath, sizeof(socketpath), "%s/node%u/hugepages",
-               sys_pages_numa_dir_path, socket);
-
-       socketdir = opendir(socketpath);
-       if (socketdir) {
-               /* Keep calm and carry on */
-               closedir(socketdir);
-       } else {
-               /* Can't find socket dir, so ignore it */
-               return 0;
-       }
-
-       snprintf(path, sizeof(path), "%s/%s/%s",
-                       socketpath, subdir, nr_hp_file);
-       if (eal_parse_sysfs_value(path, &num_pages) < 0)
-               return 0;
-
-       if (num_pages == 0)
-               RTE_LOG(WARNING, EAL, "No free hugepages reported in %s\n",
-                               subdir);
-
-       /*
-        * we want to return a uint32_t and more than this looks suspicious
-        * anyway ...
-        */
-       if (num_pages > UINT32_MAX)
-               num_pages = UINT32_MAX;
-
-       return num_pages;
-}
-
-static uint64_t
-get_default_hp_size(void)
-{
-       const char proc_meminfo[] = "/proc/meminfo";
-       const char str_hugepagesz[] = "Hugepagesize:";
-       unsigned hugepagesz_len = sizeof(str_hugepagesz) - 1;
-       char buffer[256];
-       unsigned long long size = 0;
-
-       FILE *fd = fopen(proc_meminfo, "r");
-       if (fd == NULL)
-               rte_panic("Cannot open %s\n", proc_meminfo);
-       while(fgets(buffer, sizeof(buffer), fd)){
-               if (strncmp(buffer, str_hugepagesz, hugepagesz_len) == 0){
-                       size = rte_str_to_size(&buffer[hugepagesz_len]);
-                       break;
-               }
-       }
-       fclose(fd);
-       if (size == 0)
-               rte_panic("Cannot get default hugepage size from %s\n", proc_meminfo);
-       return size;
-}
-
-static int
-get_hugepage_dir(uint64_t hugepage_sz, char *hugedir, int len)
-{
-       enum proc_mount_fieldnames {
-               DEVICE = 0,
-               MOUNTPT,
-               FSTYPE,
-               OPTIONS,
-               _FIELDNAME_MAX
-       };
-       static uint64_t default_size = 0;
-       const char proc_mounts[] = "/proc/mounts";
-       const char hugetlbfs_str[] = "hugetlbfs";
-       const size_t htlbfs_str_len = sizeof(hugetlbfs_str) - 1;
-       const char pagesize_opt[] = "pagesize=";
-       const size_t pagesize_opt_len = sizeof(pagesize_opt) - 1;
-       const char split_tok = ' ';
-       char *splitstr[_FIELDNAME_MAX];
-       char buf[BUFSIZ];
-       int retval = -1;
-
-       FILE *fd = fopen(proc_mounts, "r");
-       if (fd == NULL)
-               rte_panic("Cannot open %s\n", proc_mounts);
-
-       if (default_size == 0)
-               default_size = get_default_hp_size();
-
-       while (fgets(buf, sizeof(buf), fd)){
-               if (rte_strsplit(buf, sizeof(buf), splitstr, _FIELDNAME_MAX,
-                               split_tok) != _FIELDNAME_MAX) {
-                       RTE_LOG(ERR, EAL, "Error parsing %s\n", proc_mounts);
-                       break; /* return NULL */
-               }
-
-               /* we have a specified --huge-dir option, only examine that dir */
-               if (internal_config.hugepage_dir != NULL &&
-                               strcmp(splitstr[MOUNTPT], internal_config.hugepage_dir) != 0)
-                       continue;
-
-               if (strncmp(splitstr[FSTYPE], hugetlbfs_str, htlbfs_str_len) == 0){
-                       const char *pagesz_str = strstr(splitstr[OPTIONS], pagesize_opt);
-
-                       /* if no explicit page size, the default page size is compared */
-                       if (pagesz_str == NULL){
-                               if (hugepage_sz == default_size){
-                                       strlcpy(hugedir, splitstr[MOUNTPT], len);
-                                       retval = 0;
-                                       break;
-                               }
-                       }
-                       /* there is an explicit page size, so check it */
-                       else {
-                               uint64_t pagesz = rte_str_to_size(&pagesz_str[pagesize_opt_len]);
-                               if (pagesz == hugepage_sz) {
-                                       strlcpy(hugedir, splitstr[MOUNTPT], len);
-                                       retval = 0;
-                                       break;
-                               }
-                       }
-               } /* end if strncmp hugetlbfs */
-       } /* end while fgets */
-
-       fclose(fd);
-       return retval;
-}
-
-/*
- * Clear the hugepage directory of whatever hugepage files
- * there are. Checks if the file is locked (i.e.
- * if it's in use by another DPDK process).
- */
-static int
-clear_hugedir(const char * hugedir)
-{
-       DIR *dir;
-       struct dirent *dirent;
-       int dir_fd, fd, lck_result;
-       const char filter[] = "*map_*"; /* matches hugepage files */
-
-       /* open directory */
-       dir = opendir(hugedir);
-       if (!dir) {
-               RTE_LOG(ERR, EAL, "Unable to open hugepage directory %s\n",
-                               hugedir);
-               goto error;
-       }
-       dir_fd = dirfd(dir);
-
-       dirent = readdir(dir);
-       if (!dirent) {
-               RTE_LOG(ERR, EAL, "Unable to read hugepage directory %s\n",
-                               hugedir);
-               goto error;
-       }
-
-       while(dirent != NULL){
-               /* skip files that don't match the hugepage pattern */
-               if (fnmatch(filter, dirent->d_name, 0) > 0) {
-                       dirent = readdir(dir);
-                       continue;
-               }
-
-               /* try and lock the file */
-               fd = openat(dir_fd, dirent->d_name, O_RDONLY);
-
-               /* skip to next file */
-               if (fd == -1) {
-                       dirent = readdir(dir);
-                       continue;
-               }
-
-               /* non-blocking lock */
-               lck_result = flock(fd, LOCK_EX | LOCK_NB);
-
-               /* if lock succeeds, remove the file */
-               if (lck_result != -1)
-                       unlinkat(dir_fd, dirent->d_name, 0);
-               close (fd);
-               dirent = readdir(dir);
-       }
-
-       closedir(dir);
-       return 0;
-
-error:
-       if (dir)
-               closedir(dir);
-
-       RTE_LOG(ERR, EAL, "Error while clearing hugepage dir: %s\n",
-               strerror(errno));
-
-       return -1;
-}
-
-static int
-compare_hpi(const void *a, const void *b)
-{
-       const struct hugepage_info *hpi_a = a;
-       const struct hugepage_info *hpi_b = b;
-
-       /* sort in descending size order; avoid returning the raw uint64_t
-        * difference, which can truncate to the wrong sign for large sizes */
-       if (hpi_a->hugepage_sz > hpi_b->hugepage_sz)
-               return -1;
-       if (hpi_a->hugepage_sz < hpi_b->hugepage_sz)
-               return 1;
-       return 0;
-}
-
-static void
-calc_num_pages(struct hugepage_info *hpi, struct dirent *dirent)
-{
-       uint64_t total_pages = 0;
-       unsigned int i;
-
-       /*
-        * first, try to put all hugepages into relevant sockets, but
-        * if the first attempt fails, fall back to collecting all pages
-        * in one socket and sorting them later
-        */
-       total_pages = 0;
-       /* we also don't want to do this for legacy init */
-       if (!internal_config.legacy_mem)
-               for (i = 0; i < rte_socket_count(); i++) {
-                       int socket = rte_socket_id_by_idx(i);
-                       unsigned int num_pages =
-                                       get_num_hugepages_on_node(
-                                               dirent->d_name, socket);
-                       hpi->num_pages[socket] = num_pages;
-                       total_pages += num_pages;
-               }
-       /*
-        * we failed to sort memory from the get go, so fall
-        * back to old way
-        */
-       if (total_pages == 0) {
-               hpi->num_pages[0] = get_num_hugepages(dirent->d_name);
-
-#ifndef RTE_ARCH_64
-               /* for 32-bit systems, limit number of hugepages to
-                * 1GB per page size */
-               hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0],
-                               RTE_PGSIZE_1G / hpi->hugepage_sz);
-#endif
-       }
-}
-
-static int
-hugepage_info_init(void)
-{
-       const char dirent_start_text[] = "hugepages-";
-       const size_t dirent_start_len = sizeof(dirent_start_text) - 1;
-       unsigned int i, num_sizes = 0;
-       DIR *dir;
-       struct dirent *dirent;
-
-       dir = opendir(sys_dir_path);
-       if (dir == NULL) {
-               RTE_LOG(ERR, EAL,
-                       "Cannot open directory %s to read system hugepage info\n",
-                       sys_dir_path);
-               return -1;
-       }
-
-       for (dirent = readdir(dir); dirent != NULL; dirent = readdir(dir)) {
-               struct hugepage_info *hpi;
-
-               if (strncmp(dirent->d_name, dirent_start_text,
-                           dirent_start_len) != 0)
-                       continue;
-
-               if (num_sizes >= MAX_HUGEPAGE_SIZES)
-                       break;
-
-               hpi = &internal_config.hugepage_info[num_sizes];
-               hpi->hugepage_sz =
-                       rte_str_to_size(&dirent->d_name[dirent_start_len]);
-
-               /* first, check if we have a mountpoint */
-               if (get_hugepage_dir(hpi->hugepage_sz,
-                       hpi->hugedir, sizeof(hpi->hugedir)) < 0) {
-                       uint32_t num_pages;
-
-                       num_pages = get_num_hugepages(dirent->d_name);
-                       if (num_pages > 0)
-                               RTE_LOG(NOTICE, EAL,
-                                       "%" PRIu32 " hugepages of size "
-                                       "%" PRIu64 " reserved, but no mounted "
-                                       "hugetlbfs found for that size\n",
-                                       num_pages, hpi->hugepage_sz);
-                       /* if we have kernel support for reserving hugepages
-                        * through mmap, and we're in in-memory mode, treat this
-                        * page size as valid. we cannot be in legacy mode at
-                        * this point because we've checked this earlier in the
-                        * init process.
-                        */
-#ifdef MAP_HUGE_SHIFT
-                       if (internal_config.in_memory) {
-                               RTE_LOG(DEBUG, EAL, "In-memory mode enabled, "
-                                       "hugepages of size %" PRIu64 " bytes "
-                                       "will be allocated anonymously\n",
-                                       hpi->hugepage_sz);
-                               calc_num_pages(hpi, dirent);
-                               num_sizes++;
-                       }
-#endif
-                       continue;
-               }
-
-               /* try to obtain an exclusive lock on the hugepage directory */
-               hpi->lock_descriptor = open(hpi->hugedir, O_RDONLY);
-
-               /* if blocking lock failed */
-               if (flock(hpi->lock_descriptor, LOCK_EX) == -1) {
-                       RTE_LOG(CRIT, EAL,
-                               "Failed to lock hugepage directory!\n");
-                       break;
-               }
-               /* clear out the hugepages dir from unused pages */
-               if (clear_hugedir(hpi->hugedir) == -1)
-                       break;
-
-               calc_num_pages(hpi, dirent);
-
-               num_sizes++;
-       }
-       closedir(dir);
-
-       /* something went wrong, and we broke from the for loop above */
-       if (dirent != NULL)
-               return -1;
-
-       internal_config.num_hugepage_sizes = num_sizes;
-
-       /* sort the page directory entries by size, largest to smallest */
-       qsort(&internal_config.hugepage_info[0], num_sizes,
-             sizeof(internal_config.hugepage_info[0]), compare_hpi);
-
-       /* now we have all info, check we have at least one valid size */
-       for (i = 0; i < num_sizes; i++) {
-               /* pages may no longer all be on socket 0, so check all */
-               unsigned int j, num_pages = 0;
-               struct hugepage_info *hpi = &internal_config.hugepage_info[i];
-
-               for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
-                       num_pages += hpi->num_pages[j];
-               if (num_pages > 0)
-                       return 0;
-       }
-
-       /* no valid hugepage mounts available, return error */
-       return -1;
-}
-
-/*
- * when we initialize the hugepage info, everything goes
- * to socket 0 by default. it will later get sorted by memory
- * initialization procedure.
- */
-int
-eal_hugepage_info_init(void)
-{
-       struct hugepage_info *hpi, *tmp_hpi;
-       unsigned int i;
-
-       if (hugepage_info_init() < 0)
-               return -1;
-
-       /* for no shared files mode, we're done */
-       if (internal_config.no_shconf)
-               return 0;
-
-       hpi = &internal_config.hugepage_info[0];
-
-       tmp_hpi = create_shared_memory(eal_hugepage_info_path(),
-                       sizeof(internal_config.hugepage_info));
-       if (tmp_hpi == NULL) {
-               RTE_LOG(ERR, EAL, "Failed to create shared memory!\n");
-               return -1;
-       }
-
-       memcpy(tmp_hpi, hpi, sizeof(internal_config.hugepage_info));
-
-       /* we've copied file descriptors along with everything else, but they
-        * will be invalid in secondary process, so overwrite them
-        */
-       for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) {
-               struct hugepage_info *tmp = &tmp_hpi[i];
-               tmp->lock_descriptor = -1;
-       }
-
-       if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) {
-               RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n");
-               return -1;
-       }
-       return 0;
-}
-
-int eal_hugepage_info_read(void)
-{
-       struct hugepage_info *hpi = &internal_config.hugepage_info[0];
-       struct hugepage_info *tmp_hpi;
-
-       tmp_hpi = open_shared_memory(eal_hugepage_info_path(),
-                                 sizeof(internal_config.hugepage_info));
-       if (tmp_hpi == NULL) {
-               RTE_LOG(ERR, EAL, "Failed to open shared memory!\n");
-               return -1;
-       }
-
-       memcpy(hpi, tmp_hpi, sizeof(internal_config.hugepage_info));
-
-       if (munmap(tmp_hpi, sizeof(internal_config.hugepage_info)) < 0) {
-               RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n");
-               return -1;
-       }
-       return 0;
-}
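
For illustration only (not part of this patch), the hugepage discovery above is normally consumed indirectly through the EAL memory allocator; a minimal sketch, with the buffer name chosen arbitrarily:

#include <stddef.h>
#include <rte_eal.h>
#include <rte_log.h>
#include <rte_malloc.h>

static void *
alloc_dma_buffer(size_t len)
{
        if (!rte_eal_has_hugepages())
                RTE_LOG(WARNING, EAL,
                        "no hugepages available, memory is 4K-page backed\n");

        /* rte_malloc() draws from the heaps populated during EAL init;
         * alignment 0 means the default (cache line) alignment.
         */
        return rte_malloc("dma_buf", len, 0);
}
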
diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
deleted file mode 100644 (file)
index cbac451..0000000
+++ /dev/null
@@ -1,1326 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
- */
-
-#include <stdio.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <pthread.h>
-#include <sys/queue.h>
-#include <stdarg.h>
-#include <unistd.h>
-#include <string.h>
-#include <errno.h>
-#include <inttypes.h>
-#include <sys/epoll.h>
-#include <sys/signalfd.h>
-#include <sys/ioctl.h>
-#include <sys/eventfd.h>
-#include <assert.h>
-#include <stdbool.h>
-
-#include <rte_common.h>
-#include <rte_interrupts.h>
-#include <rte_memory.h>
-#include <rte_launch.h>
-#include <rte_eal.h>
-#include <rte_per_lcore.h>
-#include <rte_lcore.h>
-#include <rte_atomic.h>
-#include <rte_branch_prediction.h>
-#include <rte_debug.h>
-#include <rte_log.h>
-#include <rte_errno.h>
-#include <rte_spinlock.h>
-#include <rte_pause.h>
-#include <rte_vfio.h>
-
-#include "eal_private.h"
-#include "eal_vfio.h"
-#include "eal_thread.h"
-
-#define EAL_INTR_EPOLL_WAIT_FOREVER (-1)
-#define NB_OTHER_INTR               1
-
-static RTE_DEFINE_PER_LCORE(int, _epfd) = -1; /**< epoll fd per thread */
-
-/**
- * union for pipe fds.
- */
-union intr_pipefds{
-       struct {
-               int pipefd[2];
-       };
-       struct {
-               int readfd;
-               int writefd;
-       };
-};
-
-/**
- * union buffer for reading on different devices
- */
-union rte_intr_read_buffer {
-       int uio_intr_count;              /* for uio device */
-#ifdef VFIO_PRESENT
-       uint64_t vfio_intr_count;        /* for vfio device */
-#endif
-       uint64_t timerfd_num;            /* for timerfd */
-       char charbuf[16];                /* for others */
-};
-
-TAILQ_HEAD(rte_intr_cb_list, rte_intr_callback);
-TAILQ_HEAD(rte_intr_source_list, rte_intr_source);
-
-struct rte_intr_callback {
-       TAILQ_ENTRY(rte_intr_callback) next;
-       rte_intr_callback_fn cb_fn;  /**< callback address */
-       void *cb_arg;                /**< parameter for callback */
-};
-
-struct rte_intr_source {
-       TAILQ_ENTRY(rte_intr_source) next;
-       struct rte_intr_handle intr_handle; /**< interrupt handle */
-       struct rte_intr_cb_list callbacks;  /**< user callbacks */
-       uint32_t active;
-};
-
-/* global spinlock for interrupt data operation */
-static rte_spinlock_t intr_lock = RTE_SPINLOCK_INITIALIZER;
-
-/* union buffer for pipe read/write */
-static union intr_pipefds intr_pipe;
-
-/* interrupt sources list */
-static struct rte_intr_source_list intr_sources;
-
-/* interrupt handling thread */
-static pthread_t intr_thread;
-
-/* VFIO interrupts */
-#ifdef VFIO_PRESENT
-
-#define IRQ_SET_BUF_LEN  (sizeof(struct vfio_irq_set) + sizeof(int))
-/* irq set buffer length for queue interrupts and LSC interrupt */
-#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
-                             sizeof(int) * (RTE_MAX_RXTX_INTR_VEC_ID + 1))
-
-/* enable legacy (INTx) interrupts */
-static int
-vfio_enable_intx(const struct rte_intr_handle *intr_handle) {
-       struct vfio_irq_set *irq_set;
-       char irq_set_buf[IRQ_SET_BUF_LEN];
-       int len, ret;
-       int *fd_ptr;
-
-       len = sizeof(irq_set_buf);
-
-       /* enable INTx */
-       irq_set = (struct vfio_irq_set *) irq_set_buf;
-       irq_set->argsz = len;
-       irq_set->count = 1;
-       irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
-       irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
-       irq_set->start = 0;
-       fd_ptr = (int *) &irq_set->data;
-       *fd_ptr = intr_handle->fd;
-
-       ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
-       if (ret) {
-               RTE_LOG(ERR, EAL, "Error enabling INTx interrupts for fd %d\n",
-                                               intr_handle->fd);
-               return -1;
-       }
-
-       /* unmask INTx after enabling */
-       memset(irq_set, 0, len);
-       len = sizeof(struct vfio_irq_set);
-       irq_set->argsz = len;
-       irq_set->count = 1;
-       irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK;
-       irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
-       irq_set->start = 0;
-
-       ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
-       if (ret) {
-               RTE_LOG(ERR, EAL, "Error unmasking INTx interrupts for fd %d\n",
-                                               intr_handle->fd);
-               return -1;
-       }
-       return 0;
-}
-
-/* disable legacy (INTx) interrupts */
-static int
-vfio_disable_intx(const struct rte_intr_handle *intr_handle) {
-       struct vfio_irq_set *irq_set;
-       char irq_set_buf[IRQ_SET_BUF_LEN];
-       int len, ret;
-
-       len = sizeof(struct vfio_irq_set);
-
-       /* mask interrupts before disabling */
-       irq_set = (struct vfio_irq_set *) irq_set_buf;
-       irq_set->argsz = len;
-       irq_set->count = 1;
-       irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK;
-       irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
-       irq_set->start = 0;
-
-       ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
-       if (ret) {
-               RTE_LOG(ERR, EAL, "Error masking INTx interrupts for fd %d\n",
-                                               intr_handle->fd);
-               return -1;
-       }
-
-       /* disable INTx */
-       memset(irq_set, 0, len);
-       irq_set->argsz = len;
-       irq_set->count = 0;
-       irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
-       irq_set->index = VFIO_PCI_INTX_IRQ_INDEX;
-       irq_set->start = 0;
-
-       ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
-       if (ret) {
-               RTE_LOG(ERR, EAL,
-                       "Error disabling INTx interrupts for fd %d\n", intr_handle->fd);
-               return -1;
-       }
-       return 0;
-}
-
-/* enable MSI interrupts */
-static int
-vfio_enable_msi(const struct rte_intr_handle *intr_handle) {
-       int len, ret;
-       char irq_set_buf[IRQ_SET_BUF_LEN];
-       struct vfio_irq_set *irq_set;
-       int *fd_ptr;
-
-       len = sizeof(irq_set_buf);
-
-       irq_set = (struct vfio_irq_set *) irq_set_buf;
-       irq_set->argsz = len;
-       irq_set->count = 1;
-       irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
-       irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
-       irq_set->start = 0;
-       fd_ptr = (int *) &irq_set->data;
-       *fd_ptr = intr_handle->fd;
-
-       ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
-       if (ret) {
-               RTE_LOG(ERR, EAL, "Error enabling MSI interrupts for fd %d\n",
-                                               intr_handle->fd);
-               return -1;
-       }
-       return 0;
-}
-
-/* disable MSI interrupts */
-static int
-vfio_disable_msi(const struct rte_intr_handle *intr_handle) {
-       struct vfio_irq_set *irq_set;
-       char irq_set_buf[IRQ_SET_BUF_LEN];
-       int len, ret;
-
-       len = sizeof(struct vfio_irq_set);
-
-       irq_set = (struct vfio_irq_set *) irq_set_buf;
-       irq_set->argsz = len;
-       irq_set->count = 0;
-       irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
-       irq_set->index = VFIO_PCI_MSI_IRQ_INDEX;
-       irq_set->start = 0;
-
-       ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
-       if (ret)
-               RTE_LOG(ERR, EAL,
-                       "Error disabling MSI interrupts for fd %d\n", intr_handle->fd);
-
-       return ret;
-}
-
-/* enable MSI-X interrupts */
-static int
-vfio_enable_msix(const struct rte_intr_handle *intr_handle) {
-       int len, ret;
-       char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
-       struct vfio_irq_set *irq_set;
-       int *fd_ptr;
-
-       len = sizeof(irq_set_buf);
-
-       irq_set = (struct vfio_irq_set *) irq_set_buf;
-       irq_set->argsz = len;
-       /* 0 < irq_set->count < RTE_MAX_RXTX_INTR_VEC_ID + 1 */
-       irq_set->count = intr_handle->max_intr ?
-               (intr_handle->max_intr > RTE_MAX_RXTX_INTR_VEC_ID + 1 ?
-               RTE_MAX_RXTX_INTR_VEC_ID + 1 : intr_handle->max_intr) : 1;
-       irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
-       irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
-       irq_set->start = 0;
-       fd_ptr = (int *) &irq_set->data;
-       /* INTR vector offset 0 is reserved for the non-efds mapping */
-       fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = intr_handle->fd;
-       memcpy(&fd_ptr[RTE_INTR_VEC_RXTX_OFFSET], intr_handle->efds,
-               sizeof(*intr_handle->efds) * intr_handle->nb_efd);
-
-       ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
-       if (ret) {
-               RTE_LOG(ERR, EAL, "Error enabling MSI-X interrupts for fd %d\n",
-                                               intr_handle->fd);
-               return -1;
-       }
-
-       return 0;
-}
-
-/* disable MSI-X interrupts */
-static int
-vfio_disable_msix(const struct rte_intr_handle *intr_handle) {
-       struct vfio_irq_set *irq_set;
-       char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
-       int len, ret;
-
-       len = sizeof(struct vfio_irq_set);
-
-       irq_set = (struct vfio_irq_set *) irq_set_buf;
-       irq_set->argsz = len;
-       irq_set->count = 0;
-       irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
-       irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
-       irq_set->start = 0;
-
-       ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
-       if (ret)
-               RTE_LOG(ERR, EAL,
-                       "Error disabling MSI-X interrupts for fd %d\n", intr_handle->fd);
-
-       return ret;
-}
-
-#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
-/* enable req notifier */
-static int
-vfio_enable_req(const struct rte_intr_handle *intr_handle)
-{
-       int len, ret;
-       char irq_set_buf[IRQ_SET_BUF_LEN];
-       struct vfio_irq_set *irq_set;
-       int *fd_ptr;
-
-       len = sizeof(irq_set_buf);
-
-       irq_set = (struct vfio_irq_set *) irq_set_buf;
-       irq_set->argsz = len;
-       irq_set->count = 1;
-       irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
-                        VFIO_IRQ_SET_ACTION_TRIGGER;
-       irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
-       irq_set->start = 0;
-       fd_ptr = (int *) &irq_set->data;
-       *fd_ptr = intr_handle->fd;
-
-       ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
-       if (ret) {
-               RTE_LOG(ERR, EAL, "Error enabling req interrupts for fd %d\n",
-                                               intr_handle->fd);
-               return -1;
-       }
-
-       return 0;
-}
-
-/* disable req notifier */
-static int
-vfio_disable_req(const struct rte_intr_handle *intr_handle)
-{
-       struct vfio_irq_set *irq_set;
-       char irq_set_buf[IRQ_SET_BUF_LEN];
-       int len, ret;
-
-       len = sizeof(struct vfio_irq_set);
-
-       irq_set = (struct vfio_irq_set *) irq_set_buf;
-       irq_set->argsz = len;
-       irq_set->count = 0;
-       irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
-       irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
-       irq_set->start = 0;
-
-       ret = ioctl(intr_handle->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
-
-       if (ret)
-               RTE_LOG(ERR, EAL, "Error disabling req interrupts for fd %d\n",
-                       intr_handle->fd);
-
-       return ret;
-}
-#endif
-#endif
-
-static int
-uio_intx_intr_disable(const struct rte_intr_handle *intr_handle)
-{
-       unsigned char command_high;
-
-       /* use UIO config file descriptor for uio_pci_generic */
-       if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
-               RTE_LOG(ERR, EAL,
-                       "Error reading interrupts status for fd %d\n",
-                       intr_handle->uio_cfg_fd);
-               return -1;
-       }
-       /* disable interrupts */
-       command_high |= 0x4;
-       if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
-               RTE_LOG(ERR, EAL,
-                       "Error disabling interrupts for fd %d\n",
-                       intr_handle->uio_cfg_fd);
-               return -1;
-       }
-
-       return 0;
-}
-
-static int
-uio_intx_intr_enable(const struct rte_intr_handle *intr_handle)
-{
-       unsigned char command_high;
-
-       /* use UIO config file descriptor for uio_pci_generic */
-       if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
-               RTE_LOG(ERR, EAL,
-                       "Error reading interrupts status for fd %d\n",
-                       intr_handle->uio_cfg_fd);
-               return -1;
-       }
-       /* enable interrupts */
-       command_high &= ~0x4;
-       if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
-               RTE_LOG(ERR, EAL,
-                       "Error enabling interrupts for fd %d\n",
-                       intr_handle->uio_cfg_fd);
-               return -1;
-       }
-
-       return 0;
-}
-
-static int
-uio_intr_disable(const struct rte_intr_handle *intr_handle)
-{
-       const int value = 0;
-
-       if (write(intr_handle->fd, &value, sizeof(value)) < 0) {
-               RTE_LOG(ERR, EAL,
-                       "Error disabling interrupts for fd %d (%s)\n",
-                       intr_handle->fd, strerror(errno));
-               return -1;
-       }
-       return 0;
-}
-
-static int
-uio_intr_enable(const struct rte_intr_handle *intr_handle)
-{
-       const int value = 1;
-
-       if (write(intr_handle->fd, &value, sizeof(value)) < 0) {
-               RTE_LOG(ERR, EAL,
-                       "Error enabling interrupts for fd %d (%s)\n",
-                       intr_handle->fd, strerror(errno));
-               return -1;
-       }
-       return 0;
-}
-
-int
-rte_intr_callback_register(const struct rte_intr_handle *intr_handle,
-                       rte_intr_callback_fn cb, void *cb_arg)
-{
-       int ret, wake_thread;
-       struct rte_intr_source *src;
-       struct rte_intr_callback *callback;
-
-       wake_thread = 0;
-
-       /* first do parameter checking */
-       if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) {
-               RTE_LOG(ERR, EAL,
-                       "Registering with invalid input parameter\n");
-               return -EINVAL;
-       }
-
-       /* allocate a new interrupt callback entity */
-       callback = calloc(1, sizeof(*callback));
-       if (callback == NULL) {
-               RTE_LOG(ERR, EAL, "Can not allocate memory\n");
-               return -ENOMEM;
-       }
-       callback->cb_fn = cb;
-       callback->cb_arg = cb_arg;
-
-       rte_spinlock_lock(&intr_lock);
-
-       /* check if there is at least one callback registered for the fd */
-       TAILQ_FOREACH(src, &intr_sources, next) {
-               if (src->intr_handle.fd == intr_handle->fd) {
-                       /* we had no interrupts for this */
-                       if (TAILQ_EMPTY(&src->callbacks))
-                               wake_thread = 1;
-
-                       TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
-                       ret = 0;
-                       break;
-               }
-       }
-
-       /* no existing callbacks for this - add new source */
-       if (src == NULL) {
-               src = calloc(1, sizeof(*src));
-               if (src == NULL) {
-                       RTE_LOG(ERR, EAL, "Can not allocate memory\n");
-                       free(callback);
-                       ret = -ENOMEM;
-               } else {
-                       src->intr_handle = *intr_handle;
-                       TAILQ_INIT(&src->callbacks);
-                       TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
-                       TAILQ_INSERT_TAIL(&intr_sources, src, next);
-                       wake_thread = 1;
-                       ret = 0;
-               }
-       }
-
-       rte_spinlock_unlock(&intr_lock);
-
-       /**
-        * check if we need to notify the pipe fd that epoll_wait is
-        * waiting on, so that it rebuilds its wait list.
-        */
-       if (wake_thread)
-               if (write(intr_pipe.writefd, "1", 1) < 0)
-                       return -EPIPE;
-
-       return ret;
-}
-
-int
-rte_intr_callback_unregister(const struct rte_intr_handle *intr_handle,
-                       rte_intr_callback_fn cb_fn, void *cb_arg)
-{
-       int ret;
-       struct rte_intr_source *src;
-       struct rte_intr_callback *cb, *next;
-
-       /* do parameter checking first */
-       if (intr_handle == NULL || intr_handle->fd < 0) {
-               RTE_LOG(ERR, EAL,
-               "Unregistering with invalid input parameter\n");
-               return -EINVAL;
-       }
-
-       rte_spinlock_lock(&intr_lock);
-
-       /* check if an interrupt source exists for this fd */
-       TAILQ_FOREACH(src, &intr_sources, next)
-               if (src->intr_handle.fd == intr_handle->fd)
-                       break;
-
-       /* No interrupt source registered for the fd */
-       if (src == NULL) {
-               ret = -ENOENT;
-
-       /* interrupt source has some active callbacks right now. */
-       } else if (src->active != 0) {
-               ret = -EAGAIN;
-
-       /* ok to remove. */
-       } else {
-               ret = 0;
-
-               /* walk through the callbacks and remove all that match. */
-               for (cb = TAILQ_FIRST(&src->callbacks); cb != NULL; cb = next) {
-
-                       next = TAILQ_NEXT(cb, next);
-
-                       if (cb->cb_fn == cb_fn && (cb_arg == (void *)-1 ||
-                                       cb->cb_arg == cb_arg)) {
-                               TAILQ_REMOVE(&src->callbacks, cb, next);
-                               free(cb);
-                               ret++;
-                       }
-               }
-
-               /* all callbacks for that source are removed. */
-               if (TAILQ_EMPTY(&src->callbacks)) {
-                       TAILQ_REMOVE(&intr_sources, src, next);
-                       free(src);
-               }
-       }
-
-       rte_spinlock_unlock(&intr_lock);
-
-       /* notify the pipe fd waited by epoll_wait to rebuild the wait list */
-       if (ret >= 0 && write(intr_pipe.writefd, "1", 1) < 0) {
-               ret = -EPIPE;
-       }
-
-       return ret;
-}
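
For reference, a minimal sketch of how a driver would use the callback API implemented above; struct my_dev, my_intr_cb and the omitted error handling are illustrative only, not part of this patch:

        #include <rte_interrupts.h>

        struct my_dev {                             /* hypothetical device context */
                struct rte_intr_handle intr_handle; /* filled in by the bus layer */
        };

        static void
        my_intr_cb(void *cb_arg)                    /* matches rte_intr_callback_fn */
        {
                struct my_dev *dev = cb_arg;
                (void)dev;                          /* acknowledge/handle the event here */
        }

        static int
        my_dev_irq_setup(struct my_dev *dev)
        {
                return rte_intr_callback_register(&dev->intr_handle,
                                                  my_intr_cb, dev);
        }

        static void
        my_dev_irq_teardown(struct my_dev *dev)
        {
                /* passing (void *)-1 removes the callback regardless of its cb_arg */
                rte_intr_callback_unregister(&dev->intr_handle,
                                             my_intr_cb, (void *)-1);
        }

Note that unregistration returns -EAGAIN while the source is marked active, i.e. while the interrupt thread is still executing callbacks for it.
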
-
-int
-rte_intr_enable(const struct rte_intr_handle *intr_handle)
-{
-       if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV)
-               return 0;
-
-       if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
-               return -1;
-
-       switch (intr_handle->type){
-       /* write to the uio fd to enable the interrupt */
-       case RTE_INTR_HANDLE_UIO:
-               if (uio_intr_enable(intr_handle))
-                       return -1;
-               break;
-       case RTE_INTR_HANDLE_UIO_INTX:
-               if (uio_intx_intr_enable(intr_handle))
-                       return -1;
-               break;
-       /* not used at this moment */
-       case RTE_INTR_HANDLE_ALARM:
-               return -1;
-#ifdef VFIO_PRESENT
-       case RTE_INTR_HANDLE_VFIO_MSIX:
-               if (vfio_enable_msix(intr_handle))
-                       return -1;
-               break;
-       case RTE_INTR_HANDLE_VFIO_MSI:
-               if (vfio_enable_msi(intr_handle))
-                       return -1;
-               break;
-       case RTE_INTR_HANDLE_VFIO_LEGACY:
-               if (vfio_enable_intx(intr_handle))
-                       return -1;
-               break;
-#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
-       case RTE_INTR_HANDLE_VFIO_REQ:
-               if (vfio_enable_req(intr_handle))
-                       return -1;
-               break;
-#endif
-#endif
-       /* not used at this moment */
-       case RTE_INTR_HANDLE_DEV_EVENT:
-               return -1;
-       /* unknown handle type */
-       default:
-               RTE_LOG(ERR, EAL,
-                       "Unknown handle type of fd %d\n",
-                                       intr_handle->fd);
-               return -1;
-       }
-
-       return 0;
-}
-
-int
-rte_intr_disable(const struct rte_intr_handle *intr_handle)
-{
-       if (intr_handle && intr_handle->type == RTE_INTR_HANDLE_VDEV)
-               return 0;
-
-       if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd < 0)
-               return -1;
-
-       switch (intr_handle->type){
-       /* write to the uio fd to disable the interrupt */
-       case RTE_INTR_HANDLE_UIO:
-               if (uio_intr_disable(intr_handle))
-                       return -1;
-               break;
-       case RTE_INTR_HANDLE_UIO_INTX:
-               if (uio_intx_intr_disable(intr_handle))
-                       return -1;
-               break;
-       /* not used at this moment */
-       case RTE_INTR_HANDLE_ALARM:
-               return -1;
-#ifdef VFIO_PRESENT
-       case RTE_INTR_HANDLE_VFIO_MSIX:
-               if (vfio_disable_msix(intr_handle))
-                       return -1;
-               break;
-       case RTE_INTR_HANDLE_VFIO_MSI:
-               if (vfio_disable_msi(intr_handle))
-                       return -1;
-               break;
-       case RTE_INTR_HANDLE_VFIO_LEGACY:
-               if (vfio_disable_intx(intr_handle))
-                       return -1;
-               break;
-#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
-       case RTE_INTR_HANDLE_VFIO_REQ:
-               if (vfio_disable_req(intr_handle))
-                       return -1;
-               break;
-#endif
-#endif
-       /* not used at this moment */
-       case RTE_INTR_HANDLE_DEV_EVENT:
-               return -1;
-       /* unknown handle type */
-       default:
-               RTE_LOG(ERR, EAL,
-                       "Unknown handle type of fd %d\n",
-                                       intr_handle->fd);
-               return -1;
-       }
-
-       return 0;
-}
-
-static int
-eal_intr_process_interrupts(struct epoll_event *events, int nfds)
-{
-       bool call = false;
-       int n, bytes_read;
-       struct rte_intr_source *src;
-       struct rte_intr_callback *cb, *next;
-       union rte_intr_read_buffer buf;
-       struct rte_intr_callback active_cb;
-
-       for (n = 0; n < nfds; n++) {
-
-               /**
-                * if the pipe fd is ready to read, return so that
-                * the wait list gets rebuilt.
-                */
-               if (events[n].data.fd == intr_pipe.readfd){
-                       int r = read(intr_pipe.readfd, buf.charbuf,
-                                       sizeof(buf.charbuf));
-                       RTE_SET_USED(r);
-                       return -1;
-               }
-               rte_spinlock_lock(&intr_lock);
-               TAILQ_FOREACH(src, &intr_sources, next)
-                       if (src->intr_handle.fd ==
-                                       events[n].data.fd)
-                               break;
-               if (src == NULL){
-                       rte_spinlock_unlock(&intr_lock);
-                       continue;
-               }
-
-               /* mark this interrupt source as active and release the lock. */
-               src->active = 1;
-               rte_spinlock_unlock(&intr_lock);
-
-               /* set the length to be read for the different handle types */
-               switch (src->intr_handle.type) {
-               case RTE_INTR_HANDLE_UIO:
-               case RTE_INTR_HANDLE_UIO_INTX:
-                       bytes_read = sizeof(buf.uio_intr_count);
-                       break;
-               case RTE_INTR_HANDLE_ALARM:
-                       bytes_read = sizeof(buf.timerfd_num);
-                       break;
-#ifdef VFIO_PRESENT
-               case RTE_INTR_HANDLE_VFIO_MSIX:
-               case RTE_INTR_HANDLE_VFIO_MSI:
-               case RTE_INTR_HANDLE_VFIO_LEGACY:
-                       bytes_read = sizeof(buf.vfio_intr_count);
-                       break;
-#ifdef HAVE_VFIO_DEV_REQ_INTERFACE
-               case RTE_INTR_HANDLE_VFIO_REQ:
-                       bytes_read = 0;
-                       call = true;
-                       break;
-#endif
-#endif
-               case RTE_INTR_HANDLE_VDEV:
-               case RTE_INTR_HANDLE_EXT:
-                       bytes_read = 0;
-                       call = true;
-                       break;
-               case RTE_INTR_HANDLE_DEV_EVENT:
-                       bytes_read = 0;
-                       call = true;
-                       break;
-               default:
-                       bytes_read = 1;
-                       break;
-               }
-
-               if (bytes_read > 0) {
-                       /**
-                        * read out to clear the ready-to-be-read flag
-                        * for epoll_wait.
-                        */
-                       bytes_read = read(events[n].data.fd, &buf, bytes_read);
-                       if (bytes_read < 0) {
-                               if (errno == EINTR || errno == EWOULDBLOCK)
-                                       continue;
-
-                               RTE_LOG(ERR, EAL, "Error reading from file "
-                                       "descriptor %d: %s\n",
-                                       events[n].data.fd,
-                                       strerror(errno));
-                               /*
-                                * The device is unplugged or buggy, remove
-                                * it as an interrupt source and return to
-                                * force the wait list to be rebuilt.
-                                */
-                               rte_spinlock_lock(&intr_lock);
-                               TAILQ_REMOVE(&intr_sources, src, next);
-                               rte_spinlock_unlock(&intr_lock);
-
-                               for (cb = TAILQ_FIRST(&src->callbacks); cb;
-                                                       cb = next) {
-                                       next = TAILQ_NEXT(cb, next);
-                                       TAILQ_REMOVE(&src->callbacks, cb, next);
-                                       free(cb);
-                               }
-                               free(src);
-                               return -1;
-                       } else if (bytes_read == 0)
-                               RTE_LOG(ERR, EAL, "Read nothing from file "
-                                       "descriptor %d\n", events[n].data.fd);
-                       else
-                               call = true;
-               }
-
-               /* grab the lock again to call callbacks and update status. */
-               rte_spinlock_lock(&intr_lock);
-
-               if (call) {
-
-                       /* Finally, call all callbacks. */
-                       TAILQ_FOREACH(cb, &src->callbacks, next) {
-
-                               /* make a copy and unlock. */
-                               active_cb = *cb;
-                               rte_spinlock_unlock(&intr_lock);
-
-                               /* call the actual callback */
-                               active_cb.cb_fn(active_cb.cb_arg);
-
-                               /* get the lock back. */
-                               rte_spinlock_lock(&intr_lock);
-                       }
-               }
-
-               /* we are done with this interrupt source, release it. */
-               src->active = 0;
-               rte_spinlock_unlock(&intr_lock);
-       }
-
-       return 0;
-}
-
-/**
- * It handles all the interrupts.
- *
- * @param pfd
- *  epoll file descriptor.
- * @param totalfds
- *  The number of file descriptors added in epoll.
- *
- * @return
- *  void
- */
-static void
-eal_intr_handle_interrupts(int pfd, unsigned totalfds)
-{
-       struct epoll_event events[totalfds];
-       int nfds = 0;
-
-       for(;;) {
-               nfds = epoll_wait(pfd, events, totalfds,
-                       EAL_INTR_EPOLL_WAIT_FOREVER);
-               /* epoll_wait fail */
-               if (nfds < 0) {
-                       if (errno == EINTR)
-                               continue;
-                       RTE_LOG(ERR, EAL,
-                               "epoll_wait returns with fail\n");
-                       return;
-               }
-               /* epoll_wait timeout, which should never happen here */
-               else if (nfds == 0)
-                       continue;
-               /* epoll_wait has at least one fd ready to read */
-               if (eal_intr_process_interrupts(events, nfds) < 0)
-                       return;
-       }
-}
-
-/**
- * It builds/rebuilds the epoll file descriptor with all the
- * file descriptors being waited on, then handles the interrupts.
- *
- * @param arg
- *  pointer. (unused)
- *
- * @return
- *  never returns
- */
-static __attribute__((noreturn)) void *
-eal_intr_thread_main(__rte_unused void *arg)
-{
-       struct epoll_event ev;
-
-       /* host thread, never break out */
-       for (;;) {
-               /* build up the epoll fd with all descriptors we are to
-                * wait on then pass it to the handle_interrupts function
-                */
-               static struct epoll_event pipe_event = {
-                       .events = EPOLLIN | EPOLLPRI,
-               };
-               struct rte_intr_source *src;
-               unsigned numfds = 0;
-
-               /* create epoll fd */
-               int pfd = epoll_create(1);
-               if (pfd < 0)
-                       rte_panic("Cannot create epoll instance\n");
-
-               pipe_event.data.fd = intr_pipe.readfd;
-               /**
-                * add the pipe fd to the wait list; this pipe is used
-                * to trigger a rebuild of the wait list.
-                */
-               if (epoll_ctl(pfd, EPOLL_CTL_ADD, intr_pipe.readfd,
-                                               &pipe_event) < 0) {
-                       rte_panic("Error adding fd to %d epoll_ctl, %s\n",
-                                       intr_pipe.readfd, strerror(errno));
-               }
-               numfds++;
-
-               rte_spinlock_lock(&intr_lock);
-
-               TAILQ_FOREACH(src, &intr_sources, next) {
-                       if (src->callbacks.tqh_first == NULL)
-                               continue; /* skip those with no callbacks */
-                       ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
-                       ev.data.fd = src->intr_handle.fd;
-
-                       /**
-                        * add all the uio device file descriptors
-                        * to the wait list.
-                        */
-                       if (epoll_ctl(pfd, EPOLL_CTL_ADD,
-                                       src->intr_handle.fd, &ev) < 0){
-                               rte_panic("Error adding fd %d epoll_ctl, %s\n",
-                                       src->intr_handle.fd, strerror(errno));
-                       }
-                       else
-                               numfds++;
-               }
-               rte_spinlock_unlock(&intr_lock);
-               /* serve the interrupt */
-               eal_intr_handle_interrupts(pfd, numfds);
-
-               /**
-                * when we return, we need to rebuild the
-                * list of fds to monitor.
-                */
-               close(pfd);
-       }
-}
-
-int
-rte_eal_intr_init(void)
-{
-       int ret = 0;
-
-       /* init the global interrupt source head */
-       TAILQ_INIT(&intr_sources);
-
-       /**
-        * create a pipe that epoll waits on; it is written to whenever
-        * the epoll wait list needs to be rebuilt.
-        */
-       if (pipe(intr_pipe.pipefd) < 0) {
-               rte_errno = errno;
-               return -1;
-       }
-
-       /* create the host thread to wait/handle the interrupt */
-       ret = rte_ctrl_thread_create(&intr_thread, "eal-intr-thread", NULL,
-                       eal_intr_thread_main, NULL);
-       if (ret != 0) {
-               rte_errno = -ret;
-               RTE_LOG(ERR, EAL,
-                       "Failed to create thread for interrupt handling\n");
-       }
-
-       return ret;
-}
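
The pipe created here implements a classic self-pipe wake-up: whenever the set of interrupt sources changes, a byte is written to it so the epoll loop above returns and rebuilds its wait list. A stripped-down, stand-alone sketch of that pattern (plain POSIX, not DPDK code):

        #include <sys/epoll.h>
        #include <unistd.h>

        static int wake_pipe[2];    /* [0] watched by epoll, [1] written to wake */

        static void
        request_rebuild(void)
        {
                (void)write(wake_pipe[1], "1", 1);  /* as the register/unregister paths do */
        }

        static void
        event_loop(void)
        {
                struct epoll_event ev, out;
                int epfd;

                if (pipe(wake_pipe) < 0 || (epfd = epoll_create(1)) < 0)
                        return;

                ev.events = EPOLLIN;
                ev.data.fd = wake_pipe[0];
                epoll_ctl(epfd, EPOLL_CTL_ADD, wake_pipe[0], &ev);

                for (;;) {
                        if (epoll_wait(epfd, &out, 1, -1) <= 0)
                                continue;
                        if (out.data.fd == wake_pipe[0]) {
                                char c;
                                (void)read(wake_pipe[0], &c, 1);
                                /* re-scan the source list and re-add fds here,
                                 * as eal_intr_thread_main() does */
                        }
                }
        }
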
-
-static void
-eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle)
-{
-       union rte_intr_read_buffer buf;
-       int bytes_read = 0;
-       int nbytes;
-
-       switch (intr_handle->type) {
-       case RTE_INTR_HANDLE_UIO:
-       case RTE_INTR_HANDLE_UIO_INTX:
-               bytes_read = sizeof(buf.uio_intr_count);
-               break;
-#ifdef VFIO_PRESENT
-       case RTE_INTR_HANDLE_VFIO_MSIX:
-       case RTE_INTR_HANDLE_VFIO_MSI:
-       case RTE_INTR_HANDLE_VFIO_LEGACY:
-               bytes_read = sizeof(buf.vfio_intr_count);
-               break;
-#endif
-       case RTE_INTR_HANDLE_VDEV:
-               bytes_read = intr_handle->efd_counter_size;
-               /* For vdev, number of bytes to read is set by driver */
-               break;
-       case RTE_INTR_HANDLE_EXT:
-               return;
-       default:
-               bytes_read = 1;
-               RTE_LOG(INFO, EAL, "unexpected intr type\n");
-               break;
-       }
-
-       /**
-        * read out to clear the ready-to-be-read flag
-        * for epoll_wait.
-        */
-       if (bytes_read == 0)
-               return;
-       do {
-               nbytes = read(fd, &buf, bytes_read);
-               if (nbytes < 0) {
-                       if (errno == EINTR || errno == EWOULDBLOCK ||
-                           errno == EAGAIN)
-                               continue;
-                       RTE_LOG(ERR, EAL,
-                               "Error reading from fd %d: %s\n",
-                               fd, strerror(errno));
-               } else if (nbytes == 0)
-                       RTE_LOG(ERR, EAL, "Read nothing from fd %d\n", fd);
-               return;
-       } while (1);
-}
-
-static int
-eal_epoll_process_event(struct epoll_event *evs, unsigned int n,
-                       struct rte_epoll_event *events)
-{
-       unsigned int i, count = 0;
-       struct rte_epoll_event *rev;
-
-       for (i = 0; i < n; i++) {
-               rev = evs[i].data.ptr;
-               if (!rev || !rte_atomic32_cmpset(&rev->status, RTE_EPOLL_VALID,
-                                                RTE_EPOLL_EXEC))
-                       continue;
-
-               events[count].status        = RTE_EPOLL_VALID;
-               events[count].fd            = rev->fd;
-               events[count].epfd          = rev->epfd;
-               events[count].epdata.event  = rev->epdata.event;
-               events[count].epdata.data   = rev->epdata.data;
-               if (rev->epdata.cb_fun)
-                       rev->epdata.cb_fun(rev->fd,
-                                          rev->epdata.cb_arg);
-
-               rte_compiler_barrier();
-               rev->status = RTE_EPOLL_VALID;
-               count++;
-       }
-       return count;
-}
-
-static inline int
-eal_init_tls_epfd(void)
-{
-       int pfd = epoll_create(255);
-
-       if (pfd < 0) {
-               RTE_LOG(ERR, EAL,
-                       "Cannot create epoll instance\n");
-               return -1;
-       }
-       return pfd;
-}
-
-int
-rte_intr_tls_epfd(void)
-{
-       if (RTE_PER_LCORE(_epfd) == -1)
-               RTE_PER_LCORE(_epfd) = eal_init_tls_epfd();
-
-       return RTE_PER_LCORE(_epfd);
-}
-
-int
-rte_epoll_wait(int epfd, struct rte_epoll_event *events,
-              int maxevents, int timeout)
-{
-       struct epoll_event evs[maxevents];
-       int rc;
-
-       if (!events) {
-               RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
-               return -1;
-       }
-
-       /* using per thread epoll fd */
-       if (epfd == RTE_EPOLL_PER_THREAD)
-               epfd = rte_intr_tls_epfd();
-
-       while (1) {
-               rc = epoll_wait(epfd, evs, maxevents, timeout);
-               if (likely(rc > 0)) {
-                       /* epoll_wait has at least one fd ready to read */
-                       rc = eal_epoll_process_event(evs, rc, events);
-                       break;
-               } else if (rc < 0) {
-                       if (errno == EINTR)
-                               continue;
-                       /* epoll_wait fail */
-                       RTE_LOG(ERR, EAL, "epoll_wait returns with fail %s\n",
-                               strerror(errno));
-                       rc = -1;
-                       break;
-               } else {
-                       /* rc == 0, epoll_wait timed out */
-                       break;
-               }
-       }
-
-       return rc;
-}
-
-static inline void
-eal_epoll_data_safe_free(struct rte_epoll_event *ev)
-{
-       while (!rte_atomic32_cmpset(&ev->status, RTE_EPOLL_VALID,
-                                   RTE_EPOLL_INVALID))
-               while (ev->status != RTE_EPOLL_VALID)
-                       rte_pause();
-       memset(&ev->epdata, 0, sizeof(ev->epdata));
-       ev->fd = -1;
-       ev->epfd = -1;
-}
-
-int
-rte_epoll_ctl(int epfd, int op, int fd,
-             struct rte_epoll_event *event)
-{
-       struct epoll_event ev;
-
-       if (!event) {
-               RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
-               return -1;
-       }
-
-       /* using per thread epoll fd */
-       if (epfd == RTE_EPOLL_PER_THREAD)
-               epfd = rte_intr_tls_epfd();
-
-       if (op == EPOLL_CTL_ADD) {
-               event->status = RTE_EPOLL_VALID;
-               event->fd = fd;  /* ignore fd in event */
-               event->epfd = epfd;
-               ev.data.ptr = (void *)event;
-       }
-
-       ev.events = event->epdata.event;
-       if (epoll_ctl(epfd, op, fd, &ev) < 0) {
-               RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n",
-                       op, fd, strerror(errno));
-               if (op == EPOLL_CTL_ADD)
-                       /* roll back status when CTL_ADD fails */
-                       event->status = RTE_EPOLL_INVALID;
-               return -1;
-       }
-
-       if (op == EPOLL_CTL_DEL && event->status != RTE_EPOLL_INVALID)
-               eal_epoll_data_safe_free(event);
-
-       return 0;
-}
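
A minimal sketch of how these wrappers are used together from an lcore; my_fd is any pollable descriptor (an eventfd, for instance) and is purely illustrative:

        #include <sys/epoll.h>
        #include <rte_interrupts.h>

        static void
        wait_on_fd(int my_fd)
        {
                /* the event must stay valid while armed: epoll keeps a pointer to it */
                static struct rte_epoll_event ev;
                struct rte_epoll_event out[1];
                int n;

                ev.epdata.event = EPOLLIN;      /* events to wait for */

                /* RTE_EPOLL_PER_THREAD selects this thread's private epoll fd */
                if (rte_epoll_ctl(RTE_EPOLL_PER_THREAD, EPOLL_CTL_ADD, my_fd, &ev) < 0)
                        return;

                n = rte_epoll_wait(RTE_EPOLL_PER_THREAD, out, 1, 1000 /* ms */);
                if (n > 0) {
                        /* out[0].fd == my_fd and is ready to read */
                }
        }
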
-
-int
-rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
-               int op, unsigned int vec, void *data)
-{
-       struct rte_epoll_event *rev;
-       struct rte_epoll_data *epdata;
-       int epfd_op;
-       unsigned int efd_idx;
-       int rc = 0;
-
-       efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ?
-               (vec - RTE_INTR_VEC_RXTX_OFFSET) : vec;
-
-       if (!intr_handle || intr_handle->nb_efd == 0 ||
-           efd_idx >= intr_handle->nb_efd) {
-               RTE_LOG(ERR, EAL, "Wrong intr vector number.\n");
-               return -EPERM;
-       }
-
-       switch (op) {
-       case RTE_INTR_EVENT_ADD:
-               epfd_op = EPOLL_CTL_ADD;
-               rev = &intr_handle->elist[efd_idx];
-               if (rev->status != RTE_EPOLL_INVALID) {
-                       RTE_LOG(INFO, EAL, "Event already been added.\n");
-                       return -EEXIST;
-               }
-
-               /* attach to intr vector fd */
-               epdata = &rev->epdata;
-               epdata->event  = EPOLLIN | EPOLLPRI | EPOLLET;
-               epdata->data   = data;
-               epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr;
-               epdata->cb_arg = (void *)intr_handle;
-               rc = rte_epoll_ctl(epfd, epfd_op,
-                                  intr_handle->efds[efd_idx], rev);
-               if (!rc)
-                       RTE_LOG(DEBUG, EAL,
-                               "efd %d associated with vec %d added on epfd %d"
-                               "\n", rev->fd, vec, epfd);
-               else
-                       rc = -EPERM;
-               break;
-       case RTE_INTR_EVENT_DEL:
-               epfd_op = EPOLL_CTL_DEL;
-               rev = &intr_handle->elist[efd_idx];
-               if (rev->status == RTE_EPOLL_INVALID) {
-                       RTE_LOG(INFO, EAL, "Event does not exist.\n");
-                       return -EPERM;
-               }
-
-               rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev);
-               if (rc)
-                       rc = -EPERM;
-               break;
-       default:
-               RTE_LOG(ERR, EAL, "event op type mismatch\n");
-               rc = -EPERM;
-       }
-
-       return rc;
-}
-
-void
-rte_intr_free_epoll_fd(struct rte_intr_handle *intr_handle)
-{
-       uint32_t i;
-       struct rte_epoll_event *rev;
-
-       for (i = 0; i < intr_handle->nb_efd; i++) {
-               rev = &intr_handle->elist[i];
-               if (rev->status == RTE_EPOLL_INVALID)
-                       continue;
-               if (rte_epoll_ctl(rev->epfd, EPOLL_CTL_DEL, rev->fd, rev)) {
-                       /* force free if the entry is valid */
-                       eal_epoll_data_safe_free(rev);
-                       rev->status = RTE_EPOLL_INVALID;
-               }
-       }
-}
-
-int
-rte_intr_efd_enable(struct rte_intr_handle *intr_handle, uint32_t nb_efd)
-{
-       uint32_t i;
-       int fd;
-       uint32_t n = RTE_MIN(nb_efd, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
-
-       assert(nb_efd != 0);
-
-       if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX) {
-               for (i = 0; i < n; i++) {
-                       fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
-                       if (fd < 0) {
-                               RTE_LOG(ERR, EAL,
-                                       "can't setup eventfd, error %i (%s)\n",
-                                       errno, strerror(errno));
-                               return -errno;
-                       }
-                       intr_handle->efds[i] = fd;
-               }
-               intr_handle->nb_efd   = n;
-               intr_handle->max_intr = NB_OTHER_INTR + n;
-       } else if (intr_handle->type == RTE_INTR_HANDLE_VDEV) {
-               /* only check; initialization is done in the vdev driver. */
-               if (intr_handle->efd_counter_size >
-                   sizeof(union rte_intr_read_buffer)) {
-                       RTE_LOG(ERR, EAL, "the efd_counter_size is oversized");
-                       return -EINVAL;
-               }
-       } else {
-               intr_handle->efds[0]  = intr_handle->fd;
-               intr_handle->nb_efd   = RTE_MIN(nb_efd, 1U);
-               intr_handle->max_intr = NB_OTHER_INTR;
-       }
-
-       return 0;
-}
-
-void
-rte_intr_efd_disable(struct rte_intr_handle *intr_handle)
-{
-       uint32_t i;
-
-       rte_intr_free_epoll_fd(intr_handle);
-       if (intr_handle->max_intr > intr_handle->nb_efd) {
-               for (i = 0; i < intr_handle->nb_efd; i++)
-                       close(intr_handle->efds[i]);
-       }
-       intr_handle->nb_efd = 0;
-       intr_handle->max_intr = 0;
-}
-
-int
-rte_intr_dp_is_en(struct rte_intr_handle *intr_handle)
-{
-       return !(!intr_handle->nb_efd);
-}
-
-int
-rte_intr_allow_others(struct rte_intr_handle *intr_handle)
-{
-       if (!rte_intr_dp_is_en(intr_handle))
-               return 1;
-       else
-               return !!(intr_handle->max_intr - intr_handle->nb_efd);
-}
-
-int
-rte_intr_cap_multiple(struct rte_intr_handle *intr_handle)
-{
-       if (intr_handle->type == RTE_INTR_HANDLE_VFIO_MSIX)
-               return 1;
-
-       if (intr_handle->type == RTE_INTR_HANDLE_VDEV)
-               return 1;
-
-       return 0;
-}
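
Taken together, the data-path half of this file supports Rx-queue interrupts roughly as in the sketch below, assuming a VFIO MSI-X device whose intr_handle has already been set up by the bus driver; nb_rxq and arm_rx_interrupts are illustrative names:

        #include <stdint.h>
        #include <rte_interrupts.h>

        static int
        arm_rx_interrupts(struct rte_intr_handle *ih, uint16_t nb_rxq)
        {
                uint16_t q;

                /* one eventfd per Rx queue (the VFIO MSI-X case above) */
                if (rte_intr_efd_enable(ih, nb_rxq) != 0)
                        return -1;

                /* vector 0 is reserved for non-Rx interrupts, so Rx vectors
                 * start at RTE_INTR_VEC_RXTX_OFFSET */
                for (q = 0; q < nb_rxq; q++) {
                        if (rte_intr_rx_ctl(ih, RTE_EPOLL_PER_THREAD,
                                        RTE_INTR_EVENT_ADD,
                                        RTE_INTR_VEC_RXTX_OFFSET + q,
                                        NULL) != 0)
                                return -1;
                }
                return 0;
        }

Ready events are then collected with rte_epoll_wait(), as in the earlier sketch.
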
diff --git a/lib/librte_eal/linuxapp/eal/eal_lcore.c b/lib/librte_eal/linuxapp/eal/eal_lcore.c
deleted file mode 100644 (file)
index bc89658..0000000
+++ /dev/null
@@ -1,81 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
- */
-
-#include <unistd.h>
-#include <limits.h>
-#include <string.h>
-#include <dirent.h>
-
-#include <rte_log.h>
-#include <rte_eal.h>
-#include <rte_lcore.h>
-#include <rte_common.h>
-#include <rte_string_fns.h>
-#include <rte_debug.h>
-
-#include "eal_private.h"
-#include "eal_filesystem.h"
-#include "eal_thread.h"
-
-#define SYS_CPU_DIR "/sys/devices/system/cpu/cpu%u"
-#define CORE_ID_FILE "topology/core_id"
-#define NUMA_NODE_PATH "/sys/devices/system/node"
-
-/* Check if a cpu is present by checking that its sysfs information exists */
-int
-eal_cpu_detected(unsigned lcore_id)
-{
-       char path[PATH_MAX];
-       int len = snprintf(path, sizeof(path), SYS_CPU_DIR
-               "/"CORE_ID_FILE, lcore_id);
-       if (len <= 0 || (unsigned)len >= sizeof(path))
-               return 0;
-       if (access(path, F_OK) != 0)
-               return 0;
-
-       return 1;
-}
-
-/*
- * Get CPU socket id (NUMA node) for a logical core.
- *
- * This searches each nodeX directory in /sys for the symlink for the given
- * lcore_id and returns the NUMA node where the lcore is found. If the lcore
- * is not found on any NUMA node, zero is returned.
- */
-unsigned
-eal_cpu_socket_id(unsigned lcore_id)
-{
-       unsigned socket;
-
-       for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) {
-               char path[PATH_MAX];
-
-               snprintf(path, sizeof(path), "%s/node%u/cpu%u", NUMA_NODE_PATH,
-                               socket, lcore_id);
-               if (access(path, F_OK) == 0)
-                       return socket;
-       }
-       return 0;
-}
-
-/* Get the cpu core id value from the /sys/.../cpuX core_id value */
-unsigned
-eal_cpu_core_id(unsigned lcore_id)
-{
-       char path[PATH_MAX];
-       unsigned long id;
-
-       int len = snprintf(path, sizeof(path), SYS_CPU_DIR "/%s", lcore_id, CORE_ID_FILE);
-       if (len <= 0 || (unsigned)len >= sizeof(path))
-               goto err;
-       if (eal_parse_sysfs_value(path, &id) != 0)
-               goto err;
-       return (unsigned)id;
-
-err:
-       RTE_LOG(ERR, EAL, "Error reading core id value from %s "
-                       "for lcore %u - assuming core 0\n", SYS_CPU_DIR, lcore_id);
-       return 0;
-}
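
To make the sysfs lookups above concrete: the core id of lcore 2, for example, is read from /sys/devices/system/cpu/cpu2/topology/core_id. A stand-alone equivalent of what eal_cpu_core_id() does via eal_parse_sysfs_value() (cpu2 is just an example):

        #include <stdio.h>

        int
        main(void)
        {
                unsigned long core_id;
                FILE *f = fopen("/sys/devices/system/cpu/cpu2/topology/core_id", "r");

                if (f == NULL)
                        return 1;
                if (fscanf(f, "%lu", &core_id) != 1) {
                        fclose(f);
                        return 1;
                }
                fclose(f);
                printf("cpu2 core_id = %lu\n", core_id);
                return 0;
        }
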
diff --git a/lib/librte_eal/linuxapp/eal/eal_log.c b/lib/librte_eal/linuxapp/eal/eal_log.c
deleted file mode 100644 (file)
index 9d02ddd..0000000
+++ /dev/null
@@ -1,62 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
- */
-
-#include <string.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <sys/types.h>
-#include <syslog.h>
-#include <sys/queue.h>
-
-#include <rte_memory.h>
-#include <rte_eal.h>
-#include <rte_launch.h>
-#include <rte_per_lcore.h>
-#include <rte_lcore.h>
-#include <rte_spinlock.h>
-#include <rte_log.h>
-
-#include "eal_private.h"
-
-/*
- * default log function
- */
-static ssize_t
-console_log_write(__attribute__((unused)) void *c, const char *buf, size_t size)
-{
-       ssize_t ret;
-
-       /* write on stdout */
-       ret = fwrite(buf, 1, size, stdout);
-       fflush(stdout);
-
-       /* Syslog error levels are from 0 to 7, so subtract 1 to convert */
-       syslog(rte_log_cur_msg_loglevel() - 1, "%.*s", (int)size, buf);
-
-       return ret;
-}
-
-static cookie_io_functions_t console_log_func = {
-       .write = console_log_write,
-};
-
-/*
- * set the log to default function, called during eal init process,
- * once memzones are available.
- */
-int
-rte_eal_log_init(const char *id, int facility)
-{
-       FILE *log_stream;
-
-       log_stream = fopencookie(NULL, "w+", console_log_func);
-       if (log_stream == NULL)
-               return -1;
-
-       openlog(id, LOG_NDELAY | LOG_PID, facility);
-
-       eal_log_set_default(log_stream);
-
-       return 0;
-}
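
The same fopencookie() technique can be used by an application to capture DPDK log output; a minimal sketch that installs a prefixing stream via rte_openlog_stream() (the "[app]" tag and the function names are illustrative):

        #define _GNU_SOURCE
        #include <stdio.h>
        #include <sys/types.h>
        #include <rte_log.h>

        static ssize_t
        tagged_write(void *cookie, const char *buf, size_t size)
        {
                (void)cookie;
                fprintf(stderr, "[app] %.*s", (int)size, buf);
                return size;
        }

        static int
        install_app_log_stream(void)
        {
                cookie_io_functions_t funcs = { .write = tagged_write };
                FILE *f = fopencookie(NULL, "w+", funcs);

                if (f == NULL)
                        return -1;
                return rte_openlog_stream(f);
        }
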
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
deleted file mode 100644 (file)
index b6fb183..0000000
+++ /dev/null
@@ -1,1685 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2017-2018 Intel Corporation
- */
-
-#define _FILE_OFFSET_BITS 64
-#include <errno.h>
-#include <stdarg.h>
-#include <stdbool.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <inttypes.h>
-#include <string.h>
-#include <sys/mman.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/queue.h>
-#include <sys/file.h>
-#include <unistd.h>
-#include <limits.h>
-#include <fcntl.h>
-#include <sys/ioctl.h>
-#include <sys/time.h>
-#include <signal.h>
-#include <setjmp.h>
-#ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */
-#include <linux/memfd.h>
-#define MEMFD_SUPPORTED
-#endif
-#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
-#include <numa.h>
-#include <numaif.h>
-#endif
-#include <linux/falloc.h>
-#include <linux/mman.h> /* for hugetlb-related mmap flags */
-
-#include <rte_common.h>
-#include <rte_log.h>
-#include <rte_eal_memconfig.h>
-#include <rte_eal.h>
-#include <rte_errno.h>
-#include <rte_memory.h>
-#include <rte_spinlock.h>
-
-#include "eal_filesystem.h"
-#include "eal_internal_cfg.h"
-#include "eal_memalloc.h"
-#include "eal_private.h"
-
-const int anonymous_hugepages_supported =
-#ifdef MAP_HUGE_SHIFT
-               1;
-#define RTE_MAP_HUGE_SHIFT MAP_HUGE_SHIFT
-#else
-               0;
-#define RTE_MAP_HUGE_SHIFT 26
-#endif
-
-/*
- * we've already checked memfd support at compile-time, but we also need to
- * check if we can create hugepage files with memfd.
- *
- * also, this is not a constant, because while we may be *compiled* with memfd
- * hugetlbfs support, we might not be *running* on a system that supports memfd
- * and/or memfd with hugetlbfs, so we need to be able to adjust this flag at
- * runtime, and fall back to anonymous memory.
- */
-static int memfd_create_supported =
-#ifdef MFD_HUGETLB
-               1;
-#define RTE_MFD_HUGETLB MFD_HUGETLB
-#else
-               0;
-#define RTE_MFD_HUGETLB 4U
-#endif
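
Whether hugetlb-backed memfds actually work can only be decided at run time. A run-time probe might look like the sketch below; it assumes MEMFD_SUPPORTED, i.e. a libc exposing memfd_create(), reuses the RTE_MFD_HUGETLB flag defined just above, and is illustrative only, not the actual check used by EAL:

        #include <sys/mman.h>
        #include <unistd.h>

        /* returns 1 if the kernel accepts hugetlb-backed memfds, 0 otherwise */
        static int
        probe_memfd_hugetlb(void)
        {
                int fd = memfd_create("probe", RTE_MFD_HUGETLB);

                if (fd < 0)
                        return 0;   /* e.g. EINVAL on kernels without MFD_HUGETLB */
                close(fd);
                return 1;
        }
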
-
-/*
- * not all kernel versions support fallocate on hugetlbfs, so fall back to
- * ftruncate and disallow deallocation if fallocate is not supported.
- */
-static int fallocate_supported = -1; /* unknown */
-
-/*
- * we have two modes - single file segments, and file-per-page mode.
- *
- * for single-file segments, we need some kind of mechanism to keep track of
- * which hugepages can be freed back to the system, and which cannot. we cannot
- * use flock() because it doesn't allow locking parts of a file, and we cannot
- * use fcntl() due to issues with its semantics, so we will have to rely on a
- * bunch of lockfiles for each page. so, we will use 'fds' array to keep track
- * of per-page lockfiles. we will store the actual segment list fd in the
- * 'memseg_list_fd' field.
- *
- * for file-per-page mode, each page will have its own fd, so 'memseg_list_fd'
- * will be invalid (set to -1), and we'll use 'fds' to keep track of page fd's.
- *
- * we cannot know how many pages a system will have in advance, but we do know
- * that they come in lists, and we know lengths of these lists. so, simply store
- * a malloc'd array of fd's indexed by list and segment index.
- *
- * they will be initialized at startup, and filled as we allocate/deallocate
- * segments.
- */
-static struct {
-       int *fds; /**< dynamically allocated array of segment lock fd's */
-       int memseg_list_fd; /**< memseg list fd */
-       int len; /**< total length of the array */
-       int count; /**< entries used in an array */
-} fd_list[RTE_MAX_MEMSEG_LISTS];
-
-/** local copy of a memory map, used to synchronize memory hotplug in MP */
-static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS];
-
-static sigjmp_buf huge_jmpenv;
-
-static void __rte_unused huge_sigbus_handler(int signo __rte_unused)
-{
-       siglongjmp(huge_jmpenv, 1);
-}
-
-/* Put setjmp into a wrapper function to avoid compile errors. Any non-volatile,
- * non-static local variable in the stack frame calling sigsetjmp might be
- * clobbered by a call to longjmp.
- */
-static int __rte_unused huge_wrap_sigsetjmp(void)
-{
-       return sigsetjmp(huge_jmpenv, 1);
-}
-
-static struct sigaction huge_action_old;
-static int huge_need_recover;
-
-static void __rte_unused
-huge_register_sigbus(void)
-{
-       sigset_t mask;
-       struct sigaction action;
-
-       sigemptyset(&mask);
-       sigaddset(&mask, SIGBUS);
-       action.sa_flags = 0;
-       action.sa_mask = mask;
-       action.sa_handler = huge_sigbus_handler;
-
-       huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
-}
-
-static void __rte_unused
-huge_recover_sigbus(void)
-{
-       if (huge_need_recover) {
-               sigaction(SIGBUS, &huge_action_old, NULL);
-               huge_need_recover = 0;
-       }
-}
-
-#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
-static bool
-check_numa(void)
-{
-       bool ret = true;
-       /* Check if kernel supports NUMA. */
-       if (numa_available() != 0) {
-               RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n");
-               ret = false;
-       }
-       return ret;
-}
-
-static void
-prepare_numa(int *oldpolicy, struct bitmask *oldmask, int socket_id)
-{
-       RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
-       if (get_mempolicy(oldpolicy, oldmask->maskp,
-                         oldmask->size + 1, 0, 0) < 0) {
-               RTE_LOG(ERR, EAL,
-                       "Failed to get current mempolicy: %s. "
-                       "Assuming MPOL_DEFAULT.\n", strerror(errno));
-               *oldpolicy = MPOL_DEFAULT;
-       }
-       RTE_LOG(DEBUG, EAL,
-               "Setting policy MPOL_PREFERRED for socket %d\n",
-               socket_id);
-       numa_set_preferred(socket_id);
-}
-
-static void
-restore_numa(int *oldpolicy, struct bitmask *oldmask)
-{
-       RTE_LOG(DEBUG, EAL,
-               "Restoring previous memory policy: %d\n", *oldpolicy);
-       if (*oldpolicy == MPOL_DEFAULT) {
-               numa_set_localalloc();
-       } else if (set_mempolicy(*oldpolicy, oldmask->maskp,
-                                oldmask->size + 1) < 0) {
-               RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n",
-                       strerror(errno));
-               numa_set_localalloc();
-       }
-       numa_free_cpumask(oldmask);
-}
-#endif
-
-/*
- * uses fstat to report the size of a file on disk
- */
-static off_t
-get_file_size(int fd)
-{
-       struct stat st;
-       if (fstat(fd, &st) < 0)
-               return 0;
-       return st.st_size;
-}
-
-static int
-pagesz_flags(uint64_t page_sz)
-{
-       /* as per the mmap() manpage, the page-size flag is the log2 of the
-        * page size shifted by MAP_HUGE_SHIFT
-        */
-       int log2 = rte_log2_u64(page_sz);
-       return log2 << RTE_MAP_HUGE_SHIFT;
-}
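
As a concrete check of this encoding, the following sketch holds; it assumes kernel uapi headers new enough to define MAP_HUGE_2MB and MAP_HUGE_1GB, plus DPDK's RTE_PGSIZE_* constants from rte_memory.h:

        #include <linux/mman.h>
        #include <rte_debug.h>
        #include <rte_memory.h>

        static void
        check_pagesz_flags(void)
        {
                /* 2 MB: log2 = 21, so the flag equals MAP_HUGE_2MB (21 << 26) */
                RTE_VERIFY(pagesz_flags(RTE_PGSIZE_2M) == MAP_HUGE_2MB);
                /* 1 GB: log2 = 30, so the flag equals MAP_HUGE_1GB (30 << 26) */
                RTE_VERIFY(pagesz_flags(RTE_PGSIZE_1G) == MAP_HUGE_1GB);
        }
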
-
-/* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */
-static int lock(int fd, int type)
-{
-       int ret;
-
-       /* flock may be interrupted */
-       do {
-               ret = flock(fd, type | LOCK_NB);
-       } while (ret && errno == EINTR);
-
-       if (ret && errno == EWOULDBLOCK) {
-               /* couldn't lock */
-               return 0;
-       } else if (ret) {
-               RTE_LOG(ERR, EAL, "%s(): error calling flock(): %s\n",
-                       __func__, strerror(errno));
-               return -1;
-       }
-       /* lock was successful */
-       return 1;
-}
-
-static int get_segment_lock_fd(int list_idx, int seg_idx)
-{
-       char path[PATH_MAX] = {0};
-       int fd;
-
-       if (list_idx < 0 || list_idx >= (int)RTE_DIM(fd_list))
-               return -1;
-       if (seg_idx < 0 || seg_idx >= fd_list[list_idx].len)
-               return -1;
-
-       fd = fd_list[list_idx].fds[seg_idx];
-       /* does this lock already exist? */
-       if (fd >= 0)
-               return fd;
-
-       eal_get_hugefile_lock_path(path, sizeof(path),
-                       list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
-
-       fd = open(path, O_CREAT | O_RDWR, 0660);
-       if (fd < 0) {
-               RTE_LOG(ERR, EAL, "%s(): error creating lockfile '%s': %s\n",
-                       __func__, path, strerror(errno));
-               return -1;
-       }
-       /* take out a read lock */
-       if (lock(fd, LOCK_SH) != 1) {
-               RTE_LOG(ERR, EAL, "%s(): failed to take out a readlock on '%s': %s\n",
-                       __func__, path, strerror(errno));
-               close(fd);
-               return -1;
-       }
-       /* store it for future reference */
-       fd_list[list_idx].fds[seg_idx] = fd;
-       fd_list[list_idx].count++;
-       return fd;
-}
-
-static int unlock_segment(int list_idx, int seg_idx)
-{
-       int fd, ret;
-
-       if (list_idx < 0 || list_idx >= (int)RTE_DIM(fd_list))
-               return -1;
-       if (seg_idx < 0 || seg_idx >= fd_list[list_idx].len)
-               return -1;
-
-       fd = fd_list[list_idx].fds[seg_idx];
-
-       /* upgrade lock to exclusive to see if we can remove the lockfile */
-       ret = lock(fd, LOCK_EX);
-       if (ret == 1) {
-               /* we've succeeded in taking exclusive lock, this lockfile may
-                * be removed.
-                */
-               char path[PATH_MAX] = {0};
-               eal_get_hugefile_lock_path(path, sizeof(path),
-                               list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
-               if (unlink(path)) {
-                       RTE_LOG(ERR, EAL, "%s(): error removing lockfile '%s': %s\n",
-                                       __func__, path, strerror(errno));
-               }
-       }
-       /* we don't want to leak the fd, so even if we fail to lock, close fd
-        * and remove it from list anyway.
-        */
-       close(fd);
-       fd_list[list_idx].fds[seg_idx] = -1;
-       fd_list[list_idx].count--;
-
-       if (ret < 0)
-               return -1;
-       return 0;
-}
-
-static int
-get_seg_memfd(struct hugepage_info *hi __rte_unused,
-               unsigned int list_idx __rte_unused,
-               unsigned int seg_idx __rte_unused)
-{
-#ifdef MEMFD_SUPPORTED
-       int fd;
-       char segname[250]; /* as per manpage, limit is 249 bytes plus null */
-
-       int flags = RTE_MFD_HUGETLB | pagesz_flags(hi->hugepage_sz);
-
-       if (internal_config.single_file_segments) {
-               fd = fd_list[list_idx].memseg_list_fd;
-
-               if (fd < 0) {
-                       snprintf(segname, sizeof(segname), "seg_%i", list_idx);
-                       fd = memfd_create(segname, flags);
-                       if (fd < 0) {
-                               RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n",
-                                       __func__, strerror(errno));
-                               return -1;
-                       }
-                       fd_list[list_idx].memseg_list_fd = fd;
-               }
-       } else {
-               fd = fd_list[list_idx].fds[seg_idx];
-
-               if (fd < 0) {
-                       snprintf(segname, sizeof(segname), "seg_%i-%i",
-                                       list_idx, seg_idx);
-                       fd = memfd_create(segname, flags);
-                       if (fd < 0) {
-                               RTE_LOG(DEBUG, EAL, "%s(): memfd create failed: %s\n",
-                                       __func__, strerror(errno));
-                               return -1;
-                       }
-                       fd_list[list_idx].fds[seg_idx] = fd;
-               }
-       }
-       return fd;
-#endif
-       return -1;
-}
-
-static int
-get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
-               unsigned int list_idx, unsigned int seg_idx)
-{
-       int fd;
-
-       /* for in-memory mode, we only make it here when we're sure we support
-        * memfd, and this is a special case.
-        */
-       if (internal_config.in_memory)
-               return get_seg_memfd(hi, list_idx, seg_idx);
-
-       if (internal_config.single_file_segments) {
-               /* create a hugepage file path */
-               eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx);
-
-               fd = fd_list[list_idx].memseg_list_fd;
-
-               if (fd < 0) {
-                       fd = open(path, O_CREAT | O_RDWR, 0600);
-                       if (fd < 0) {
-                               RTE_LOG(ERR, EAL, "%s(): open failed: %s\n",
-                                       __func__, strerror(errno));
-                               return -1;
-                       }
-                       /* take out a read lock and keep it indefinitely */
-                       if (lock(fd, LOCK_SH) < 0) {
-                               RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n",
-                                       __func__, strerror(errno));
-                               close(fd);
-                               return -1;
-                       }
-                       fd_list[list_idx].memseg_list_fd = fd;
-               }
-       } else {
-               /* create a hugepage file path */
-               eal_get_hugefile_path(path, buflen, hi->hugedir,
-                               list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
-
-               fd = fd_list[list_idx].fds[seg_idx];
-
-               if (fd < 0) {
-                       fd = open(path, O_CREAT | O_RDWR, 0600);
-                       if (fd < 0) {
-                               RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n",
-                                       __func__, strerror(errno));
-                               return -1;
-                       }
-                       /* take out a read lock */
-                       if (lock(fd, LOCK_SH) < 0) {
-                               RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n",
-                                       __func__, strerror(errno));
-                               close(fd);
-                               return -1;
-                       }
-                       fd_list[list_idx].fds[seg_idx] = fd;
-               }
-       }
-       return fd;
-}
-
-static int
-resize_hugefile(int fd, char *path, int list_idx, int seg_idx,
-               uint64_t fa_offset, uint64_t page_sz, bool grow)
-{
-       bool again = false;
-
-       /* in-memory mode is a special case, because we don't need to perform
-        * any locking, and we can be sure that fallocate() is supported.
-        */
-       if (internal_config.in_memory) {
-               int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
-                               FALLOC_FL_KEEP_SIZE;
-               int ret;
-
-               /* grow or shrink the file */
-               ret = fallocate(fd, flags, fa_offset, page_sz);
-
-               if (ret < 0) {
-                       RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
-                                       __func__,
-                                       strerror(errno));
-                       return -1;
-               }
-               /* increase/decrease total segment count */
-               fd_list[list_idx].count += (grow ? 1 : -1);
-               if (!grow && fd_list[list_idx].count == 0) {
-                       close(fd_list[list_idx].memseg_list_fd);
-                       fd_list[list_idx].memseg_list_fd = -1;
-               }
-               return 0;
-       }
-
-       do {
-               if (fallocate_supported == 0) {
-                       /* we cannot deallocate memory if fallocate() is not
-                        * supported, and hugepage file is already locked at
-                        * creation, so no further synchronization needed.
-                        */
-
-                       if (!grow) {
-                               RTE_LOG(DEBUG, EAL, "%s(): fallocate not supported, not freeing page back to the system\n",
-                                       __func__);
-                               return -1;
-                       }
-                       uint64_t new_size = fa_offset + page_sz;
-                       uint64_t cur_size = get_file_size(fd);
-
-                       /* fallocate isn't supported, fall back to ftruncate */
-                       if (new_size > cur_size &&
-                                       ftruncate(fd, new_size) < 0) {
-                               RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
-                                       __func__, strerror(errno));
-                               return -1;
-                       }
-               } else {
-                       int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
-                                       FALLOC_FL_KEEP_SIZE;
-                       int ret, lock_fd;
-
-                       /* if fallocate() is supported, we need to take out a
-                        * read lock on allocate (to prevent other processes
-                        * from deallocating this page), and take out a write
-                        * lock on deallocate (to ensure nobody else is using
-                        * this page).
-                        *
-                        * read locks on page itself are already taken out at
-                        * file creation, in get_seg_fd().
-                        *
-                        * we cannot rely on simple use of flock() call, because
-                        * we need to be able to lock a section of the file,
-                        * and we cannot use fcntl() locks, because of numerous
-                        * problems with their semantics, so we will use
-                        * deterministically named lock files for each section
-                        * of the file.
-                        *
-                        * if we're shrinking the file, we want to upgrade our
-                        * lock from shared to exclusive.
-                        *
-                        * lock_fd is an fd for a lockfile, not for the segment
-                        * list.
-                        */
-                       lock_fd = get_segment_lock_fd(list_idx, seg_idx);
-
-                       if (!grow) {
-                               /* we are using this lockfile to determine
-                                * whether this particular page is locked, as we
-                                * are in single file segments mode and thus
-                                * cannot use regular flock() to get this info.
-                                *
-                                * we want to try and take out an exclusive lock
-                                * on the lock file to determine if we're the
-                                * last ones using this page, and if not, we
-                                * won't be shrinking it, and will instead exit
-                                * prematurely.
-                                */
-                               ret = lock(lock_fd, LOCK_EX);
-
-                               /* drop the lock on the lockfile, so that even
-                                * if we couldn't shrink the file ourselves, we
-                                * are signalling to other processes that we're
-                                * no longer using this page.
-                                */
-                               if (unlock_segment(list_idx, seg_idx))
-                                       RTE_LOG(ERR, EAL, "Could not unlock segment\n");
-
-                               /* additionally, if this was the last lock on
-                                * this segment list, we can safely close the
-                                * page file fd, so that one of the processes
-                                * could then delete the file after shrinking.
-                                */
-                               if (ret < 1 && fd_list[list_idx].count == 0) {
-                                       close(fd);
-                                       fd_list[list_idx].memseg_list_fd = -1;
-                               }
-
-                               if (ret < 0) {
-                                       RTE_LOG(ERR, EAL, "Could not lock segment\n");
-                                       return -1;
-                               }
-                               if (ret == 0)
-                                       /* failed to lock, not an error. */
-                                       return 0;
-                       }
-
-                       /* grow or shrink the file */
-                       ret = fallocate(fd, flags, fa_offset, page_sz);
-
-                       if (ret < 0) {
-                               if (fallocate_supported == -1 &&
-                                               errno == ENOTSUP) {
-                                       RTE_LOG(ERR, EAL, "%s(): fallocate() not supported, hugepage deallocation will be disabled\n",
-                                               __func__);
-                                       again = true;
-                                       fallocate_supported = 0;
-                               } else {
-                                       RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
-                                               __func__,
-                                               strerror(errno));
-                                       return -1;
-                               }
-                       } else {
-                               fallocate_supported = 1;
-
-                               /* we've grown/shrunk the file, and we hold an
-                                * exclusive lock now. check if there are no
-                                * more segments active in this segment list,
-                                * and remove the file if there aren't.
-                                */
-                               if (fd_list[list_idx].count == 0) {
-                                       if (unlink(path))
-                                               RTE_LOG(ERR, EAL, "%s(): unlinking '%s' failed: %s\n",
-                                                       __func__, path,
-                                                       strerror(errno));
-                                       close(fd);
-                                       fd_list[list_idx].memseg_list_fd = -1;
-                               }
-                       }
-               }
-       } while (again);
-       return 0;
-}
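
The grow/shrink logic above reduces to a single fallocate() call: growing reserves space in the hugepage file, while shrinking punches a hole without changing the file size. A minimal standalone sketch of that call, with error handling and locking omitted and the offset/page size as illustrative parameters (hole punching on hugetlbfs also requires a reasonably recent kernel, which is an assumption here):

    /* Minimal sketch of the grow/punch-hole pattern used above. */
    #define _GNU_SOURCE
    #include <fcntl.h>      /* fallocate(), FALLOC_FL_* */
    #include <stdio.h>

    static int
    resize_sketch(int fd, off_t offset, off_t page_sz, int grow)
    {
            /* grow: plain fallocate() extends and reserves the range;
             * shrink: punch a hole but keep the overall file size unchanged. */
            int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;

            if (fallocate(fd, flags, offset, page_sz) < 0) {
                    perror("fallocate");
                    return -1;
            }
            return 0;
    }
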
-
-static int
-alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
-               struct hugepage_info *hi, unsigned int list_idx,
-               unsigned int seg_idx)
-{
-#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
-       int cur_socket_id = 0;
-#endif
-       uint64_t map_offset;
-       rte_iova_t iova;
-       void *va;
-       char path[PATH_MAX];
-       int ret = 0;
-       int fd;
-       size_t alloc_sz;
-       int flags;
-       void *new_addr;
-
-       alloc_sz = hi->hugepage_sz;
-
-       /* these are checked at init, but code analyzers don't know that */
-       if (internal_config.in_memory && !anonymous_hugepages_supported) {
-               RTE_LOG(ERR, EAL, "Anonymous hugepages not supported, in-memory mode cannot allocate memory\n");
-               return -1;
-       }
-       if (internal_config.in_memory && !memfd_create_supported &&
-                       internal_config.single_file_segments) {
-               RTE_LOG(ERR, EAL, "Single-file segments are not supported without memfd support\n");
-               return -1;
-       }
-
-       /* in-memory without memfd is a special case */
-       int mmap_flags;
-
-       if (internal_config.in_memory && !memfd_create_supported) {
-               const int in_memory_flags = MAP_HUGETLB | MAP_FIXED |
-                               MAP_PRIVATE | MAP_ANONYMOUS;
-               int pagesz_flag;
-
-               pagesz_flag = pagesz_flags(alloc_sz);
-               fd = -1;
-               mmap_flags = in_memory_flags | pagesz_flag;
-
-               /* single-file segments codepath will never be active
-                * here because in-memory mode is incompatible with the
-                * fallback path, and it's stopped at EAL initialization
-                * stage.
-                */
-               map_offset = 0;
-       } else {
-               /* takes out a read lock on segment or segment list */
-               fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
-               if (fd < 0) {
-                       RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n");
-                       return -1;
-               }
-
-               if (internal_config.single_file_segments) {
-                       map_offset = seg_idx * alloc_sz;
-                       ret = resize_hugefile(fd, path, list_idx, seg_idx,
-                                       map_offset, alloc_sz, true);
-                       if (ret < 0)
-                               goto resized;
-               } else {
-                       map_offset = 0;
-                       if (ftruncate(fd, alloc_sz) < 0) {
-                               RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
-                                       __func__, strerror(errno));
-                               goto resized;
-                       }
-                       if (internal_config.hugepage_unlink &&
-                                       !internal_config.in_memory) {
-                               if (unlink(path)) {
-                                       RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n",
-                                               __func__, strerror(errno));
-                                       goto resized;
-                               }
-                       }
-               }
-               mmap_flags = MAP_SHARED | MAP_POPULATE | MAP_FIXED;
-       }
-
-       /*
-        * map the segment and populate page tables; the kernel fills
-        * this segment with zeros if it's a new page.
-        */
-       va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, mmap_flags, fd,
-                       map_offset);
-
-       if (va == MAP_FAILED) {
-               RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__,
-                       strerror(errno));
-               /* mmap failed, but the previous region might have been
-                * unmapped anyway. try to remap it
-                */
-               goto unmapped;
-       }
-       if (va != addr) {
-               RTE_LOG(DEBUG, EAL, "%s(): wrong mmap() address\n", __func__);
-               munmap(va, alloc_sz);
-               goto resized;
-       }
-
-       /* In Linux, hugetlb limitations such as cgroups are
-        * enforced at fault time rather than at mmap() time, even
-        * with MAP_POPULATE. The kernel sends a SIGBUS signal in
-        * that case. To avoid being killed, save the stack
-        * environment here; if SIGBUS happens, we can jump
-        * back to it.
-        */
-       if (huge_wrap_sigsetjmp()) {
-               RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more hugepages of size %uMB\n",
-                       (unsigned int)(alloc_sz >> 20));
-               goto mapped;
-       }
-
-       /* we need to trigger a write to the page to force a page fault and
-        * ensure that the page is accessible to us, but we can't overwrite a
-        * value that is already there, so read the old value and write it
-        * back. the kernel populates the page with zeroes initially.
-        */
-       *(volatile int *)addr = *(volatile int *)addr;
-
-       iova = rte_mem_virt2iova(addr);
-       if (iova == RTE_BAD_PHYS_ADDR) {
-               RTE_LOG(DEBUG, EAL, "%s(): can't get IOVA addr\n",
-                       __func__);
-               goto mapped;
-       }
-
-#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
-       move_pages(getpid(), 1, &addr, NULL, &cur_socket_id, 0);
-
-       if (cur_socket_id != socket_id) {
-               RTE_LOG(DEBUG, EAL,
-                               "%s(): allocation happened on wrong socket (wanted %d, got %d)\n",
-                       __func__, socket_id, cur_socket_id);
-               goto mapped;
-       }
-#endif
-
-       ms->addr = addr;
-       ms->hugepage_sz = alloc_sz;
-       ms->len = alloc_sz;
-       ms->nchannel = rte_memory_get_nchannel();
-       ms->nrank = rte_memory_get_nrank();
-       ms->iova = iova;
-       ms->socket_id = socket_id;
-
-       return 0;
-
-mapped:
-       munmap(addr, alloc_sz);
-unmapped:
-       flags = MAP_FIXED;
-       new_addr = eal_get_virtual_area(addr, &alloc_sz, alloc_sz, 0, flags);
-       if (new_addr != addr) {
-               if (new_addr != NULL)
-                       munmap(new_addr, alloc_sz);
-               /* we're leaving a hole in our virtual address space. if
-                * somebody else maps this hole now, we could accidentally
-                * overwrite their mapping in the future.
-                */
-               RTE_LOG(CRIT, EAL, "Can't mmap holes in our virtual address space\n");
-       }
-resized:
-       /* some codepaths will return negative fd, so exit early */
-       if (fd < 0)
-               return -1;
-
-       if (internal_config.single_file_segments) {
-               resize_hugefile(fd, path, list_idx, seg_idx, map_offset,
-                               alloc_sz, false);
-               /* ignore failure, can't make it any worse */
-       } else {
-               /* only remove file if we can take out a write lock */
-               if (internal_config.hugepage_unlink == 0 &&
-                               internal_config.in_memory == 0 &&
-                               lock(fd, LOCK_EX) == 1)
-                       unlink(path);
-               close(fd);
-               fd_list[list_idx].fds[seg_idx] = -1;
-       }
-       return -1;
-}
-
-static int
-free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
-               unsigned int list_idx, unsigned int seg_idx)
-{
-       uint64_t map_offset;
-       char path[PATH_MAX];
-       int fd, ret = 0;
-       bool exit_early;
-
-       /* erase page data */
-       memset(ms->addr, 0, ms->len);
-
-       if (mmap(ms->addr, ms->len, PROT_READ,
-                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) ==
-                               MAP_FAILED) {
-               RTE_LOG(DEBUG, EAL, "couldn't unmap page\n");
-               return -1;
-       }
-
-       exit_early = false;
-
-       /* if we're using anonymous hugepages, nothing to be done */
-       if (internal_config.in_memory && !memfd_create_supported)
-               exit_early = true;
-
-       /* if we've already unlinked the page, nothing needs to be done */
-       if (!internal_config.in_memory && internal_config.hugepage_unlink)
-               exit_early = true;
-
-       if (exit_early) {
-               memset(ms, 0, sizeof(*ms));
-               return 0;
-       }
-
-       /* if we are not in single file segments mode, we're going to unmap the
-        * segment and thus drop the lock on original fd, but hugepage dir is
-        * now locked so we can take out another one without races.
-        */
-       fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
-       if (fd < 0)
-               return -1;
-
-       if (internal_config.single_file_segments) {
-               map_offset = seg_idx * ms->len;
-               if (resize_hugefile(fd, path, list_idx, seg_idx, map_offset,
-                               ms->len, false))
-                       return -1;
-               ret = 0;
-       } else {
-               /* if we're able to take out a write lock, we're the last one
-                * holding onto this page.
-                */
-               if (!internal_config.in_memory) {
-                       ret = lock(fd, LOCK_EX);
-                       if (ret >= 0) {
-                               /* no one else is using this page */
-                               if (ret == 1)
-                                       unlink(path);
-                       }
-               }
-               /* closing fd will drop the lock */
-               close(fd);
-               fd_list[list_idx].fds[seg_idx] = -1;
-       }
-
-       memset(ms, 0, sizeof(*ms));
-
-       return ret < 0 ? -1 : 0;
-}
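
The unlink decision in free_seg() hinges on flock() semantics: every active mapper holds a shared lock on the hugepage file, so a non-blocking exclusive lock succeeds only for the last user. A small sketch of that idea in isolation, using plain flock() rather than the file's lock() helper (function and variable names are illustrative):

    /* Sketch: unlink a hugepage file only if we are its last user. Assumes
     * every other mapper of the file holds a LOCK_SH lock on it, as the
     * code above arranges. */
    #include <sys/file.h>   /* flock() */
    #include <errno.h>
    #include <stdio.h>
    #include <unistd.h>

    static int
    unlink_if_last_user(int fd, const char *path)
    {
            if (flock(fd, LOCK_EX | LOCK_NB) == 0) {
                    /* exclusive lock acquired: no other shared locks exist */
                    unlink(path);
            } else if (errno != EWOULDBLOCK) {
                    perror("flock");
                    return -1;
            }
            /* closing the fd releases whatever lock we hold */
            close(fd);
            return 0;
    }
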
-
-struct alloc_walk_param {
-       struct hugepage_info *hi;
-       struct rte_memseg **ms;
-       size_t page_sz;
-       unsigned int segs_allocated;
-       unsigned int n_segs;
-       int socket;
-       bool exact;
-};
-static int
-alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
-{
-       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
-       struct alloc_walk_param *wa = arg;
-       struct rte_memseg_list *cur_msl;
-       size_t page_sz;
-       int cur_idx, start_idx, j, dir_fd = -1;
-       unsigned int msl_idx, need, i;
-
-       if (msl->page_sz != wa->page_sz)
-               return 0;
-       if (msl->socket_id != wa->socket)
-               return 0;
-
-       page_sz = (size_t)msl->page_sz;
-
-       msl_idx = msl - mcfg->memsegs;
-       cur_msl = &mcfg->memsegs[msl_idx];
-
-       need = wa->n_segs;
-
-       /* try finding space in memseg list */
-       cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0, need);
-       if (cur_idx < 0)
-               return 0;
-       start_idx = cur_idx;
-
-       /* do not allow any page allocations during the time we're allocating,
-        * because file creation and locking operations are not atomic,
-        * and we might be the first or the last ones to use a particular page,
-        * so we need to ensure atomicity of every operation.
-        *
-        * during init, we already hold a write lock, so don't try to take out
-        * another one.
-        */
-       if (wa->hi->lock_descriptor == -1 && !internal_config.in_memory) {
-               dir_fd = open(wa->hi->hugedir, O_RDONLY);
-               if (dir_fd < 0) {
-                       RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
-                               __func__, wa->hi->hugedir, strerror(errno));
-                       return -1;
-               }
-               /* blocking writelock */
-               if (flock(dir_fd, LOCK_EX)) {
-                       RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n",
-                               __func__, wa->hi->hugedir, strerror(errno));
-                       close(dir_fd);
-                       return -1;
-               }
-       }
-
-       for (i = 0; i < need; i++, cur_idx++) {
-               struct rte_memseg *cur;
-               void *map_addr;
-
-               cur = rte_fbarray_get(&cur_msl->memseg_arr, cur_idx);
-               map_addr = RTE_PTR_ADD(cur_msl->base_va,
-                               cur_idx * page_sz);
-
-               if (alloc_seg(cur, map_addr, wa->socket, wa->hi,
-                               msl_idx, cur_idx)) {
-                       RTE_LOG(DEBUG, EAL, "attempted to allocate %i segments, but only %i were allocated\n",
-                               need, i);
-
-                       /* if exact number wasn't requested, stop */
-                       if (!wa->exact)
-                               goto out;
-
-                       /* clean up */
-                       for (j = start_idx; j < cur_idx; j++) {
-                               struct rte_memseg *tmp;
-                               struct rte_fbarray *arr =
-                                               &cur_msl->memseg_arr;
-
-                               tmp = rte_fbarray_get(arr, j);
-                               rte_fbarray_set_free(arr, j);
-
-                               /* free_seg may attempt to create a file, which
-                                * may fail.
-                                */
-                               if (free_seg(tmp, wa->hi, msl_idx, j))
-                                       RTE_LOG(DEBUG, EAL, "Cannot free page\n");
-                       }
-                       /* clear the list */
-                       if (wa->ms)
-                               memset(wa->ms, 0, sizeof(*wa->ms) * wa->n_segs);
-
-                       if (dir_fd >= 0)
-                               close(dir_fd);
-                       return -1;
-               }
-               if (wa->ms)
-                       wa->ms[i] = cur;
-
-               rte_fbarray_set_used(&cur_msl->memseg_arr, cur_idx);
-       }
-out:
-       wa->segs_allocated = i;
-       if (i > 0)
-               cur_msl->version++;
-       if (dir_fd >= 0)
-               close(dir_fd);
-       return 1;
-}
-
-struct free_walk_param {
-       struct hugepage_info *hi;
-       struct rte_memseg *ms;
-};
-static int
-free_seg_walk(const struct rte_memseg_list *msl, void *arg)
-{
-       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
-       struct rte_memseg_list *found_msl;
-       struct free_walk_param *wa = arg;
-       uintptr_t start_addr, end_addr;
-       int msl_idx, seg_idx, ret, dir_fd = -1;
-
-       start_addr = (uintptr_t) msl->base_va;
-       end_addr = start_addr + msl->len;
-
-       if ((uintptr_t)wa->ms->addr < start_addr ||
-                       (uintptr_t)wa->ms->addr >= end_addr)
-               return 0;
-
-       msl_idx = msl - mcfg->memsegs;
-       seg_idx = RTE_PTR_DIFF(wa->ms->addr, start_addr) / msl->page_sz;
-
-       /* msl is const */
-       found_msl = &mcfg->memsegs[msl_idx];
-
-       /* do not allow any page allocations during the time we're freeing,
-        * because file creation and locking operations are not atomic,
-        * and we might be the first or the last ones to use a particular page,
-        * so we need to ensure atomicity of every operation.
-        *
-        * during init, we already hold a write lock, so don't try to take out
-        * another one.
-        */
-       if (wa->hi->lock_descriptor == -1 && !internal_config.in_memory) {
-               dir_fd = open(wa->hi->hugedir, O_RDONLY);
-               if (dir_fd < 0) {
-                       RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
-                               __func__, wa->hi->hugedir, strerror(errno));
-                       return -1;
-               }
-               /* blocking writelock */
-               if (flock(dir_fd, LOCK_EX)) {
-                       RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n",
-                               __func__, wa->hi->hugedir, strerror(errno));
-                       close(dir_fd);
-                       return -1;
-               }
-       }
-
-       found_msl->version++;
-
-       rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx);
-
-       ret = free_seg(wa->ms, wa->hi, msl_idx, seg_idx);
-
-       if (dir_fd >= 0)
-               close(dir_fd);
-
-       if (ret < 0)
-               return -1;
-
-       return 1;
-}
-
-int
-eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz,
-               int socket, bool exact)
-{
-       int i, ret = -1;
-#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
-       bool have_numa = false;
-       int oldpolicy;
-       struct bitmask *oldmask;
-#endif
-       struct alloc_walk_param wa;
-       struct hugepage_info *hi = NULL;
-
-       memset(&wa, 0, sizeof(wa));
-
-       /* dynamic allocation not supported in legacy mode */
-       if (internal_config.legacy_mem)
-               return -1;
-
-       for (i = 0; i < (int) RTE_DIM(internal_config.hugepage_info); i++) {
-               if (page_sz ==
-                               internal_config.hugepage_info[i].hugepage_sz) {
-                       hi = &internal_config.hugepage_info[i];
-                       break;
-               }
-       }
-       if (!hi) {
-               RTE_LOG(ERR, EAL, "%s(): can't find relevant hugepage_info entry\n",
-                       __func__);
-               return -1;
-       }
-
-#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
-       if (check_numa()) {
-               oldmask = numa_allocate_nodemask();
-               prepare_numa(&oldpolicy, oldmask, socket);
-               have_numa = true;
-       }
-#endif
-
-       wa.exact = exact;
-       wa.hi = hi;
-       wa.ms = ms;
-       wa.n_segs = n_segs;
-       wa.page_sz = page_sz;
-       wa.socket = socket;
-       wa.segs_allocated = 0;
-
-       /* memalloc is locked, so it's safe to use thread-unsafe version */
-       ret = rte_memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa);
-       if (ret == 0) {
-               RTE_LOG(ERR, EAL, "%s(): couldn't find suitable memseg_list\n",
-                       __func__);
-               ret = -1;
-       } else if (ret > 0) {
-               ret = (int)wa.segs_allocated;
-       }
-
-#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
-       if (have_numa)
-               restore_numa(&oldpolicy, oldmask);
-#endif
-       return ret;
-}
-
-struct rte_memseg *
-eal_memalloc_alloc_seg(size_t page_sz, int socket)
-{
-       struct rte_memseg *ms;
-       if (eal_memalloc_alloc_seg_bulk(&ms, 1, page_sz, socket, true) < 0)
-               return NULL;
-       /* return pointer to newly allocated memseg */
-       return ms;
-}
-
-int
-eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs)
-{
-       int seg, ret = 0;
-
-       /* dynamic free not supported in legacy mode */
-       if (internal_config.legacy_mem)
-               return -1;
-
-       for (seg = 0; seg < n_segs; seg++) {
-               struct rte_memseg *cur = ms[seg];
-               struct hugepage_info *hi = NULL;
-               struct free_walk_param wa;
-               int i, walk_res;
-
-               /* if this page is marked as unfreeable, fail */
-               if (cur->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) {
-                       RTE_LOG(DEBUG, EAL, "Page is not allowed to be freed\n");
-                       ret = -1;
-                       continue;
-               }
-
-               memset(&wa, 0, sizeof(wa));
-
-               for (i = 0; i < (int)RTE_DIM(internal_config.hugepage_info);
-                               i++) {
-                       hi = &internal_config.hugepage_info[i];
-                       if (cur->hugepage_sz == hi->hugepage_sz)
-                               break;
-               }
-               if (i == (int)RTE_DIM(internal_config.hugepage_info)) {
-                       RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
-                       ret = -1;
-                       continue;
-               }
-
-               wa.ms = cur;
-               wa.hi = hi;
-
-               /* memalloc is locked, so it's safe to use thread-unsafe version
-                */
-               walk_res = rte_memseg_list_walk_thread_unsafe(free_seg_walk,
-                               &wa);
-               if (walk_res == 1)
-                       continue;
-               if (walk_res == 0)
-                       RTE_LOG(ERR, EAL, "Couldn't find memseg list\n");
-               ret = -1;
-       }
-       return ret;
-}
-
-int
-eal_memalloc_free_seg(struct rte_memseg *ms)
-{
-       /* dynamic free not supported in legacy mode */
-       if (internal_config.legacy_mem)
-               return -1;
-
-       return eal_memalloc_free_seg_bulk(&ms, 1);
-}
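
For context, the bulk functions above form the internal allocation API that the rest of EAL builds on. A hedged usage sketch, assuming a 2 MB page size and socket 0; applications normally reach this path indirectly through rte_malloc/rte_memzone rather than calling it themselves:

    /* Hypothetical internal caller: grab up to four 2 MB pages on socket 0,
     * then release them. Assumes this runs inside EAL, where eal_memalloc.h
     * is available. */
    #include <stdbool.h>
    #include <rte_memory.h>
    #include <rte_log.h>
    #include "eal_memalloc.h"

    static int
    bulk_alloc_free_sketch(void)
    {
            struct rte_memseg *segs[4];
            int n;

            /* exact=false: accept fewer than four pages if that's all we get */
            n = eal_memalloc_alloc_seg_bulk(segs, 4, RTE_PGSIZE_2M, 0, false);
            if (n <= 0)
                    return -1;

            /* ... use segs[0..n-1] ... */

            if (eal_memalloc_free_seg_bulk(segs, n) < 0) {
                    RTE_LOG(ERR, EAL, "some segments could not be freed\n");
                    return -1;
            }
            return 0;
    }
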
-
-static int
-sync_chunk(struct rte_memseg_list *primary_msl,
-               struct rte_memseg_list *local_msl, struct hugepage_info *hi,
-               unsigned int msl_idx, bool used, int start, int end)
-{
-       struct rte_fbarray *l_arr, *p_arr;
-       int i, ret, chunk_len, diff_len;
-
-       l_arr = &local_msl->memseg_arr;
-       p_arr = &primary_msl->memseg_arr;
-
-       /* we need to aggregate allocations/deallocations into bigger chunks,
-        * as we don't want to spam the user with per-page callbacks.
-        *
-        * to avoid any potential issues, we also want to trigger
-        * deallocation callbacks *before* we actually deallocate
-        * memory, so that the user application could wrap up its use
-        * before it goes away.
-        */
-
-       chunk_len = end - start;
-
-       /* find how many contiguous pages we can map/unmap for this chunk */
-       diff_len = used ?
-                       rte_fbarray_find_contig_free(l_arr, start) :
-                       rte_fbarray_find_contig_used(l_arr, start);
-
-       /* has to be at least one page */
-       if (diff_len < 1)
-               return -1;
-
-       diff_len = RTE_MIN(chunk_len, diff_len);
-
-       /* if we are freeing memory, notify the application */
-       if (!used) {
-               struct rte_memseg *ms;
-               void *start_va;
-               size_t len, page_sz;
-
-               ms = rte_fbarray_get(l_arr, start);
-               start_va = ms->addr;
-               page_sz = (size_t)primary_msl->page_sz;
-               len = page_sz * diff_len;
-
-               eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE,
-                               start_va, len);
-       }
-
-       for (i = 0; i < diff_len; i++) {
-               struct rte_memseg *p_ms, *l_ms;
-               int seg_idx = start + i;
-
-               l_ms = rte_fbarray_get(l_arr, seg_idx);
-               p_ms = rte_fbarray_get(p_arr, seg_idx);
-
-               if (l_ms == NULL || p_ms == NULL)
-                       return -1;
-
-               if (used) {
-                       ret = alloc_seg(l_ms, p_ms->addr,
-                                       p_ms->socket_id, hi,
-                                       msl_idx, seg_idx);
-                       if (ret < 0)
-                               return -1;
-                       rte_fbarray_set_used(l_arr, seg_idx);
-               } else {
-                       ret = free_seg(l_ms, hi, msl_idx, seg_idx);
-                       rte_fbarray_set_free(l_arr, seg_idx);
-                       if (ret < 0)
-                               return -1;
-               }
-       }
-
-       /* if we just allocated memory, notify the application */
-       if (used) {
-               struct rte_memseg *ms;
-               void *start_va;
-               size_t len, page_sz;
-
-               ms = rte_fbarray_get(l_arr, start);
-               start_va = ms->addr;
-               page_sz = (size_t)primary_msl->page_sz;
-               len = page_sz * diff_len;
-
-               eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC,
-                               start_va, len);
-       }
-
-       /* calculate how much we can advance until next chunk */
-       diff_len = used ?
-                       rte_fbarray_find_contig_used(l_arr, start) :
-                       rte_fbarray_find_contig_free(l_arr, start);
-       ret = RTE_MIN(chunk_len, diff_len);
-
-       return ret;
-}
-
-static int
-sync_status(struct rte_memseg_list *primary_msl,
-               struct rte_memseg_list *local_msl, struct hugepage_info *hi,
-               unsigned int msl_idx, bool used)
-{
-       struct rte_fbarray *l_arr, *p_arr;
-       int p_idx, l_chunk_len, p_chunk_len, ret;
-       int start, end;
-
-       /* this is a little bit tricky, but the basic idea is - walk both lists
-        * and spot any places where there are discrepancies. walking both lists
-        * and noting discrepancies in a single go is a hard problem, so we do
-        * it in two passes - first we spot any places where allocated segments
-        * mismatch (i.e. ensure that everything that's allocated in the primary
-        * is also allocated in the secondary), and then we do it by looking at
-        * free segments instead.
-        *
-        * we also need to aggregate changes into chunks, as we have to call
-        * callbacks per allocation, not per page.
-        */
-       l_arr = &local_msl->memseg_arr;
-       p_arr = &primary_msl->memseg_arr;
-
-       if (used)
-               p_idx = rte_fbarray_find_next_used(p_arr, 0);
-       else
-               p_idx = rte_fbarray_find_next_free(p_arr, 0);
-
-       while (p_idx >= 0) {
-               int next_chunk_search_idx;
-
-               if (used) {
-                       p_chunk_len = rte_fbarray_find_contig_used(p_arr,
-                                       p_idx);
-                       l_chunk_len = rte_fbarray_find_contig_used(l_arr,
-                                       p_idx);
-               } else {
-                       p_chunk_len = rte_fbarray_find_contig_free(p_arr,
-                                       p_idx);
-                       l_chunk_len = rte_fbarray_find_contig_free(l_arr,
-                                       p_idx);
-               }
-               /* best case scenario - no differences (or bigger, which will be
-                * fixed during next iteration), look for next chunk
-                */
-               if (l_chunk_len >= p_chunk_len) {
-                       next_chunk_search_idx = p_idx + p_chunk_len;
-                       goto next_chunk;
-               }
-
-               /* if both chunks start at the same point, skip parts we know
-                * are identical, and sync the rest. each call to sync_chunk
-                * will only sync contiguous segments, so we need to call this
-                * until we are sure there are no more differences in this
-                * chunk.
-                */
-               start = p_idx + l_chunk_len;
-               end = p_idx + p_chunk_len;
-               do {
-                       ret = sync_chunk(primary_msl, local_msl, hi, msl_idx,
-                                       used, start, end);
-                       start += ret;
-               } while (start < end && ret >= 0);
-               /* if ret is negative, something went wrong */
-               if (ret < 0)
-                       return -1;
-
-               next_chunk_search_idx = p_idx + p_chunk_len;
-next_chunk:
-               /* skip to end of this chunk */
-               if (used) {
-                       p_idx = rte_fbarray_find_next_used(p_arr,
-                                       next_chunk_search_idx);
-               } else {
-                       p_idx = rte_fbarray_find_next_free(p_arr,
-                                       next_chunk_search_idx);
-               }
-       }
-       return 0;
-}
-
-static int
-sync_existing(struct rte_memseg_list *primary_msl,
-               struct rte_memseg_list *local_msl, struct hugepage_info *hi,
-               unsigned int msl_idx)
-{
-       int ret, dir_fd;
-
-       /* do not allow any page allocations during the time we're allocating,
-        * because file creation and locking operations are not atomic,
-        * and we might be the first or the last ones to use a particular page,
-        * so we need to ensure atomicity of every operation.
-        */
-       dir_fd = open(hi->hugedir, O_RDONLY);
-       if (dir_fd < 0) {
-               RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", __func__,
-                       hi->hugedir, strerror(errno));
-               return -1;
-       }
-       /* blocking writelock */
-       if (flock(dir_fd, LOCK_EX)) {
-               RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", __func__,
-                       hi->hugedir, strerror(errno));
-               close(dir_fd);
-               return -1;
-       }
-
-       /* ensure all allocated space is the same in both lists */
-       ret = sync_status(primary_msl, local_msl, hi, msl_idx, true);
-       if (ret < 0)
-               goto fail;
-
-       /* ensure all unallocated space is the same in both lists */
-       ret = sync_status(primary_msl, local_msl, hi, msl_idx, false);
-       if (ret < 0)
-               goto fail;
-
-       /* update version number */
-       local_msl->version = primary_msl->version;
-
-       close(dir_fd);
-
-       return 0;
-fail:
-       close(dir_fd);
-       return -1;
-}
-
-static int
-sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused)
-{
-       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
-       struct rte_memseg_list *primary_msl, *local_msl;
-       struct hugepage_info *hi = NULL;
-       unsigned int i;
-       int msl_idx;
-
-       if (msl->external)
-               return 0;
-
-       msl_idx = msl - mcfg->memsegs;
-       primary_msl = &mcfg->memsegs[msl_idx];
-       local_msl = &local_memsegs[msl_idx];
-
-       for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) {
-               uint64_t cur_sz =
-                       internal_config.hugepage_info[i].hugepage_sz;
-               uint64_t msl_sz = primary_msl->page_sz;
-               if (msl_sz == cur_sz) {
-                       hi = &internal_config.hugepage_info[i];
-                       break;
-               }
-       }
-       if (!hi) {
-               RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
-               return -1;
-       }
-
-       /* if versions don't match, synchronize everything */
-       if (local_msl->version != primary_msl->version &&
-                       sync_existing(primary_msl, local_msl, hi, msl_idx))
-               return -1;
-       return 0;
-}
-
-
-int
-eal_memalloc_sync_with_primary(void)
-{
-       /* nothing to be done in primary */
-       if (rte_eal_process_type() == RTE_PROC_PRIMARY)
-               return 0;
-
-       /* memalloc is locked, so it's safe to call thread-unsafe version */
-       if (rte_memseg_list_walk_thread_unsafe(sync_walk, NULL))
-               return -1;
-       return 0;
-}
-
-static int
-secondary_msl_create_walk(const struct rte_memseg_list *msl,
-               void *arg __rte_unused)
-{
-       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
-       struct rte_memseg_list *primary_msl, *local_msl;
-       char name[PATH_MAX];
-       int msl_idx, ret;
-
-       if (msl->external)
-               return 0;
-
-       msl_idx = msl - mcfg->memsegs;
-       primary_msl = &mcfg->memsegs[msl_idx];
-       local_msl = &local_memsegs[msl_idx];
-
-       /* create distinct fbarrays for each secondary */
-       snprintf(name, RTE_FBARRAY_NAME_LEN, "%s_%i",
-               primary_msl->memseg_arr.name, getpid());
-
-       ret = rte_fbarray_init(&local_msl->memseg_arr, name,
-               primary_msl->memseg_arr.len,
-               primary_msl->memseg_arr.elt_sz);
-       if (ret < 0) {
-               RTE_LOG(ERR, EAL, "Cannot initialize local memory map\n");
-               return -1;
-       }
-       local_msl->base_va = primary_msl->base_va;
-       local_msl->len = primary_msl->len;
-
-       return 0;
-}
-
-static int
-alloc_list(int list_idx, int len)
-{
-       int *data;
-       int i;
-
-       /* ensure we have space to store fd per each possible segment */
-       data = malloc(sizeof(int) * len);
-       if (data == NULL) {
-               RTE_LOG(ERR, EAL, "Unable to allocate space for file descriptors\n");
-               return -1;
-       }
-       /* set all fd's as invalid */
-       for (i = 0; i < len; i++)
-               data[i] = -1;
-
-       fd_list[list_idx].fds = data;
-       fd_list[list_idx].len = len;
-       fd_list[list_idx].count = 0;
-       fd_list[list_idx].memseg_list_fd = -1;
-
-       return 0;
-}
-
-static int
-fd_list_create_walk(const struct rte_memseg_list *msl,
-               void *arg __rte_unused)
-{
-       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
-       unsigned int len;
-       int msl_idx;
-
-       if (msl->external)
-               return 0;
-
-       msl_idx = msl - mcfg->memsegs;
-       len = msl->memseg_arr.len;
-
-       return alloc_list(msl_idx, len);
-}
-
-int
-eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd)
-{
-       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
-
-       /* single file segments mode doesn't support individual segment fd's */
-       if (internal_config.single_file_segments)
-               return -ENOTSUP;
-
-       /* if list is not allocated, allocate it */
-       if (fd_list[list_idx].len == 0) {
-               int len = mcfg->memsegs[list_idx].memseg_arr.len;
-
-               if (alloc_list(list_idx, len) < 0)
-                       return -ENOMEM;
-       }
-       fd_list[list_idx].fds[seg_idx] = fd;
-
-       return 0;
-}
-
-int
-eal_memalloc_set_seg_list_fd(int list_idx, int fd)
-{
-       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
-
-       /* non-single file segment mode doesn't support segment list fd's */
-       if (!internal_config.single_file_segments)
-               return -ENOTSUP;
-
-       /* if list is not allocated, allocate it */
-       if (fd_list[list_idx].len == 0) {
-               int len = mcfg->memsegs[list_idx].memseg_arr.len;
-
-               if (alloc_list(list_idx, len) < 0)
-                       return -ENOMEM;
-       }
-
-       fd_list[list_idx].memseg_list_fd = fd;
-
-       return 0;
-}
-
-int
-eal_memalloc_get_seg_fd(int list_idx, int seg_idx)
-{
-       int fd;
-
-       if (internal_config.in_memory || internal_config.no_hugetlbfs) {
-#ifndef MEMFD_SUPPORTED
-               /* in in-memory or no-huge mode, we rely on memfd support */
-               return -ENOTSUP;
-#endif
-               /* memfd supported, but hugetlbfs memfd may not be */
-               if (!internal_config.no_hugetlbfs && !memfd_create_supported)
-                       return -ENOTSUP;
-       }
-
-       if (internal_config.single_file_segments) {
-               fd = fd_list[list_idx].memseg_list_fd;
-       } else if (fd_list[list_idx].len == 0) {
-               /* list not initialized */
-               fd = -1;
-       } else {
-               fd = fd_list[list_idx].fds[seg_idx];
-       }
-       if (fd < 0)
-               return -ENODEV;
-       return fd;
-}
-
-static int
-test_memfd_create(void)
-{
-#ifdef MEMFD_SUPPORTED
-       unsigned int i;
-       for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
-               uint64_t pagesz = internal_config.hugepage_info[i].hugepage_sz;
-               int pagesz_flag = pagesz_flags(pagesz);
-               int flags;
-
-               flags = pagesz_flag | RTE_MFD_HUGETLB;
-               int fd = memfd_create("test", flags);
-               if (fd < 0) {
-                       /* we failed - let memalloc know this isn't working */
-                       if (errno == EINVAL) {
-                               memfd_create_supported = 0;
-                               return 0; /* not supported */
-                       }
-
-                       /* we got other error - something's wrong */
-                       return -1; /* error */
-               }
-               close(fd);
-               return 1; /* supported */
-       }
-#endif
-       return 0; /* not supported */
-}
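
The probe above boils down to a single memfd_create() call with the hugetlb flag: EINVAL means the running kernel cannot back a memfd with hugepages of that size. A standalone sketch of the same check, assuming glibc >= 2.27 for the memfd_create() wrapper (RTE_MFD_HUGETLB used above is DPDK's equivalent of the kernel's MFD_HUGETLB):

    /* Probe whether memfd_create() can create hugetlb-backed files with the
     * default hugepage size. Returns 1 if supported, 0 if not, -1 on error. */
    #define _GNU_SOURCE
    #include <sys/mman.h>   /* memfd_create(), MFD_HUGETLB (glibc >= 2.27) */
    #include <errno.h>
    #include <unistd.h>

    static int
    probe_hugetlb_memfd(void)
    {
            int fd = memfd_create("probe", MFD_HUGETLB);

            if (fd < 0)
                    return errno == EINVAL ? 0 : -1;
            close(fd);
            return 1;
    }
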
-
-int
-eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset)
-{
-       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
-
-       if (internal_config.in_memory || internal_config.no_hugetlbfs) {
-#ifndef MEMFD_SUPPORTED
-               /* in in-memory or no-huge mode, we rely on memfd support */
-               return -ENOTSUP;
-#endif
-               /* memfd supported, but hugetlbfs memfd may not be */
-               if (!internal_config.no_hugetlbfs && !memfd_create_supported)
-                       return -ENOTSUP;
-       }
-
-       /* fd_list not initialized? */
-       if (fd_list[list_idx].len == 0)
-               return -ENODEV;
-       if (internal_config.single_file_segments) {
-               size_t pgsz = mcfg->memsegs[list_idx].page_sz;
-
-               /* segment not active? */
-               if (fd_list[list_idx].memseg_list_fd < 0)
-                       return -ENOENT;
-               *offset = pgsz * seg_idx;
-       } else {
-               /* segment not active? */
-               if (fd_list[list_idx].fds[seg_idx] < 0)
-                       return -ENOENT;
-               *offset = 0;
-       }
-       return 0;
-}
-
-int
-eal_memalloc_init(void)
-{
-       if (rte_eal_process_type() == RTE_PROC_SECONDARY)
-               if (rte_memseg_list_walk(secondary_msl_create_walk, NULL) < 0)
-                       return -1;
-       if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
-                       internal_config.in_memory) {
-               int mfd_res = test_memfd_create();
-
-               if (mfd_res < 0) {
-                       RTE_LOG(ERR, EAL, "Unable to check if memfd is supported\n");
-                       return -1;
-               }
-               if (mfd_res == 1)
-                       RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n");
-               else
-                       RTE_LOG(INFO, EAL, "Using memfd is not supported, falling back to anonymous hugepages\n");
-
-               /* we only support single-file segments mode with in-memory mode
-                * if we support hugetlbfs with memfd_create. this code will
-                * test if we do.
-                */
-               if (internal_config.single_file_segments &&
-                               mfd_res != 1) {
-                       RTE_LOG(ERR, EAL, "Single-file segments mode cannot be used without memfd support\n");
-                       return -1;
-               }
-               /* this cannot ever happen but better safe than sorry */
-               if (!anonymous_hugepages_supported) {
-                       RTE_LOG(ERR, EAL, "Using anonymous memory is not supported\n");
-                       return -1;
-               }
-       }
-
-       /* initialize all of the fd lists */
-       if (rte_memseg_list_walk(fd_list_create_walk, NULL))
-               return -1;
-       return 0;
-}
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
deleted file mode 100644 (file)
index 1b96b57..0000000
+++ /dev/null
@@ -1,2439 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation.
- * Copyright(c) 2013 6WIND S.A.
- */
-
-#define _FILE_OFFSET_BITS 64
-#include <errno.h>
-#include <fcntl.h>
-#include <stdarg.h>
-#include <stdbool.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <inttypes.h>
-#include <string.h>
-#include <sys/mman.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/queue.h>
-#include <sys/file.h>
-#include <sys/resource.h>
-#include <unistd.h>
-#include <limits.h>
-#include <sys/ioctl.h>
-#include <sys/time.h>
-#include <signal.h>
-#include <setjmp.h>
-#ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */
-#include <linux/memfd.h>
-#define MEMFD_SUPPORTED
-#endif
-#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
-#include <numa.h>
-#include <numaif.h>
-#endif
-
-#include <rte_errno.h>
-#include <rte_log.h>
-#include <rte_memory.h>
-#include <rte_launch.h>
-#include <rte_eal.h>
-#include <rte_eal_memconfig.h>
-#include <rte_per_lcore.h>
-#include <rte_lcore.h>
-#include <rte_common.h>
-#include <rte_string_fns.h>
-
-#include "eal_private.h"
-#include "eal_memalloc.h"
-#include "eal_internal_cfg.h"
-#include "eal_filesystem.h"
-#include "eal_hugepages.h"
-
-#define PFN_MASK_SIZE  8
-
-/**
- * @file
- * Huge page mapping under Linux
- *
- * To reserve a large contiguous region of memory, we use the hugepage
- * feature of Linux. For that, we need to have hugetlbfs mounted. This
- * code creates many files in that directory (one per page) and
- * maps them into virtual memory. For each page, we retrieve its
- * physical address and remap it in order to obtain a virtually
- * contiguous zone as well as a physically contiguous zone.
- */
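
Concretely, "one file per page" means creating a file under the hugetlbfs mount and mmap()ing one hugepage-sized region of it. A minimal sketch, with the 2 MB page size and the /dev/hugepages mount point as assumptions (DPDK discovers the real values at runtime):

    /* Map a single hugepage backed by a file in hugetlbfs. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int
    main(void)
    {
            const size_t page_sz = 2 * 1024 * 1024;       /* assumed hugepage size */
            const char *path = "/dev/hugepages/demo_map"; /* assumed mount point */
            int fd = open(path, O_CREAT | O_RDWR, 0600);
            void *va;

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            /* MAP_POPULATE faults the page in now instead of on first access */
            va = mmap(NULL, page_sz, PROT_READ | PROT_WRITE,
                            MAP_SHARED | MAP_POPULATE, fd, 0);
            if (va == MAP_FAILED) {
                    perror("mmap");
                    close(fd);
                    unlink(path);
                    return 1;
            }
            *(int *)va = 42;        /* touch the page */
            munmap(va, page_sz);
            close(fd);
            unlink(path);
            return 0;
    }
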
-
-static bool phys_addrs_available = true;
-
-#define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space"
-
-static void
-test_phys_addrs_available(void)
-{
-       uint64_t tmp = 0;
-       phys_addr_t physaddr;
-
-       if (!rte_eal_has_hugepages()) {
-               RTE_LOG(ERR, EAL,
-                       "Started without hugepages support, physical addresses not available\n");
-               phys_addrs_available = false;
-               return;
-       }
-
-       physaddr = rte_mem_virt2phy(&tmp);
-       if (physaddr == RTE_BAD_PHYS_ADDR) {
-               if (rte_eal_iova_mode() == RTE_IOVA_PA)
-                       RTE_LOG(ERR, EAL,
-                               "Cannot obtain physical addresses: %s. "
-                               "Only vfio will function.\n",
-                               strerror(errno));
-               phys_addrs_available = false;
-       }
-}
-
-/*
- * Get physical address of any mapped virtual address in the current process.
- */
-phys_addr_t
-rte_mem_virt2phy(const void *virtaddr)
-{
-       int fd, retval;
-       uint64_t page, physaddr;
-       unsigned long virt_pfn;
-       int page_size;
-       off_t offset;
-
-       /* Cannot parse /proc/self/pagemap, no need to log errors everywhere */
-       if (!phys_addrs_available)
-               return RTE_BAD_IOVA;
-
-       /* standard page size */
-       page_size = getpagesize();
-
-       fd = open("/proc/self/pagemap", O_RDONLY);
-       if (fd < 0) {
-               RTE_LOG(ERR, EAL, "%s(): cannot open /proc/self/pagemap: %s\n",
-                       __func__, strerror(errno));
-               return RTE_BAD_IOVA;
-       }
-
-       virt_pfn = (unsigned long)virtaddr / page_size;
-       offset = sizeof(uint64_t) * virt_pfn;
-       if (lseek(fd, offset, SEEK_SET) == (off_t) -1) {
-               RTE_LOG(ERR, EAL, "%s(): seek error in /proc/self/pagemap: %s\n",
-                               __func__, strerror(errno));
-               close(fd);
-               return RTE_BAD_IOVA;
-       }
-
-       retval = read(fd, &page, PFN_MASK_SIZE);
-       close(fd);
-       if (retval < 0) {
-               RTE_LOG(ERR, EAL, "%s(): cannot read /proc/self/pagemap: %s\n",
-                               __func__, strerror(errno));
-               return RTE_BAD_IOVA;
-       } else if (retval != PFN_MASK_SIZE) {
-               RTE_LOG(ERR, EAL, "%s(): read %d bytes from /proc/self/pagemap "
-                               "but expected %d:\n",
-                               __func__, retval, PFN_MASK_SIZE);
-               return RTE_BAD_IOVA;
-       }
-
-       /*
-        * the pfn (page frame number) are bits 0-54 (see
-        * pagemap.txt in linux Documentation)
-        */
-       if ((page & 0x7fffffffffffffULL) == 0)
-               return RTE_BAD_IOVA;
-
-       physaddr = ((page & 0x7fffffffffffffULL) * page_size)
-               + ((unsigned long)virtaddr % page_size);
-
-       return physaddr;
-}
-
-rte_iova_t
-rte_mem_virt2iova(const void *virtaddr)
-{
-       if (rte_eal_iova_mode() == RTE_IOVA_VA)
-               return (uintptr_t)virtaddr;
-       return rte_mem_virt2phy(virtaddr);
-}
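
A quick usage note: rte_mem_virt2iova() is the call most code should use, since it collapses to the virtual address itself in IOVA-as-VA mode and falls back to the pagemap lookup only in IOVA-as-PA mode. A short sketch (assumes EAL is initialized; reading pagemap PFNs typically requires root privileges):

    /* Print the virtual address and IOVA of a DPDK-allocated buffer. */
    #include <inttypes.h>
    #include <stdio.h>
    #include <rte_malloc.h>
    #include <rte_memory.h>

    static void
    print_iova_sketch(void)
    {
            void *buf = rte_malloc(NULL, 4096, 0);

            if (buf == NULL)
                    return;
            printf("va=%p iova=0x%" PRIx64 "\n",
                            buf, (uint64_t)rte_mem_virt2iova(buf));
            rte_free(buf);
    }
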
-
-/*
- * For each hugepage in hugepg_tbl, fill the physaddr value. We find
- * it by browsing the /proc/self/pagemap special file.
- */
-static int
-find_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
-{
-       unsigned int i;
-       phys_addr_t addr;
-
-       for (i = 0; i < hpi->num_pages[0]; i++) {
-               addr = rte_mem_virt2phy(hugepg_tbl[i].orig_va);
-               if (addr == RTE_BAD_PHYS_ADDR)
-                       return -1;
-               hugepg_tbl[i].physaddr = addr;
-       }
-       return 0;
-}
-
-/*
- * For each hugepage in hugepg_tbl, fill the physaddr value sequentially.
- */
-static int
-set_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
-{
-       unsigned int i;
-       static phys_addr_t addr;
-
-       for (i = 0; i < hpi->num_pages[0]; i++) {
-               hugepg_tbl[i].physaddr = addr;
-               addr += hugepg_tbl[i].size;
-       }
-       return 0;
-}
-
-/*
- * Check whether address-space layout randomization is enabled in
- * the kernel. This is important for multi-process operation, as ASLR
- * can prevent two processes from mapping data to the same virtual address.
- * Returns:
- *    0 - address space randomization disabled
- *    1/2 - address space randomization enabled
- *    negative error code on error
- */
-static int
-aslr_enabled(void)
-{
-       char c;
-       int retval, fd = open(RANDOMIZE_VA_SPACE_FILE, O_RDONLY);
-       if (fd < 0)
-               return -errno;
-       retval = read(fd, &c, 1);
-       close(fd);
-       if (retval < 0)
-               return -errno;
-       if (retval == 0)
-               return -EIO;
-       switch (c) {
-               case '0' : return 0;
-               case '1' : return 1;
-               case '2' : return 2;
-               default: return -EINVAL;
-       }
-}
-
-static sigjmp_buf huge_jmpenv;
-
-static void huge_sigbus_handler(int signo __rte_unused)
-{
-       siglongjmp(huge_jmpenv, 1);
-}
-
-/* Wrap sigsetjmp in a helper function to avoid compilation errors: any
- * non-volatile, non-static local variable in the stack frame calling
- * sigsetjmp might be clobbered by a call to longjmp.
- */
-static int huge_wrap_sigsetjmp(void)
-{
-       return sigsetjmp(huge_jmpenv, 1);
-}
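
The sigsetjmp()/SIGBUS pair above is a general pattern for surviving hugepage over-commit: the fault arrives when the page is touched, and the handler long-jumps back instead of letting the default action kill the process. A self-contained sketch of the pattern (names are illustrative, not DPDK's):

    /* Touch a mapping that may exceed the hugetlb quota without being
     * killed: SIGBUS is raised at fault time and the handler jumps back. */
    #include <setjmp.h>
    #include <signal.h>
    #include <stdio.h>
    #include <string.h>

    static sigjmp_buf jmpenv;

    static void
    sigbus_handler(int sig)
    {
            (void)sig;
            siglongjmp(jmpenv, 1);
    }

    static int
    touch_page(volatile int *page)
    {
            struct sigaction sa;

            memset(&sa, 0, sizeof(sa));
            sa.sa_handler = sigbus_handler;
            sigaction(SIGBUS, &sa, NULL);

            if (sigsetjmp(jmpenv, 1)) {
                    fprintf(stderr, "SIGBUS: page could not be faulted in\n");
                    return -1;
            }
            *page = *page;  /* fault the page in; may raise SIGBUS */
            return 0;
    }
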
-
-#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
-/* Callback for numa library. */
-void numa_error(char *where)
-{
-       RTE_LOG(ERR, EAL, "%s failed: %s\n", where, strerror(errno));
-}
-#endif
-
-/*
- * Mmap all hugepages of the hugepage table: it first opens a file in
- * hugetlbfs, then mmap()s hugepage_sz bytes of data in it. If orig is set, the
- * virtual address is stored in hugepg_tbl[i].orig_va, else it is stored
- * in hugepg_tbl[i].final_va. The second mapping (when orig is 0) tries to
- * map contiguous physical blocks in contiguous virtual blocks.
- */
-static unsigned
-map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
-                 uint64_t *essential_memory __rte_unused)
-{
-       int fd;
-       unsigned i;
-       void *virtaddr;
-#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
-       int node_id = -1;
-       int essential_prev = 0;
-       int oldpolicy;
-       struct bitmask *oldmask = NULL;
-       bool have_numa = true;
-       unsigned long maxnode = 0;
-
-       /* Check if kernel supports NUMA. */
-       if (numa_available() != 0) {
-               RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n");
-               have_numa = false;
-       }
-
-       if (have_numa) {
-               RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
-               oldmask = numa_allocate_nodemask();
-               if (get_mempolicy(&oldpolicy, oldmask->maskp,
-                                 oldmask->size + 1, 0, 0) < 0) {
-                       RTE_LOG(ERR, EAL,
-                               "Failed to get current mempolicy: %s. "
-                               "Assuming MPOL_DEFAULT.\n", strerror(errno));
-                       oldpolicy = MPOL_DEFAULT;
-               }
-               for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
-                       if (internal_config.socket_mem[i])
-                               maxnode = i + 1;
-       }
-#endif
-
-       for (i = 0; i < hpi->num_pages[0]; i++) {
-               struct hugepage_file *hf = &hugepg_tbl[i];
-               uint64_t hugepage_sz = hpi->hugepage_sz;
-
-#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
-               if (maxnode) {
-                       unsigned int j;
-
-                       for (j = 0; j < maxnode; j++)
-                               if (essential_memory[j])
-                                       break;
-
-                       if (j == maxnode) {
-                               node_id = (node_id + 1) % maxnode;
-                               while (!internal_config.socket_mem[node_id]) {
-                                       node_id++;
-                                       node_id %= maxnode;
-                               }
-                               essential_prev = 0;
-                       } else {
-                               node_id = j;
-                               essential_prev = essential_memory[j];
-
-                               if (essential_memory[j] < hugepage_sz)
-                                       essential_memory[j] = 0;
-                               else
-                                       essential_memory[j] -= hugepage_sz;
-                       }
-
-                       RTE_LOG(DEBUG, EAL,
-                               "Setting policy MPOL_PREFERRED for socket %d\n",
-                               node_id);
-                       numa_set_preferred(node_id);
-               }
-#endif
-
-               hf->file_id = i;
-               hf->size = hugepage_sz;
-               eal_get_hugefile_path(hf->filepath, sizeof(hf->filepath),
-                               hpi->hugedir, hf->file_id);
-               hf->filepath[sizeof(hf->filepath) - 1] = '\0';
-
-               /* try to create hugepage file */
-               fd = open(hf->filepath, O_CREAT | O_RDWR, 0600);
-               if (fd < 0) {
-                       RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
-                                       strerror(errno));
-                       goto out;
-               }
-
-               /* map the segment and populate page tables;
-                * the kernel fills this segment with zeros. we don't care where
-                * this gets mapped - we already have contiguous memory areas
-                * ready for us to map into.
-                */
-               virtaddr = mmap(NULL, hugepage_sz, PROT_READ | PROT_WRITE,
-                               MAP_SHARED | MAP_POPULATE, fd, 0);
-               if (virtaddr == MAP_FAILED) {
-                       RTE_LOG(DEBUG, EAL, "%s(): mmap failed: %s\n", __func__,
-                                       strerror(errno));
-                       close(fd);
-                       goto out;
-               }
-
-               hf->orig_va = virtaddr;
-
-               /* In Linux, hugetlb limitations such as cgroups are
-                * enforced at fault time rather than at mmap() time, even
-                * with MAP_POPULATE. The kernel sends a SIGBUS signal in
-                * that case. To avoid being killed, save the stack
-                * environment here; if SIGBUS happens, we can jump
-                * back to it.
-                */
-               if (huge_wrap_sigsetjmp()) {
-                       RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more "
-                               "hugepages of size %u MB\n",
-                               (unsigned int)(hugepage_sz / 0x100000));
-                       munmap(virtaddr, hugepage_sz);
-                       close(fd);
-                       unlink(hugepg_tbl[i].filepath);
-#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
-                       if (maxnode)
-                               essential_memory[node_id] =
-                                       essential_prev;
-#endif
-                       goto out;
-               }
-               *(int *)virtaddr = 0;
-
-               /* set shared lock on the file. */
-               if (flock(fd, LOCK_SH) < 0) {
-                       RTE_LOG(DEBUG, EAL, "%s(): Locking file failed: %s\n",
-                               __func__, strerror(errno));
-                       close(fd);
-                       goto out;
-               }
-
-               close(fd);
-       }
-
-out:
-#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
-       if (maxnode) {
-               RTE_LOG(DEBUG, EAL,
-                       "Restoring previous memory policy: %d\n", oldpolicy);
-               if (oldpolicy == MPOL_DEFAULT) {
-                       numa_set_localalloc();
-               } else if (set_mempolicy(oldpolicy, oldmask->maskp,
-                                        oldmask->size + 1) < 0) {
-                       RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n",
-                               strerror(errno));
-                       numa_set_localalloc();
-               }
-       }
-       if (oldmask != NULL)
-               numa_free_cpumask(oldmask);
-#endif
-       return i;
-}
-
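
The loop above steers each hugepage fault to a chosen NUMA node with numa_set_preferred() and then restores the caller's policy on the way out. Below is a minimal standalone sketch of that save/prefer/restore pattern, assuming libnuma is available (link with -lnuma); the node number and allocation size are arbitrary illustration values, not anything taken from the EAL.

    #include <numa.h>
    #include <numaif.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    int main(void)
    {
            struct bitmask *oldmask;
            int oldpolicy;
            char *buf;

            if (numa_available() < 0) {
                    fprintf(stderr, "no NUMA support\n");
                    return 1;
            }

            /* save the current memory policy so it can be restored later */
            oldmask = numa_allocate_nodemask();
            if (get_mempolicy(&oldpolicy, oldmask->maskp,
                            oldmask->size + 1, 0, 0) < 0) {
                    perror("get_mempolicy");
                    oldpolicy = MPOL_DEFAULT;
            }

            /* prefer node 0 for the allocations that follow (illustrative) */
            numa_set_preferred(0);

            buf = malloc(1 << 20);
            if (buf != NULL)
                    memset(buf, 0, 1 << 20); /* faults steered to node 0 */

            /* restore the previous policy, mirroring the cleanup above */
            if (oldpolicy == MPOL_DEFAULT)
                    numa_set_localalloc();
            else if (set_mempolicy(oldpolicy, oldmask->maskp,
                            oldmask->size + 1) < 0)
                    numa_set_localalloc();
            numa_free_nodemask(oldmask);

            free(buf);
            return 0;
    }
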
-/*
- * Parse /proc/self/numa_maps to get the NUMA socket ID for each huge
- * page.
- */
-static int
-find_numasocket(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
-{
-       int socket_id;
-       char *end, *nodestr;
-       unsigned i, hp_count = 0;
-       uint64_t virt_addr;
-       char buf[BUFSIZ];
-       char hugedir_str[PATH_MAX];
-       FILE *f;
-
-       f = fopen("/proc/self/numa_maps", "r");
-       if (f == NULL) {
-               RTE_LOG(NOTICE, EAL, "NUMA support not available,"
-                       " assuming all memory is on socket_id 0\n");
-               return 0;
-       }
-
-       snprintf(hugedir_str, sizeof(hugedir_str),
-                       "%s/%s", hpi->hugedir, eal_get_hugefile_prefix());
-
-       /* parse numa map */
-       while (fgets(buf, sizeof(buf), f) != NULL) {
-
-               /* ignore non huge page */
-               if (strstr(buf, " huge ") == NULL &&
-                               strstr(buf, hugedir_str) == NULL)
-                       continue;
-
-               /* get zone addr */
-               virt_addr = strtoull(buf, &end, 16);
-               if (virt_addr == 0 || end == buf) {
-                       RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
-                       goto error;
-               }
-
-               /* get node id (socket id) */
-               nodestr = strstr(buf, " N");
-               if (nodestr == NULL) {
-                       RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
-                       goto error;
-               }
-               nodestr += 2;
-               end = strstr(nodestr, "=");
-               if (end == NULL) {
-                       RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
-                       goto error;
-               }
-               end[0] = '\0';
-               end = NULL;
-
-               socket_id = strtoul(nodestr, &end, 0);
-               if ((nodestr[0] == '\0') || (end == NULL) || (*end != '\0')) {
-                       RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
-                       goto error;
-               }
-
-               /* if we find this page in our mappings, set socket_id */
-               for (i = 0; i < hpi->num_pages[0]; i++) {
-                       void *va = (void *)(unsigned long)virt_addr;
-                       if (hugepg_tbl[i].orig_va == va) {
-                               hugepg_tbl[i].socket_id = socket_id;
-                               hp_count++;
-#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
-                               RTE_LOG(DEBUG, EAL,
-                                       "Hugepage %s is on socket %d\n",
-                                       hugepg_tbl[i].filepath, socket_id);
-#endif
-                       }
-               }
-       }
-
-       if (hp_count < hpi->num_pages[0])
-               goto error;
-
-       fclose(f);
-       return 0;
-
-error:
-       fclose(f);
-       return -1;
-}
-
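
find_numasocket() derives each page's node from /proc/self/numa_maps. As a rough standalone illustration of that file's layout (not DPDK code), the sketch below prints the start address and the first N<node>= field of every hugetlbfs line; it only prints something if the calling process actually has hugepage mappings.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    int main(void)
    {
            char buf[BUFSIZ];
            FILE *f = fopen("/proc/self/numa_maps", "r");

            if (f == NULL) {
                    perror("fopen");
                    return 1;
            }
            while (fgets(buf, sizeof(buf), f) != NULL) {
                    unsigned long long va;
                    char *end, *nodestr;
                    long node;

                    /* hugetlbfs mappings are marked with " huge " */
                    if (strstr(buf, " huge ") == NULL)
                            continue;

                    /* the first token is the start address of the mapping */
                    va = strtoull(buf, &end, 16);

                    /* " N<node>=<pages>" says where the pages live */
                    nodestr = strstr(buf, " N");
                    if (nodestr == NULL)
                            continue;
                    node = strtol(nodestr + 2, &end, 10);
                    if (*end != '=')
                            continue;

                    printf("mapping 0x%llx is on node %ld\n", va, node);
            }
            fclose(f);
            return 0;
    }
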
-static int
-cmp_physaddr(const void *a, const void *b)
-{
-#ifndef RTE_ARCH_PPC_64
-       const struct hugepage_file *p1 = a;
-       const struct hugepage_file *p2 = b;
-#else
-       /* PowerPC needs memory sorted in reverse order from x86 */
-       const struct hugepage_file *p1 = b;
-       const struct hugepage_file *p2 = a;
-#endif
-       if (p1->physaddr < p2->physaddr)
-               return -1;
-       else if (p1->physaddr > p2->physaddr)
-               return 1;
-       else
-               return 0;
-}
-
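
cmp_physaddr() is handed to qsort() later in this file to order the hugepage table by physical address (reversed on PPC64). Here is a tiny usage sketch with a stand-in struct; only the physaddr field mirrors the real struct hugepage_file, and the addresses are invented.

    #include <inttypes.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct page {                   /* stand-in for struct hugepage_file */
            uint64_t physaddr;
    };

    static int
    cmp_physaddr(const void *a, const void *b)
    {
            const struct page *p1 = a;
            const struct page *p2 = b;

            if (p1->physaddr < p2->physaddr)
                    return -1;
            if (p1->physaddr > p2->physaddr)
                    return 1;
            return 0;
    }

    int main(void)
    {
            struct page tbl[] = { {0x3000}, {0x1000}, {0x2000} };
            size_t i, n = sizeof(tbl) / sizeof(tbl[0]);

            /* ascending physical-address order, as on the x86 path */
            qsort(tbl, n, sizeof(tbl[0]), cmp_physaddr);

            for (i = 0; i < n; i++)
                    printf("0x%" PRIx64 "\n", tbl[i].physaddr);
            return 0;
    }
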
-/*
- * Uses mmap to create a shared memory area for storage of data.
- * Used in this file to store the hugepage file map on disk.
- */
-static void *
-create_shared_memory(const char *filename, const size_t mem_size)
-{
-       void *retval;
-       int fd;
-
-       /* if no shared files mode is used, create anonymous memory instead */
-       if (internal_config.no_shconf) {
-               retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE,
-                               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-               if (retval == MAP_FAILED)
-                       return NULL;
-               return retval;
-       }
-
-       fd = open(filename, O_CREAT | O_RDWR, 0666);
-       if (fd < 0)
-               return NULL;
-       if (ftruncate(fd, mem_size) < 0) {
-               close(fd);
-               return NULL;
-       }
-       retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-       close(fd);
-       if (retval == MAP_FAILED)
-               return NULL;
-       return retval;
-}
-
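
create_shared_memory() is how the primary process publishes the hugepage table that eal_legacy_hugepage_attach() later maps read-only. A compact sketch of that write-then-attach flow in a single process is shown below, using an invented /tmp path in place of eal_hugepage_data_path().

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            const char *path = "/tmp/hugepage-table-demo"; /* invented */
            const size_t len = 4096;
            void *wr, *rd;
            int fd;

            /* "primary": create, size and fill the shared file mapping */
            fd = open(path, O_CREAT | O_RDWR, 0600);
            if (fd < 0 || ftruncate(fd, len) < 0) {
                    perror("create");
                    return 1;
            }
            wr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
            if (wr == MAP_FAILED) {
                    perror("mmap rw");
                    return 1;
            }
            strcpy(wr, "hugepage table goes here");
            close(fd);

            /* "secondary": reopen read-only and map the same data */
            fd = open(path, O_RDONLY);
            rd = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
            if (fd < 0 || rd == MAP_FAILED) {
                    perror("mmap ro");
                    return 1;
            }
            printf("secondary sees: %s\n", (const char *)rd);

            munmap(wr, len);
            munmap(rd, len);
            close(fd);
            unlink(path);
            return 0;
    }
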
-/*
- * this copies *active* hugepages from one hugepage table to another.
- * destination is typically the shared memory.
- */
-static int
-copy_hugepages_to_shared_mem(struct hugepage_file * dst, int dest_size,
-               const struct hugepage_file * src, int src_size)
-{
-       int src_pos, dst_pos = 0;
-
-       for (src_pos = 0; src_pos < src_size; src_pos++) {
-               if (src[src_pos].orig_va != NULL) {
-                       /* error on overflow attempt */
-                       if (dst_pos == dest_size)
-                               return -1;
-                       memcpy(&dst[dst_pos], &src[src_pos], sizeof(struct hugepage_file));
-                       dst_pos++;
-               }
-       }
-       return 0;
-}
-
-static int
-unlink_hugepage_files(struct hugepage_file *hugepg_tbl,
-               unsigned num_hp_info)
-{
-       unsigned socket, size;
-       int page, nrpages = 0;
-
-       /* get total number of hugepages */
-       for (size = 0; size < num_hp_info; size++)
-               for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++)
-                       nrpages +=
-                       internal_config.hugepage_info[size].num_pages[socket];
-
-       for (page = 0; page < nrpages; page++) {
-               struct hugepage_file *hp = &hugepg_tbl[page];
-
-               if (hp->orig_va != NULL && unlink(hp->filepath)) {
-                       RTE_LOG(WARNING, EAL, "%s(): Removing %s failed: %s\n",
-                               __func__, hp->filepath, strerror(errno));
-               }
-       }
-       return 0;
-}
-
-/*
- * unmaps hugepages that are not going to be used. since we originally allocate
- * ALL hugepages (not just those we need), additional unmapping needs to be done.
- */
-static int
-unmap_unneeded_hugepages(struct hugepage_file *hugepg_tbl,
-               struct hugepage_info *hpi,
-               unsigned num_hp_info)
-{
-       unsigned socket, size;
-       int page, nrpages = 0;
-
-       /* get total number of hugepages */
-       for (size = 0; size < num_hp_info; size++)
-               for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++)
-                       nrpages += internal_config.hugepage_info[size].num_pages[socket];
-
-       for (size = 0; size < num_hp_info; size++) {
-               for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) {
-                       unsigned pages_found = 0;
-
-                       /* traverse until we have unmapped all the unused pages */
-                       for (page = 0; page < nrpages; page++) {
-                               struct hugepage_file *hp = &hugepg_tbl[page];
-
-                               /* find a page that matches the criteria */
-                               if ((hp->size == hpi[size].hugepage_sz) &&
-                                               (hp->socket_id == (int) socket)) {
-
-                                       /* if we skipped enough pages, unmap the rest */
-                                       if (pages_found == hpi[size].num_pages[socket]) {
-                                               uint64_t unmap_len;
-
-                                               unmap_len = hp->size;
-
-                                               /* get start addr and len of the remaining segment */
-                                               munmap(hp->orig_va,
-                                                       (size_t)unmap_len);
-
-                                               hp->orig_va = NULL;
-                                               if (unlink(hp->filepath) == -1) {
-                                                       RTE_LOG(ERR, EAL, "%s(): Removing %s failed: %s\n",
-                                                                       __func__, hp->filepath, strerror(errno));
-                                                       return -1;
-                                               }
-                                       } else {
-                                               /* account for this page and skip it */
-                                               pages_found++;
-                                       }
-
-                               } /* match page */
-                       } /* foreach page */
-               } /* foreach socket */
-       } /* foreach pagesize */
-
-       return 0;
-}
-
-static int
-remap_segment(struct hugepage_file *hugepages, int seg_start, int seg_end)
-{
-       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
-       struct rte_memseg_list *msl;
-       struct rte_fbarray *arr;
-       int cur_page, seg_len;
-       unsigned int msl_idx;
-       int ms_idx;
-       uint64_t page_sz;
-       size_t memseg_len;
-       int socket_id;
-
-       page_sz = hugepages[seg_start].size;
-       socket_id = hugepages[seg_start].socket_id;
-       seg_len = seg_end - seg_start;
-
-       RTE_LOG(DEBUG, EAL, "Attempting to map %" PRIu64 "M on socket %i\n",
-                       (seg_len * page_sz) >> 20ULL, socket_id);
-
-       /* find free space in memseg lists */
-       for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
-               bool empty;
-               msl = &mcfg->memsegs[msl_idx];
-               arr = &msl->memseg_arr;
-
-               if (msl->page_sz != page_sz)
-                       continue;
-               if (msl->socket_id != socket_id)
-                       continue;
-
-               /* leave space for a hole if array is not empty */
-               empty = arr->count == 0;
-               ms_idx = rte_fbarray_find_next_n_free(arr, 0,
-                               seg_len + (empty ? 0 : 1));
-
-               /* memseg list is full? */
-               if (ms_idx < 0)
-                       continue;
-
-               /* leave some space between memsegs, they are not IOVA
-                * contiguous, so they shouldn't be VA contiguous either.
-                */
-               if (!empty)
-                       ms_idx++;
-               break;
-       }
-       if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
-               RTE_LOG(ERR, EAL, "Could not find space for memseg. Please increase %s and/or %s in configuration.\n",
-                               RTE_STR(CONFIG_RTE_MAX_MEMSEG_PER_TYPE),
-                               RTE_STR(CONFIG_RTE_MAX_MEM_PER_TYPE));
-               return -1;
-       }
-
-#ifdef RTE_ARCH_PPC_64
-       /* for PPC64 we go through the list backwards */
-       for (cur_page = seg_end - 1; cur_page >= seg_start;
-                       cur_page--, ms_idx++) {
-#else
-       for (cur_page = seg_start; cur_page < seg_end; cur_page++, ms_idx++) {
-#endif
-               struct hugepage_file *hfile = &hugepages[cur_page];
-               struct rte_memseg *ms = rte_fbarray_get(arr, ms_idx);
-               void *addr;
-               int fd;
-
-               fd = open(hfile->filepath, O_RDWR);
-               if (fd < 0) {
-                       RTE_LOG(ERR, EAL, "Could not open '%s': %s\n",
-                                       hfile->filepath, strerror(errno));
-                       return -1;
-               }
-               /* set shared lock on the file. */
-               if (flock(fd, LOCK_SH) < 0) {
-                       RTE_LOG(DEBUG, EAL, "Could not lock '%s': %s\n",
-                                       hfile->filepath, strerror(errno));
-                       close(fd);
-                       return -1;
-               }
-               memseg_len = (size_t)page_sz;
-               addr = RTE_PTR_ADD(msl->base_va, ms_idx * memseg_len);
-
-               /* we know this address is already mmapped by memseg list, so
-                * using MAP_FIXED here is safe
-                */
-               addr = mmap(addr, page_sz, PROT_READ | PROT_WRITE,
-                               MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, 0);
-               if (addr == MAP_FAILED) {
-                       RTE_LOG(ERR, EAL, "Couldn't remap '%s': %s\n",
-                                       hfile->filepath, strerror(errno));
-                       close(fd);
-                       return -1;
-               }
-
-               /* we have a new address, so unmap previous one */
-#ifndef RTE_ARCH_64
-               /* in 32-bit legacy mode, we have already unmapped the page */
-               if (!internal_config.legacy_mem)
-                       munmap(hfile->orig_va, page_sz);
-#else
-               munmap(hfile->orig_va, page_sz);
-#endif
-
-               hfile->orig_va = NULL;
-               hfile->final_va = addr;
-
-               /* rewrite physical addresses in IOVA as VA mode */
-               if (rte_eal_iova_mode() == RTE_IOVA_VA)
-                       hfile->physaddr = (uintptr_t)addr;
-
-               /* set up memseg data */
-               ms->addr = addr;
-               ms->hugepage_sz = page_sz;
-               ms->len = memseg_len;
-               ms->iova = hfile->physaddr;
-               ms->socket_id = hfile->socket_id;
-               ms->nchannel = rte_memory_get_nchannel();
-               ms->nrank = rte_memory_get_nrank();
-
-               rte_fbarray_set_used(arr, ms_idx);
-
-               /* store segment fd internally */
-               if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0)
-                       RTE_LOG(ERR, EAL, "Could not store segment fd: %s\n",
-                               rte_strerror(rte_errno));
-       }
-       RTE_LOG(DEBUG, EAL, "Allocated %" PRIu64 "M on socket %i\n",
-                       (seg_len * page_sz) >> 20, socket_id);
-       return 0;
-}
-
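
remap_segment() can pass MAP_FIXED because the target addresses lie inside VA space the memseg list has already reserved. Below is a minimal sketch of that reserve-then-fix pattern, using an anonymous PROT_NONE reservation and a throwaway backing file; the path and sizes are invented for the example.

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            const size_t page = 4096;
            const size_t resv_sz = 16 * page;
            void *resv, *fixed;
            int fd;

            /* reserve a contiguous VA range without backing it yet */
            resv = mmap(NULL, resv_sz, PROT_NONE,
                            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            if (resv == MAP_FAILED) {
                    perror("mmap reserve");
                    return 1;
            }

            /* a throwaway file standing in for a hugepage file */
            fd = open("/tmp/remap-demo", O_CREAT | O_RDWR, 0600);
            if (fd < 0 || ftruncate(fd, page) < 0) {
                    perror("backing file");
                    return 1;
            }

            /* map the file at a chosen offset inside the reservation;
             * MAP_FIXED is safe because we own that address range */
            fixed = mmap((char *)resv + 4 * page, page,
                            PROT_READ | PROT_WRITE,
                            MAP_SHARED | MAP_FIXED, fd, 0);
            if (fixed == MAP_FAILED) {
                    perror("mmap fixed");
                    return 1;
            }
            strcpy(fixed, "hello");
            printf("mapped at %p inside reservation %p\n", fixed, resv);

            munmap(resv, resv_sz); /* also drops the fixed mapping */
            close(fd);
            unlink("/tmp/remap-demo");
            return 0;
    }
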
-static uint64_t
-get_mem_amount(uint64_t page_sz, uint64_t max_mem)
-{
-       uint64_t area_sz, max_pages;
-
-       /* limit to RTE_MAX_MEMSEG_PER_LIST pages or RTE_MAX_MEM_MB_PER_LIST */
-       max_pages = RTE_MAX_MEMSEG_PER_LIST;
-       max_mem = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20, max_mem);
-
-       area_sz = RTE_MIN(page_sz * max_pages, max_mem);
-
-       /* make sure the list isn't smaller than the page size */
-       area_sz = RTE_MAX(area_sz, page_sz);
-
-       return RTE_ALIGN(area_sz, page_sz);
-}
-
-static int
-free_memseg_list(struct rte_memseg_list *msl)
-{
-       if (rte_fbarray_destroy(&msl->memseg_arr)) {
-               RTE_LOG(ERR, EAL, "Cannot destroy memseg list\n");
-               return -1;
-       }
-       memset(msl, 0, sizeof(*msl));
-       return 0;
-}
-
-#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i-%i"
-static int
-alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz,
-               int n_segs, int socket_id, int type_msl_idx)
-{
-       char name[RTE_FBARRAY_NAME_LEN];
-
-       snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id,
-                type_msl_idx);
-       if (rte_fbarray_init(&msl->memseg_arr, name, n_segs,
-                       sizeof(struct rte_memseg))) {
-               RTE_LOG(ERR, EAL, "Cannot allocate memseg list: %s\n",
-                       rte_strerror(rte_errno));
-               return -1;
-       }
-
-       msl->page_sz = page_sz;
-       msl->socket_id = socket_id;
-       msl->base_va = NULL;
-
-       RTE_LOG(DEBUG, EAL, "Memseg list allocated: 0x%zxkB at socket %i\n",
-                       (size_t)page_sz >> 10, socket_id);
-
-       return 0;
-}
-
-static int
-alloc_va_space(struct rte_memseg_list *msl)
-{
-       uint64_t page_sz;
-       size_t mem_sz;
-       void *addr;
-       int flags = 0;
-
-       page_sz = msl->page_sz;
-       mem_sz = page_sz * msl->memseg_arr.len;
-
-       addr = eal_get_virtual_area(msl->base_va, &mem_sz, page_sz, 0, flags);
-       if (addr == NULL) {
-               if (rte_errno == EADDRNOTAVAIL)
-                       RTE_LOG(ERR, EAL, "Could not mmap %llu bytes at [%p] - please use '--base-virtaddr' option\n",
-                               (unsigned long long)mem_sz, msl->base_va);
-               else
-                       RTE_LOG(ERR, EAL, "Cannot reserve memory\n");
-               return -1;
-       }
-       msl->base_va = addr;
-       msl->len = mem_sz;
-
-       return 0;
-}
-
-/*
- * Our VA space is not preallocated yet, so preallocate it here. We need to know
- * how many segments there are in order to map all pages into one address space,
- * and leave appropriate holes between segments so that rte_malloc does not
- * concatenate them into one big segment.
- *
- * we also need to unmap original pages to free up address space.
- */
-static int __rte_unused
-prealloc_segments(struct hugepage_file *hugepages, int n_pages)
-{
-       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
-       int cur_page, seg_start_page, end_seg, new_memseg;
-       unsigned int hpi_idx, socket, i;
-       int n_contig_segs, n_segs;
-       int msl_idx;
-
-       /* before we preallocate segments, we need to free up our VA space.
-        * we're not removing files, and we already have information about
-        * PA-contiguousness, so it is safe to unmap everything.
-        */
-       for (cur_page = 0; cur_page < n_pages; cur_page++) {
-               struct hugepage_file *hpi = &hugepages[cur_page];
-               munmap(hpi->orig_va, hpi->size);
-               hpi->orig_va = NULL;
-       }
-
-       /* we cannot know how many page sizes and sockets we have discovered, so
-        * loop over all of them
-        */
-       for (hpi_idx = 0; hpi_idx < internal_config.num_hugepage_sizes;
-                       hpi_idx++) {
-               uint64_t page_sz =
-                       internal_config.hugepage_info[hpi_idx].hugepage_sz;
-
-               for (i = 0; i < rte_socket_count(); i++) {
-                       struct rte_memseg_list *msl;
-
-                       socket = rte_socket_id_by_idx(i);
-                       n_contig_segs = 0;
-                       n_segs = 0;
-                       seg_start_page = -1;
-
-                       for (cur_page = 0; cur_page < n_pages; cur_page++) {
-                               struct hugepage_file *prev, *cur;
-                               int prev_seg_start_page = -1;
-
-                               cur = &hugepages[cur_page];
-                               prev = cur_page == 0 ? NULL :
-                                               &hugepages[cur_page - 1];
-
-                               new_memseg = 0;
-                               end_seg = 0;
-
-                               if (cur->size == 0)
-                                       end_seg = 1;
-                               else if (cur->socket_id != (int) socket)
-                                       end_seg = 1;
-                               else if (cur->size != page_sz)
-                                       end_seg = 1;
-                               else if (cur_page == 0)
-                                       new_memseg = 1;
-#ifdef RTE_ARCH_PPC_64
-                               /* On the PPC64 architecture, mmap always maps
-                                * from higher addresses to lower addresses.
-                                * Here, physical addresses are in descending
-                                * order.
-                                */
-                               else if ((prev->physaddr - cur->physaddr) !=
-                                               cur->size)
-                                       new_memseg = 1;
-#else
-                               else if ((cur->physaddr - prev->physaddr) !=
-                                               cur->size)
-                                       new_memseg = 1;
-#endif
-                               if (new_memseg) {
-                                       /* if we're already inside a segment,
-                                        * new segment means end of current one
-                                        */
-                                       if (seg_start_page != -1) {
-                                               end_seg = 1;
-                                               prev_seg_start_page =
-                                                               seg_start_page;
-                                       }
-                                       seg_start_page = cur_page;
-                               }
-
-                               if (end_seg) {
-                                       if (prev_seg_start_page != -1) {
-                                               /* we've found a new segment */
-                                               n_contig_segs++;
-                                               n_segs += cur_page -
-                                                       prev_seg_start_page;
-                                       } else if (seg_start_page != -1) {
-                                               /* we didn't find new segment,
-                                                * but did end current one
-                                                */
-                                               n_contig_segs++;
-                                               n_segs += cur_page -
-                                                               seg_start_page;
-                                               seg_start_page = -1;
-                                               continue;
-                                       } else {
-                                               /* we're skipping this page */
-                                               continue;
-                                       }
-                               }
-                               /* segment continues */
-                       }
-                       /* check if we missed last segment */
-                       if (seg_start_page != -1) {
-                               n_contig_segs++;
-                               n_segs += cur_page - seg_start_page;
-                       }
-
-                       /* if no segments were found, do not preallocate */
-                       if (n_segs == 0)
-                               continue;
-
-                       /* we now have total number of pages that we will
-                        * allocate for this segment list. add separator pages
-                        * to the total count, and preallocate VA space.
-                        */
-                       n_segs += n_contig_segs - 1;
-
-                       /* now, preallocate VA space for these segments */
-
-                       /* first, find suitable memseg list for this */
-                       for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS;
-                                       msl_idx++) {
-                               msl = &mcfg->memsegs[msl_idx];
-
-                               if (msl->base_va != NULL)
-                                       continue;
-                               break;
-                       }
-                       if (msl_idx == RTE_MAX_MEMSEG_LISTS) {
-                               RTE_LOG(ERR, EAL, "Not enough space in memseg lists, please increase %s\n",
-                                       RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
-                               return -1;
-                       }
-
-                       /* now, allocate fbarray itself */
-                       if (alloc_memseg_list(msl, page_sz, n_segs, socket,
-                                               msl_idx) < 0)
-                               return -1;
-
-                       /* finally, allocate VA space */
-                       if (alloc_va_space(msl) < 0)
-                               return -1;
-               }
-       }
-       return 0;
-}
-
-/*
- * We cannot reallocate memseg lists on the fly because PPC64 stores pages
- * backwards, therefore we have to process the entire memseg first before
- * remapping it into memseg list VA space.
- */
-static int
-remap_needed_hugepages(struct hugepage_file *hugepages, int n_pages)
-{
-       int cur_page, seg_start_page, new_memseg, ret;
-
-       seg_start_page = 0;
-       for (cur_page = 0; cur_page < n_pages; cur_page++) {
-               struct hugepage_file *prev, *cur;
-
-               new_memseg = 0;
-
-               cur = &hugepages[cur_page];
-               prev = cur_page == 0 ? NULL : &hugepages[cur_page - 1];
-
-               /* if size is zero, no more pages left */
-               if (cur->size == 0)
-                       break;
-
-               if (cur_page == 0)
-                       new_memseg = 1;
-               else if (cur->socket_id != prev->socket_id)
-                       new_memseg = 1;
-               else if (cur->size != prev->size)
-                       new_memseg = 1;
-#ifdef RTE_ARCH_PPC_64
-               /* On the PPC64 architecture, mmap always maps from higher
-                * addresses to lower addresses. Here, physical addresses
-                * are in descending order.
-                */
-               else if ((prev->physaddr - cur->physaddr) != cur->size)
-                       new_memseg = 1;
-#else
-               else if ((cur->physaddr - prev->physaddr) != cur->size)
-                       new_memseg = 1;
-#endif
-
-               if (new_memseg) {
-                       /* if this isn't the first time, remap segment */
-                       if (cur_page != 0) {
-                               ret = remap_segment(hugepages, seg_start_page,
-                                               cur_page);
-                               if (ret != 0)
-                                       return -1;
-                       }
-                       /* remember where we started */
-                       seg_start_page = cur_page;
-               }
-               /* continuation of previous memseg */
-       }
-       /* we were stopped, but we didn't remap the last segment, do it now */
-       if (cur_page != 0) {
-               ret = remap_segment(hugepages, seg_start_page,
-                               cur_page);
-               if (ret != 0)
-                       return -1;
-       }
-       return 0;
-}
-
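
remap_needed_hugepages() starts a new memseg whenever the socket, page size, or physical contiguity changes between neighbouring entries of the physaddr-sorted table. A small sketch of just the contiguity test on a plain array follows; the addresses and sizes are invented, and the struct is only a stand-in for struct hugepage_file.

    #include <stdint.h>
    #include <stdio.h>

    struct page {           /* stand-in for struct hugepage_file */
            uint64_t physaddr;
            uint64_t size;
    };

    int main(void)
    {
            /* physaddr-sorted; a gap sits between entries 2 and 3 */
            struct page pages[] = {
                    {0x40000000, 0x200000},
                    {0x40200000, 0x200000},
                    {0x40400000, 0x200000},
                    {0x40800000, 0x200000},
            };
            int i, n = 4, seg_start = 0;

            for (i = 1; i <= n; i++) {
                    int new_seg = (i == n) ||
                            (pages[i].physaddr - pages[i - 1].physaddr !=
                                    pages[i].size);
                    if (new_seg) {
                            printf("contiguous run: pages %d..%d\n",
                                            seg_start, i - 1);
                            seg_start = i;
                    }
            }
            return 0;
    }
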
-static inline uint64_t
-get_socket_mem_size(int socket)
-{
-       uint64_t size = 0;
-       unsigned i;
-
-       for (i = 0; i < internal_config.num_hugepage_sizes; i++){
-               struct hugepage_info *hpi = &internal_config.hugepage_info[i];
-               size += hpi->hugepage_sz * hpi->num_pages[socket];
-       }
-
-       return size;
-}
-
-/*
- * This function is a NUMA-aware equivalent of calc_num_pages.
- * It takes in the list of hugepage sizes and the
- * number of pages thereof, and calculates the best number of
- * pages of each size to fulfill the request for <memory> RAM.
- */
-static int
-calc_num_pages_per_socket(uint64_t * memory,
-               struct hugepage_info *hp_info,
-               struct hugepage_info *hp_used,
-               unsigned num_hp_info)
-{
-       unsigned socket, j, i = 0;
-       unsigned requested, available;
-       int total_num_pages = 0;
-       uint64_t remaining_mem, cur_mem;
-       uint64_t total_mem = internal_config.memory;
-
-       if (num_hp_info == 0)
-               return -1;
-
-       /* if specific memory amounts per socket weren't requested */
-       if (internal_config.force_sockets == 0) {
-               size_t total_size;
-#ifdef RTE_ARCH_64
-               int cpu_per_socket[RTE_MAX_NUMA_NODES];
-               size_t default_size;
-               unsigned lcore_id;
-
-               /* Compute number of cores per socket */
-               memset(cpu_per_socket, 0, sizeof(cpu_per_socket));
-               RTE_LCORE_FOREACH(lcore_id) {
-                       cpu_per_socket[rte_lcore_to_socket_id(lcore_id)]++;
-               }
-
-               /*
-                * Automatically spread requested memory amongst detected sockets according
-                * to number of cores from cpu mask present on each socket
-                */
-               total_size = internal_config.memory;
-               for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) {
-
-                       /* Set memory amount per socket */
-                       default_size = (internal_config.memory * cpu_per_socket[socket])
-                                       / rte_lcore_count();
-
-                       /* Limit to maximum available memory on socket */
-                       default_size = RTE_MIN(default_size, get_socket_mem_size(socket));
-
-                       /* Update sizes */
-                       memory[socket] = default_size;
-                       total_size -= default_size;
-               }
-
-               /*
-                * If some memory is remaining, try to allocate it by getting all
-                * available memory from sockets, one after the other
-                */
-               for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) {
-                       /* take whatever is available */
-                       default_size = RTE_MIN(get_socket_mem_size(socket) - memory[socket],
-                                              total_size);
-
-                       /* Update sizes */
-                       memory[socket] += default_size;
-                       total_size -= default_size;
-               }
-#else
-               /* in 32-bit mode, allocate all of the memory only on master
-                * lcore socket
-                */
-               total_size = internal_config.memory;
-               for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0;
-                               socket++) {
-                       struct rte_config *cfg = rte_eal_get_configuration();
-                       unsigned int master_lcore_socket;
-
-                       master_lcore_socket =
-                               rte_lcore_to_socket_id(cfg->master_lcore);
-
-                       if (master_lcore_socket != socket)
-                               continue;
-
-                       /* Update sizes */
-                       memory[socket] = total_size;
-                       break;
-               }
-#endif
-       }
-
-       for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) {
-               /* skips if the memory on specific socket wasn't requested */
-               for (i = 0; i < num_hp_info && memory[socket] != 0; i++){
-                       strlcpy(hp_used[i].hugedir, hp_info[i].hugedir,
-                               sizeof(hp_used[i].hugedir));
-                       hp_used[i].num_pages[socket] = RTE_MIN(
-                                       memory[socket] / hp_info[i].hugepage_sz,
-                                       hp_info[i].num_pages[socket]);
-
-                       cur_mem = hp_used[i].num_pages[socket] *
-                                       hp_used[i].hugepage_sz;
-
-                       memory[socket] -= cur_mem;
-                       total_mem -= cur_mem;
-
-                       total_num_pages += hp_used[i].num_pages[socket];
-
-                       /* check if we have met all memory requests */
-                       if (memory[socket] == 0)
-                               break;
-
-                       /* if we have used up all the pages of this size on
-                        * this socket, move on to the next size */
-                       if (hp_used[i].num_pages[socket] == hp_info[i].num_pages[socket])
-                               continue;
-                       /* At this point, pages of this size are still
-                        * available, but each one is bigger than the memory
-                        * still required, so let's see if we can get enough
-                        * from the remaining page sizes.
-                        */
-                       remaining_mem = 0;
-                       for (j = i+1; j < num_hp_info; j++)
-                               remaining_mem += hp_info[j].hugepage_sz *
-                               hp_info[j].num_pages[socket];
-
-                       /* if the other sizes cannot cover it, allocate one
-                        * more page of this size and finish with this socket */
-                       if (remaining_mem < memory[socket]){
-                               cur_mem = RTE_MIN(memory[socket],
-                                               hp_info[i].hugepage_sz);
-                               memory[socket] -= cur_mem;
-                               total_mem -= cur_mem;
-                               hp_used[i].num_pages[socket]++;
-                               total_num_pages++;
-                               break; /* we are done with this socket*/
-                       }
-               }
-               /* if we didn't satisfy all memory requirements per socket */
-               if (memory[socket] > 0 &&
-                               internal_config.socket_mem[socket] != 0) {
-                       /* to prevent icc errors */
-                       requested = (unsigned) (internal_config.socket_mem[socket] /
-                                       0x100000);
-                       available = requested -
-                                       ((unsigned) (memory[socket] / 0x100000));
-                       RTE_LOG(ERR, EAL, "Not enough memory available on socket %u! "
-                                       "Requested: %uMB, available: %uMB\n", socket,
-                                       requested, available);
-                       return -1;
-               }
-       }
-
-       /* if we didn't satisfy total memory requirements */
-       if (total_mem > 0) {
-               requested = (unsigned) (internal_config.memory / 0x100000);
-               available = requested - (unsigned) (total_mem / 0x100000);
-               RTE_LOG(ERR, EAL, "Not enough memory available! Requested: %uMB,"
-                               " available: %uMB\n", requested, available);
-               return -1;
-       }
-       return total_num_pages;
-}
-
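
When no per-socket amounts are forced, the 64-bit branch above first splits the requested memory across sockets in proportion to their lcore counts and then hands any remainder to whichever socket still has capacity. A toy sketch of that two-pass split with hard-coded core counts and capacities is shown below; all numbers are invented.

    #include <inttypes.h>
    #include <stdio.h>

    #define NB_SOCKETS 2

    static uint64_t min64(uint64_t a, uint64_t b) { return a < b ? a : b; }

    int main(void)
    {
            /* invented inputs: lcores per socket and per-socket capacity */
            unsigned int cores[NB_SOCKETS] = { 6, 2 };
            uint64_t avail[NB_SOCKETS] = { 4096ULL << 20, 1024ULL << 20 };
            uint64_t memory[NB_SOCKETS] = { 0, 0 };
            uint64_t requested = 2048ULL << 20, left = requested;
            unsigned int total_cores = 8, s;

            /* pass 1: split in proportion to cores, capped by capacity */
            for (s = 0; s < NB_SOCKETS && left != 0; s++) {
                    uint64_t want = requested * cores[s] / total_cores;

                    memory[s] = min64(want, avail[s]);
                    left -= memory[s];
            }
            /* pass 2: hand out any remainder, socket by socket */
            for (s = 0; s < NB_SOCKETS && left != 0; s++) {
                    uint64_t extra = min64(avail[s] - memory[s], left);

                    memory[s] += extra;
                    left -= extra;
            }
            for (s = 0; s < NB_SOCKETS; s++)
                    printf("socket %u: %" PRIu64 " MB\n", s, memory[s] >> 20);
            return 0;
    }
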
-static inline size_t
-eal_get_hugepage_mem_size(void)
-{
-       uint64_t size = 0;
-       unsigned i, j;
-
-       for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
-               struct hugepage_info *hpi = &internal_config.hugepage_info[i];
-               if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0) {
-                       for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
-                               size += hpi->hugepage_sz * hpi->num_pages[j];
-                       }
-               }
-       }
-
-       return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX;
-}
-
-static struct sigaction huge_action_old;
-static int huge_need_recover;
-
-static void
-huge_register_sigbus(void)
-{
-       sigset_t mask;
-       struct sigaction action;
-
-       sigemptyset(&mask);
-       sigaddset(&mask, SIGBUS);
-       action.sa_flags = 0;
-       action.sa_mask = mask;
-       action.sa_handler = huge_sigbus_handler;
-
-       huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
-}
-
-static void
-huge_recover_sigbus(void)
-{
-       if (huge_need_recover) {
-               sigaction(SIGBUS, &huge_action_old, NULL);
-               huge_need_recover = 0;
-       }
-}
-
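
These two helpers pair with the sigsetjmp wrapper used in map_all_hugepages(): touching an over-committed hugetlb page raises SIGBUS at fault time, and the handler long-jumps back instead of letting the process die. A generic standalone sketch of that pattern follows; the fault is simulated with raise(SIGBUS) rather than a real hugetlb mapping.

    #include <setjmp.h>
    #include <signal.h>
    #include <stdio.h>
    #include <string.h>

    static sigjmp_buf jmpenv;

    static void
    sigbus_handler(int sig)
    {
            (void)sig;
            /* jump back to the sigsetjmp() point, restoring the mask */
            siglongjmp(jmpenv, 1);
    }

    int main(void)
    {
            struct sigaction action, old;

            memset(&action, 0, sizeof(action));
            sigemptyset(&action.sa_mask);
            sigaddset(&action.sa_mask, SIGBUS);
            action.sa_handler = sigbus_handler;
            sigaction(SIGBUS, &action, &old);

            if (sigsetjmp(jmpenv, 1)) {
                    /* we land here from the handler after the "fault" */
                    printf("recovered from SIGBUS, cleaning up\n");
            } else {
                    /* simulate touching a page the kernel cannot back */
                    raise(SIGBUS);
                    printf("not reached\n");
            }

            sigaction(SIGBUS, &old, NULL); /* restore the old handler */
            return 0;
    }
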
-/*
- * Prepare physical memory mapping: fill configuration structure with
- * these infos, return 0 on success.
- *  1. map N huge pages in separate files in hugetlbfs
- *  2. find associated physical addr
- *  3. find associated NUMA socket ID
- *  4. sort all huge pages by physical address
- *  5. remap these N huge pages in the correct order
- *  6. unmap the first mapping
- *  7. fill memsegs in configuration with contiguous zones
- */
-static int
-eal_legacy_hugepage_init(void)
-{
-       struct rte_mem_config *mcfg;
-       struct hugepage_file *hugepage = NULL, *tmp_hp = NULL;
-       struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
-       struct rte_fbarray *arr;
-       struct rte_memseg *ms;
-
-       uint64_t memory[RTE_MAX_NUMA_NODES];
-
-       unsigned hp_offset;
-       int i, j;
-       int nr_hugefiles, nr_hugepages = 0;
-       void *addr;
-
-       test_phys_addrs_available();
-
-       memset(used_hp, 0, sizeof(used_hp));
-
-       /* get pointer to global configuration */
-       mcfg = rte_eal_get_configuration()->mem_config;
-
-       /* hugetlbfs can be disabled */
-       if (internal_config.no_hugetlbfs) {
-               struct rte_memseg_list *msl;
-               int n_segs, cur_seg, fd, flags;
-#ifdef MEMFD_SUPPORTED
-               int memfd;
-#endif
-               uint64_t page_sz;
-
-               /* nohuge mode is legacy mode */
-               internal_config.legacy_mem = 1;
-
-               /* nohuge mode is single-file segments mode */
-               internal_config.single_file_segments = 1;
-
-               /* create a memseg list */
-               msl = &mcfg->memsegs[0];
-
-               page_sz = RTE_PGSIZE_4K;
-               n_segs = internal_config.memory / page_sz;
-
-               if (rte_fbarray_init(&msl->memseg_arr, "nohugemem", n_segs,
-                                       sizeof(struct rte_memseg))) {
-                       RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n");
-                       return -1;
-               }
-
-               /* set up parameters for anonymous mmap */
-               fd = -1;
-               flags = MAP_PRIVATE | MAP_ANONYMOUS;
-
-#ifdef MEMFD_SUPPORTED
-               /* create a memfd and store it in the segment fd table */
-               memfd = memfd_create("nohuge", 0);
-               if (memfd < 0) {
-                       RTE_LOG(DEBUG, EAL, "Cannot create memfd: %s\n",
-                                       strerror(errno));
-                       RTE_LOG(DEBUG, EAL, "Falling back to anonymous map\n");
-               } else {
-                       /* we got an fd - now resize it */
-                       if (ftruncate(memfd, internal_config.memory) < 0) {
-                               RTE_LOG(ERR, EAL, "Cannot resize memfd: %s\n",
-                                               strerror(errno));
-                               RTE_LOG(ERR, EAL, "Falling back to anonymous map\n");
-                               close(memfd);
-                       } else {
-                               /* creating memfd-backed file was successful.
-                                * we want changes to memfd to be visible to
-                                * other processes (such as vhost backend), so
-                                * map it as shared memory.
-                                */
-                               RTE_LOG(DEBUG, EAL, "Using memfd for anonymous memory\n");
-                               fd = memfd;
-                               flags = MAP_SHARED;
-                       }
-               }
-#endif
-               addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE,
-                               flags, fd, 0);
-               if (addr == MAP_FAILED) {
-                       RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__,
-                                       strerror(errno));
-                       return -1;
-               }
-               msl->base_va = addr;
-               msl->page_sz = page_sz;
-               msl->socket_id = 0;
-               msl->len = internal_config.memory;
-
-               /* we're in single-file segments mode, so only the segment list
-                * fd needs to be set up.
-                */
-               if (fd != -1) {
-                       if (eal_memalloc_set_seg_list_fd(0, fd) < 0) {
-                               RTE_LOG(ERR, EAL, "Cannot set up segment list fd\n");
-                               /* not a serious error, proceed */
-                       }
-               }
-
-               /* populate memsegs. each memseg is one page long */
-               for (cur_seg = 0; cur_seg < n_segs; cur_seg++) {
-                       arr = &msl->memseg_arr;
-
-                       ms = rte_fbarray_get(arr, cur_seg);
-                       if (rte_eal_iova_mode() == RTE_IOVA_VA)
-                               ms->iova = (uintptr_t)addr;
-                       else
-                               ms->iova = RTE_BAD_IOVA;
-                       ms->addr = addr;
-                       ms->hugepage_sz = page_sz;
-                       ms->socket_id = 0;
-                       ms->len = page_sz;
-
-                       rte_fbarray_set_used(arr, cur_seg);
-
-                       addr = RTE_PTR_ADD(addr, (size_t)page_sz);
-               }
-               if (mcfg->dma_maskbits &&
-                   rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) {
-                       RTE_LOG(ERR, EAL,
-                               "%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask.\n",
-                               __func__);
-                       if (rte_eal_iova_mode() == RTE_IOVA_VA &&
-                           rte_eal_using_phys_addrs())
-                               RTE_LOG(ERR, EAL,
-                                       "%s(): Please try initializing EAL with --iova-mode=pa parameter.\n",
-                                       __func__);
-                       goto fail;
-               }
-               return 0;
-       }
-
-       /* calculate total number of hugepages available. at this point we haven't
-        * yet started sorting them so they all are on socket 0 */
-       for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
-               /* meanwhile, also initialize used_hp hugepage sizes in used_hp */
-               used_hp[i].hugepage_sz = internal_config.hugepage_info[i].hugepage_sz;
-
-               nr_hugepages += internal_config.hugepage_info[i].num_pages[0];
-       }
-
-       /*
-        * Allocate a memory area for the hugepage table.
-        * This isn't shared memory yet. Because we still need to do some
-        * processing on these pages, the shared memory will be created
-        * at a later stage.
-        */
-       tmp_hp = malloc(nr_hugepages * sizeof(struct hugepage_file));
-       if (tmp_hp == NULL)
-               goto fail;
-
-       memset(tmp_hp, 0, nr_hugepages * sizeof(struct hugepage_file));
-
-       hp_offset = 0; /* where we start the current page size entries */
-
-       huge_register_sigbus();
-
-       /* make a copy of socket_mem, needed for balanced allocation. */
-       for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
-               memory[i] = internal_config.socket_mem[i];
-
-       /* map all hugepages and sort them */
-       for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
-               unsigned pages_old, pages_new;
-               struct hugepage_info *hpi;
-
-               /*
-                * We don't yet mark hugepages as used at this stage, so
-                * we just map all hugepages available to the system;
-                * all hugepages are still located on socket 0.
-                */
-               hpi = &internal_config.hugepage_info[i];
-
-               if (hpi->num_pages[0] == 0)
-                       continue;
-
-               /* map all hugepages available */
-               pages_old = hpi->num_pages[0];
-               pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, memory);
-               if (pages_new < pages_old) {
-                       RTE_LOG(DEBUG, EAL,
-                               "%d not %d hugepages of size %u MB allocated\n",
-                               pages_new, pages_old,
-                               (unsigned)(hpi->hugepage_sz / 0x100000));
-
-                       int pages = pages_old - pages_new;
-
-                       nr_hugepages -= pages;
-                       hpi->num_pages[0] = pages_new;
-                       if (pages_new == 0)
-                               continue;
-               }
-
-               if (phys_addrs_available &&
-                               rte_eal_iova_mode() != RTE_IOVA_VA) {
-                       /* find physical addresses for each hugepage */
-                       if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
-                               RTE_LOG(DEBUG, EAL, "Failed to find phys addr "
-                                       "for %u MB pages\n",
-                                       (unsigned int)(hpi->hugepage_sz / 0x100000));
-                               goto fail;
-                       }
-               } else {
-                       /* set physical addresses for each hugepage */
-                       if (set_physaddrs(&tmp_hp[hp_offset], hpi) < 0) {
-                               RTE_LOG(DEBUG, EAL, "Failed to set phys addr "
-                                       "for %u MB pages\n",
-                                       (unsigned int)(hpi->hugepage_sz / 0x100000));
-                               goto fail;
-                       }
-               }
-
-               if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0){
-                       RTE_LOG(DEBUG, EAL, "Failed to find NUMA socket for %u MB pages\n",
-                                       (unsigned)(hpi->hugepage_sz / 0x100000));
-                       goto fail;
-               }
-
-               qsort(&tmp_hp[hp_offset], hpi->num_pages[0],
-                     sizeof(struct hugepage_file), cmp_physaddr);
-
-               /* we have processed a number of hugepages of this size, so advance the offset */
-               hp_offset += hpi->num_pages[0];
-       }
-
-       huge_recover_sigbus();
-
-       if (internal_config.memory == 0 && internal_config.force_sockets == 0)
-               internal_config.memory = eal_get_hugepage_mem_size();
-
-       nr_hugefiles = nr_hugepages;
-
-
-       /* clean out the numbers of pages */
-       for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++)
-               for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
-                       internal_config.hugepage_info[i].num_pages[j] = 0;
-
-       /* get hugepages for each socket */
-       for (i = 0; i < nr_hugefiles; i++) {
-               int socket = tmp_hp[i].socket_id;
-
-               /* find a hugepage info with right size and increment num_pages */
-               const int nb_hpsizes = RTE_MIN(MAX_HUGEPAGE_SIZES,
-                               (int)internal_config.num_hugepage_sizes);
-               for (j = 0; j < nb_hpsizes; j++) {
-                       if (tmp_hp[i].size ==
-                                       internal_config.hugepage_info[j].hugepage_sz) {
-                               internal_config.hugepage_info[j].num_pages[socket]++;
-                       }
-               }
-       }
-
-       /* make a copy of socket_mem, needed for number of pages calculation */
-       for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
-               memory[i] = internal_config.socket_mem[i];
-
-       /* calculate final number of pages */
-       nr_hugepages = calc_num_pages_per_socket(memory,
-                       internal_config.hugepage_info, used_hp,
-                       internal_config.num_hugepage_sizes);
-
-       /* error if not enough memory available */
-       if (nr_hugepages < 0)
-               goto fail;
-
-       /* reporting in! */
-       for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
-               for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
-                       if (used_hp[i].num_pages[j] > 0) {
-                               RTE_LOG(DEBUG, EAL,
-                                       "Requesting %u pages of size %uMB"
-                                       " from socket %i\n",
-                                       used_hp[i].num_pages[j],
-                                       (unsigned)
-                                       (used_hp[i].hugepage_sz / 0x100000),
-                                       j);
-                       }
-               }
-       }
-
-       /* create shared memory */
-       hugepage = create_shared_memory(eal_hugepage_data_path(),
-                       nr_hugefiles * sizeof(struct hugepage_file));
-
-       if (hugepage == NULL) {
-               RTE_LOG(ERR, EAL, "Failed to create shared memory!\n");
-               goto fail;
-       }
-       memset(hugepage, 0, nr_hugefiles * sizeof(struct hugepage_file));
-
-       /*
-        * Unmap the pages that we won't need (based on used_hp).
-        * Also sets orig_va to NULL on pages that were unmapped.
-        */
-       if (unmap_unneeded_hugepages(tmp_hp, used_hp,
-                       internal_config.num_hugepage_sizes) < 0) {
-               RTE_LOG(ERR, EAL, "Unmapping and locking hugepages failed!\n");
-               goto fail;
-       }
-
-       /*
-        * Copy entries from the malloc'd hugepage table to the actual
-        * shared memory. Only hugepages with a non-NULL orig_va are
-        * copied, and overflow is checked.
-        */
-       if (copy_hugepages_to_shared_mem(hugepage, nr_hugefiles,
-                       tmp_hp, nr_hugefiles) < 0) {
-               RTE_LOG(ERR, EAL, "Copying tables to shared memory failed!\n");
-               goto fail;
-       }
-
-#ifndef RTE_ARCH_64
-       /* for legacy 32-bit mode, we did not preallocate VA space, so do it */
-       if (internal_config.legacy_mem &&
-                       prealloc_segments(hugepage, nr_hugefiles)) {
-               RTE_LOG(ERR, EAL, "Could not preallocate VA space for hugepages\n");
-               goto fail;
-       }
-#endif
-
-       /* remap all pages we do need into memseg list VA space, so that those
-        * pages become first-class citizens in DPDK memory subsystem
-        */
-       if (remap_needed_hugepages(hugepage, nr_hugefiles)) {
-               RTE_LOG(ERR, EAL, "Couldn't remap hugepage files into memseg lists\n");
-               goto fail;
-       }
-
-       /* free the hugepage backing files */
-       if (internal_config.hugepage_unlink &&
-               unlink_hugepage_files(tmp_hp, internal_config.num_hugepage_sizes) < 0) {
-               RTE_LOG(ERR, EAL, "Unlinking hugepage files failed!\n");
-               goto fail;
-       }
-
-       /* free the temporary hugepage table */
-       free(tmp_hp);
-       tmp_hp = NULL;
-
-       munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));
-       hugepage = NULL;
-
-       /* we're not going to allocate more pages, so release VA space for
-        * unused memseg lists
-        */
-       for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
-               struct rte_memseg_list *msl = &mcfg->memsegs[i];
-               size_t mem_sz;
-
-               /* skip inactive lists */
-               if (msl->base_va == NULL)
-                       continue;
-               /* skip lists where there is at least one page allocated */
-               if (msl->memseg_arr.count > 0)
-                       continue;
-               /* this is an unused list, deallocate it */
-               mem_sz = msl->len;
-               munmap(msl->base_va, mem_sz);
-               msl->base_va = NULL;
-
-               /* destroy backing fbarray */
-               rte_fbarray_destroy(&msl->memseg_arr);
-       }
-
-       if (mcfg->dma_maskbits &&
-           rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) {
-               RTE_LOG(ERR, EAL,
-                       "%s(): couldn't allocate memory due to IOVA exceeding limits of current DMA mask.\n",
-                       __func__);
-               goto fail;
-       }
-
-       return 0;
-
-fail:
-       huge_recover_sigbus();
-       free(tmp_hp);
-       if (hugepage != NULL)
-               munmap(hugepage, nr_hugefiles * sizeof(struct hugepage_file));
-
-       return -1;
-}
-
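
In the --no-huge branch of eal_legacy_hugepage_init(), a memfd is preferred over a plain anonymous mapping so the memory has a file descriptor that other processes (for example a vhost backend) can map. A bare-bones sketch of that memfd path is given below, assuming a kernel and libc that provide memfd_create() (Linux 3.17+, glibc 2.27+); the size and contents are arbitrary.

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            const size_t len = 1 << 20;
            void *addr;
            int fd;

            /* anonymous in-memory file; visible under /proc/<pid>/fd */
            fd = memfd_create("nohuge-demo", 0);
            if (fd < 0) {
                    perror("memfd_create");
                    return 1; /* the EAL falls back to MAP_ANONYMOUS */
            }
            if (ftruncate(fd, len) < 0) {
                    perror("ftruncate");
                    return 1;
            }

            /* MAP_SHARED so anyone mapping the fd sees the same pages */
            addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
                            fd, 0);
            if (addr == MAP_FAILED) {
                    perror("mmap");
                    return 1;
            }
            strcpy(addr, "shared through a memfd");
            printf("%s at %p (fd %d)\n", (char *)addr, addr, fd);

            munmap(addr, len);
            close(fd);
            return 0;
    }
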
-static int __rte_unused
-hugepage_count_walk(const struct rte_memseg_list *msl, void *arg)
-{
-       struct hugepage_info *hpi = arg;
-
-       if (msl->page_sz != hpi->hugepage_sz)
-               return 0;
-
-       hpi->num_pages[msl->socket_id] += msl->memseg_arr.len;
-       return 0;
-}
-
-static int
-limits_callback(int socket_id, size_t cur_limit, size_t new_len)
-{
-       RTE_SET_USED(socket_id);
-       RTE_SET_USED(cur_limit);
-       RTE_SET_USED(new_len);
-       return -1;
-}
-
-static int
-eal_hugepage_init(void)
-{
-       struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
-       uint64_t memory[RTE_MAX_NUMA_NODES];
-       int hp_sz_idx, socket_id;
-
-       test_phys_addrs_available();
-
-       memset(used_hp, 0, sizeof(used_hp));
-
-       for (hp_sz_idx = 0;
-                       hp_sz_idx < (int) internal_config.num_hugepage_sizes;
-                       hp_sz_idx++) {
-#ifndef RTE_ARCH_64
-               struct hugepage_info dummy;
-               unsigned int i;
-#endif
-               /* also initialize used_hp hugepage sizes in used_hp */
-               struct hugepage_info *hpi;
-               hpi = &internal_config.hugepage_info[hp_sz_idx];
-               used_hp[hp_sz_idx].hugepage_sz = hpi->hugepage_sz;
-
-#ifndef RTE_ARCH_64
-               /* for 32-bit, limit number of pages on socket to whatever we've
-                * preallocated, as we cannot allocate more.
-                */
-               memset(&dummy, 0, sizeof(dummy));
-               dummy.hugepage_sz = hpi->hugepage_sz;
-               if (rte_memseg_list_walk(hugepage_count_walk, &dummy) < 0)
-                       return -1;
-
-               for (i = 0; i < RTE_DIM(dummy.num_pages); i++) {
-                       hpi->num_pages[i] = RTE_MIN(hpi->num_pages[i],
-                                       dummy.num_pages[i]);
-               }
-#endif
-       }
-
-       /* make a copy of socket_mem, needed for balanced allocation. */
-       for (hp_sz_idx = 0; hp_sz_idx < RTE_MAX_NUMA_NODES; hp_sz_idx++)
-               memory[hp_sz_idx] = internal_config.socket_mem[hp_sz_idx];
-
-       /* calculate final number of pages */
-       if (calc_num_pages_per_socket(memory,
-                       internal_config.hugepage_info, used_hp,
-                       internal_config.num_hugepage_sizes) < 0)
-               return -1;
-
-       for (hp_sz_idx = 0;
-                       hp_sz_idx < (int)internal_config.num_hugepage_sizes;
-                       hp_sz_idx++) {
-               for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES;
-                               socket_id++) {
-                       struct rte_memseg **pages;
-                       struct hugepage_info *hpi = &used_hp[hp_sz_idx];
-                       unsigned int num_pages = hpi->num_pages[socket_id];
-                       int num_pages_alloc, i;
-
-                       if (num_pages == 0)
-                               continue;
-
-                       pages = malloc(sizeof(*pages) * num_pages);
-
-                       RTE_LOG(DEBUG, EAL, "Allocating %u pages of size %" PRIu64 "M on socket %i\n",
-                               num_pages, hpi->hugepage_sz >> 20, socket_id);
-
-                       num_pages_alloc = eal_memalloc_alloc_seg_bulk(pages,
-                                       num_pages, hpi->hugepage_sz,
-                                       socket_id, true);
-                       if (num_pages_alloc < 0) {
-                               free(pages);
-                               return -1;
-                       }
-
-                       /* mark preallocated pages as unfreeable */
-                       for (i = 0; i < num_pages_alloc; i++) {
-                               struct rte_memseg *ms = pages[i];
-                               ms->flags |= RTE_MEMSEG_FLAG_DO_NOT_FREE;
-                       }
-                       free(pages);
-               }
-       }
-       /* if socket limits were specified, set them */
-       if (internal_config.force_socket_limits) {
-               unsigned int i;
-               for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
-                       uint64_t limit = internal_config.socket_limit[i];
-                       if (limit == 0)
-                               continue;
-                       if (rte_mem_alloc_validator_register("socket-limit",
-                                       limits_callback, i, limit))
-                               RTE_LOG(ERR, EAL, "Failed to register socket limits validator callback\n");
-               }
-       }
-       return 0;
-}
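
The per-socket limit above is enforced entirely through the allocator's validator hook: limits_callback() unconditionally returns -1, so once the preallocated amount is reached, any further growth on that socket is refused. A minimal standalone sketch of wiring up such a hook, using only the callback signature and rte_mem_alloc_validator_register() call as they appear in this file; the header location, the function names and the 1 GB figure are assumptions:

    #include <stddef.h>
    #include <rte_memory.h>   /* assumed to declare the validator API used above */

    /* refuse any growth past the registered limit, as limits_callback() does */
    static int
    deny_growth(int socket_id, size_t cur_limit, size_t new_len)
    {
            (void)socket_id;
            (void)cur_limit;
            (void)new_len;
            return -1;
    }

    /* hypothetical wiring: cap socket 0 at 1 GB of dynamically allocated memory */
    static int
    install_socket_limit(void)
    {
            return rte_mem_alloc_validator_register("socket-limit-example",
                            deny_growth, 0, (size_t)1 << 30);
    }
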
-
-/*
- * uses fstat to report the size of a file on disk
- */
-static off_t
-getFileSize(int fd)
-{
-       struct stat st;
-       if (fstat(fd, &st) < 0)
-               return 0;
-       return st.st_size;
-}
-
-/*
- * This creates the memory mappings in the secondary process to match that of
- * the server process. It goes through each memory segment in the DPDK runtime
- * configuration and finds the hugepages which form that segment, mapping them
- * in order to form a contiguous block in the virtual memory space
- */
-static int
-eal_legacy_hugepage_attach(void)
-{
-       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
-       struct hugepage_file *hp = NULL;
-       unsigned int num_hp = 0;
-       unsigned int i = 0;
-       unsigned int cur_seg;
-       off_t size = 0;
-       int fd, fd_hugepage = -1;
-
-       if (aslr_enabled() > 0) {
-               RTE_LOG(WARNING, EAL, "WARNING: Address Space Layout Randomization "
-                               "(ASLR) is enabled in the kernel.\n");
-               RTE_LOG(WARNING, EAL, "   This may cause issues with mapping memory "
-                               "into secondary processes\n");
-       }
-
-       test_phys_addrs_available();
-
-       fd_hugepage = open(eal_hugepage_data_path(), O_RDONLY);
-       if (fd_hugepage < 0) {
-               RTE_LOG(ERR, EAL, "Could not open %s\n",
-                               eal_hugepage_data_path());
-               goto error;
-       }
-
-       size = getFileSize(fd_hugepage);
-       hp = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd_hugepage, 0);
-       if (hp == MAP_FAILED) {
-               RTE_LOG(ERR, EAL, "Could not mmap %s\n",
-                               eal_hugepage_data_path());
-               goto error;
-       }
-
-       num_hp = size / sizeof(struct hugepage_file);
-       RTE_LOG(DEBUG, EAL, "Analysing %u files\n", num_hp);
-
-       /* map all segments into memory to make sure we get the addrs. the
-        * segments themselves are already in memseg list (which is shared and
-        * has its VA space already preallocated), so we just need to map
-        * everything into correct addresses.
-        */
-       for (i = 0; i < num_hp; i++) {
-               struct hugepage_file *hf = &hp[i];
-               size_t map_sz = hf->size;
-               void *map_addr = hf->final_va;
-               int msl_idx, ms_idx;
-               struct rte_memseg_list *msl;
-               struct rte_memseg *ms;
-
-               /* if size is zero, no more pages left */
-               if (map_sz == 0)
-                       break;
-
-               fd = open(hf->filepath, O_RDWR);
-               if (fd < 0) {
-                       RTE_LOG(ERR, EAL, "Could not open %s: %s\n",
-                               hf->filepath, strerror(errno));
-                       goto error;
-               }
-
-               map_addr = mmap(map_addr, map_sz, PROT_READ | PROT_WRITE,
-                               MAP_SHARED | MAP_FIXED, fd, 0);
-               if (map_addr == MAP_FAILED) {
-                       RTE_LOG(ERR, EAL, "Could not map %s: %s\n",
-                               hf->filepath, strerror(errno));
-                       goto fd_error;
-               }
-
-               /* set shared lock on the file. */
-               if (flock(fd, LOCK_SH) < 0) {
-                       RTE_LOG(DEBUG, EAL, "%s(): Locking file failed: %s\n",
-                               __func__, strerror(errno));
-                       goto fd_error;
-               }
-
-               /* find segment data */
-               msl = rte_mem_virt2memseg_list(map_addr);
-               if (msl == NULL) {
-                       RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg list\n",
-                               __func__);
-                       goto fd_error;
-               }
-               ms = rte_mem_virt2memseg(map_addr, msl);
-               if (ms == NULL) {
-                       RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg\n",
-                               __func__);
-                       goto fd_error;
-               }
-
-               msl_idx = msl - mcfg->memsegs;
-               ms_idx = rte_fbarray_find_idx(&msl->memseg_arr, ms);
-               if (ms_idx < 0) {
-                       RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg idx\n",
-                               __func__);
-                       goto fd_error;
-               }
-
-               /* store segment fd internally */
-               if (eal_memalloc_set_seg_fd(msl_idx, ms_idx, fd) < 0)
-                       RTE_LOG(ERR, EAL, "Could not store segment fd: %s\n",
-                               rte_strerror(rte_errno));
-       }
-       /* unmap the hugepage config file, since we are done using it */
-       munmap(hp, size);
-       close(fd_hugepage);
-       return 0;
-
-fd_error:
-       close(fd);
-error:
-       /* unwind: unmap the segments we mapped before the failure */
-       for (cur_seg = 0; cur_seg < i; cur_seg++) {
-               struct hugepage_file *hf = &hp[cur_seg];
-               size_t map_sz = hf->size;
-               void *map_addr = hf->final_va;
-
-               munmap(map_addr, map_sz);
-       }
-       if (hp != NULL && hp != MAP_FAILED)
-               munmap(hp, size);
-       if (fd_hugepage >= 0)
-               close(fd_hugepage);
-       return -1;
-}
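
The attach path above is built from two ordinary POSIX ingredients: each hugepage file is mapped at the exact virtual address recorded by the primary process (MAP_FIXED on an address the memseg lists have already reserved), and a shared flock() is taken so the page cannot be removed while the secondary still uses it. A standalone sketch of that pattern, with a placeholder path and address, not the EAL code itself:

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/file.h>
    #include <sys/mman.h>
    #include <unistd.h>

    /* map "path" at the fixed address "want" and take a shared lock on it */
    static void *
    map_at_fixed_addr(const char *path, void *want, size_t len)
    {
            int fd = open(path, O_RDWR);
            if (fd < 0)
                    return NULL;

            void *va = mmap(want, len, PROT_READ | PROT_WRITE,
                            MAP_SHARED | MAP_FIXED, fd, 0);
            if (va == MAP_FAILED || flock(fd, LOCK_SH) < 0) {
                    fprintf(stderr, "map/lock %s failed: %s\n",
                            path, strerror(errno));
                    if (va != MAP_FAILED)
                            munmap(va, len);
                    close(fd);
                    return NULL;
            }
            /* keep fd open: it backs the mapping and holds the shared lock,
             * much as the code above stores it via eal_memalloc_set_seg_fd().
             */
            return va;
    }
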
-
-static int
-eal_hugepage_attach(void)
-{
-       if (eal_memalloc_sync_with_primary()) {
-               RTE_LOG(ERR, EAL, "Could not map memory from primary process\n");
-               if (aslr_enabled() > 0)
-                       RTE_LOG(ERR, EAL, "It is recommended to disable ASLR in the kernel and retry running both primary and secondary processes\n");
-               return -1;
-       }
-       return 0;
-}
-
-int
-rte_eal_hugepage_init(void)
-{
-       return internal_config.legacy_mem ?
-                       eal_legacy_hugepage_init() :
-                       eal_hugepage_init();
-}
-
-int
-rte_eal_hugepage_attach(void)
-{
-       return internal_config.legacy_mem ?
-                       eal_legacy_hugepage_attach() :
-                       eal_hugepage_attach();
-}
-
-int
-rte_eal_using_phys_addrs(void)
-{
-       return phys_addrs_available;
-}
-
-static int __rte_unused
-memseg_primary_init_32(void)
-{
-       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
-       int active_sockets, hpi_idx, msl_idx = 0;
-       unsigned int socket_id, i;
-       struct rte_memseg_list *msl;
-       uint64_t extra_mem_per_socket, total_extra_mem, total_requested_mem;
-       uint64_t max_mem;
-
-       /* no-huge does not need this at all */
-       if (internal_config.no_hugetlbfs)
-               return 0;
-
-       /* this is a giant hack, but desperate times call for desperate
-        * measures. in legacy 32-bit mode, we cannot preallocate VA space,
-        * because having upwards of 2 gigabytes of VA space already mapped will
-        * interfere with our ability to map and sort hugepages.
-        *
-        * therefore, in legacy 32-bit mode, we will be initializing memseg
-        * lists much later - in eal_memory.c, right after we unmap all the
-        * unneeded pages. this will not affect secondary processes, as those
-        * should be able to mmap the space without (too many) problems.
-        */
-       if (internal_config.legacy_mem)
-               return 0;
-
-       /* 32-bit mode is a very special case. we cannot know in advance where
-        * the user will want to allocate their memory, so we have to do some
-        * heuristics.
-        */
-       active_sockets = 0;
-       total_requested_mem = 0;
-       if (internal_config.force_sockets)
-               for (i = 0; i < rte_socket_count(); i++) {
-                       uint64_t mem;
-
-                       socket_id = rte_socket_id_by_idx(i);
-                       mem = internal_config.socket_mem[socket_id];
-
-                       if (mem == 0)
-                               continue;
-
-                       active_sockets++;
-                       total_requested_mem += mem;
-               }
-       else
-               total_requested_mem = internal_config.memory;
-
-       max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
-       if (total_requested_mem > max_mem) {
-               RTE_LOG(ERR, EAL, "Invalid parameters: 32-bit process can at most use %uM of memory\n",
-                               (unsigned int)(max_mem >> 20));
-               return -1;
-       }
-       total_extra_mem = max_mem - total_requested_mem;
-       extra_mem_per_socket = active_sockets == 0 ? total_extra_mem :
-                       total_extra_mem / active_sockets;
-
-       /* the allocation logic is a little bit convoluted, but here's how it
-        * works, in a nutshell:
-        *  - if user hasn't specified on which sockets to allocate memory via
- *    --socket-mem, we allocate all of our memory on the master core's socket.
-        *  - if user has specified sockets to allocate memory on, there may be
-        *    some "unused" memory left (e.g. if user has specified --socket-mem
-        *    such that not all memory adds up to 2 gigabytes), so add it to all
-        *    sockets that are in use equally.
-        *
-        * page sizes are sorted by size in descending order, so we can safely
-        * assume that we dispense with bigger page sizes first.
-        */
-
-       /* create memseg lists */
-       for (i = 0; i < rte_socket_count(); i++) {
-               int hp_sizes = (int) internal_config.num_hugepage_sizes;
-               uint64_t max_socket_mem, cur_socket_mem;
-               unsigned int master_lcore_socket;
-               struct rte_config *cfg = rte_eal_get_configuration();
-               bool skip;
-
-               socket_id = rte_socket_id_by_idx(i);
-
-#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
-               if (socket_id > 0)
-                       break;
-#endif
-
-               /* if we didn't specifically request memory on this socket */
-               skip = active_sockets != 0 &&
-                               internal_config.socket_mem[socket_id] == 0;
-               /* ...or if we didn't specifically request memory on *any*
-                * socket, and this is not master lcore
-                */
-               master_lcore_socket = rte_lcore_to_socket_id(cfg->master_lcore);
-               skip |= active_sockets == 0 && socket_id != master_lcore_socket;
-
-               if (skip) {
-                       RTE_LOG(DEBUG, EAL, "Will not preallocate memory on socket %u\n",
-                                       socket_id);
-                       continue;
-               }
-
-               /* max amount of memory on this socket */
-               max_socket_mem = (active_sockets != 0 ?
-                                       internal_config.socket_mem[socket_id] :
-                                       internal_config.memory) +
-                                       extra_mem_per_socket;
-               cur_socket_mem = 0;
-
-               for (hpi_idx = 0; hpi_idx < hp_sizes; hpi_idx++) {
-                       uint64_t max_pagesz_mem, cur_pagesz_mem = 0;
-                       uint64_t hugepage_sz;
-                       struct hugepage_info *hpi;
-                       int type_msl_idx, max_segs, total_segs = 0;
-
-                       hpi = &internal_config.hugepage_info[hpi_idx];
-                       hugepage_sz = hpi->hugepage_sz;
-
-                       /* check if pages are actually available */
-                       if (hpi->num_pages[socket_id] == 0)
-                               continue;
-
-                       max_segs = RTE_MAX_MEMSEG_PER_TYPE;
-                       max_pagesz_mem = max_socket_mem - cur_socket_mem;
-
-                       /* make it multiple of page size */
-                       max_pagesz_mem = RTE_ALIGN_FLOOR(max_pagesz_mem,
-                                       hugepage_sz);
-
-                       RTE_LOG(DEBUG, EAL, "Attempting to preallocate "
-                                       "%" PRIu64 "M on socket %i\n",
-                                       max_pagesz_mem >> 20, socket_id);
-
-                       type_msl_idx = 0;
-                       while (cur_pagesz_mem < max_pagesz_mem &&
-                                       total_segs < max_segs) {
-                               uint64_t cur_mem;
-                               unsigned int n_segs;
-
-                               if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
-                                       RTE_LOG(ERR, EAL,
-                                               "No more space in memseg lists, please increase %s\n",
-                                               RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
-                                       return -1;
-                               }
-
-                               msl = &mcfg->memsegs[msl_idx];
-
-                               cur_mem = get_mem_amount(hugepage_sz,
-                                               max_pagesz_mem);
-                               n_segs = cur_mem / hugepage_sz;
-
-                               if (alloc_memseg_list(msl, hugepage_sz, n_segs,
-                                               socket_id, type_msl_idx)) {
-                                       /* failing to allocate a memseg list is
-                                        * a serious error.
-                                        */
-                                       RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n");
-                                       return -1;
-                               }
-
-                               if (alloc_va_space(msl)) {
-                                       /* if we couldn't allocate VA space, we
-                                        * can try with smaller page sizes.
-                                        */
-                                       RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list, retrying with different page size\n");
-                                       /* deallocate memseg list */
-                                       if (free_memseg_list(msl))
-                                               return -1;
-                                       break;
-                               }
-
-                               total_segs += msl->memseg_arr.len;
-                               cur_pagesz_mem = total_segs * hugepage_sz;
-                               type_msl_idx++;
-                               msl_idx++;
-                       }
-                       cur_socket_mem += cur_pagesz_mem;
-               }
-               if (cur_socket_mem == 0) {
-                       RTE_LOG(ERR, EAL, "Cannot allocate VA space on socket %u\n",
-                               socket_id);
-                       return -1;
-               }
-       }
-
-       return 0;
-}
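
The 32-bit heuristic above reduces to simple budgeting: take the fixed VA budget (RTE_MAX_MEM_MB), subtract whatever was explicitly requested with --socket-mem, and hand each active socket an equal share of the remainder on top of its request. A tiny worked sketch of that arithmetic with made-up numbers (a 2 GB budget and 512 MB requested on each of two sockets):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* hypothetical figures, not DPDK build-time values */
            uint64_t max_mem = 2ULL << 30;                      /* total VA budget */
            uint64_t requested[2] = { 512ULL << 20, 512ULL << 20 };
            unsigned int active = 2, s;

            uint64_t total_req = requested[0] + requested[1];
            uint64_t extra_per_socket = (max_mem - total_req) / active;

            /* each socket may preallocate its request plus an equal share of the rest */
            for (s = 0; s < active; s++)
                    printf("socket %u: up to %" PRIu64 " MB\n", s,
                           (requested[s] + extra_per_socket) >> 20);
            return 0;
    }

With these numbers each socket ends up with a 1024 MB ceiling, which is the max_socket_mem value the loop above would then try to cover starting with the largest page size.
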
-
-static int __rte_unused
-memseg_primary_init(void)
-{
-       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
-       struct memtype {
-               uint64_t page_sz;
-               int socket_id;
-       } *memtypes = NULL;
-       int i, hpi_idx, msl_idx, ret = -1; /* fail unless told to succeed */
-       struct rte_memseg_list *msl;
-       uint64_t max_mem, max_mem_per_type;
-       unsigned int max_seglists_per_type;
-       unsigned int n_memtypes, cur_type;
-
-       /* no-huge does not need this at all */
-       if (internal_config.no_hugetlbfs)
-               return 0;
-
-       /*
-        * figuring out amount of memory we're going to have is a long and very
-        * involved process. the basic element we're operating with is a memory
-        * type, defined as a combination of NUMA node ID and page size (so that
-        * e.g. 2 sockets with 2 page sizes yield 4 memory types in total).
-        *
-        * deciding amount of memory going towards each memory type is a
-        * balancing act between maximum segments per type, maximum memory per
-        * type, and number of detected NUMA nodes. the goal is to make sure
-        * each memory type gets at least one memseg list.
-        *
-        * the total amount of memory is limited by RTE_MAX_MEM_MB value.
-        *
-        * the total amount of memory per type is limited by either
-        * RTE_MAX_MEM_MB_PER_TYPE, or by RTE_MAX_MEM_MB divided by the number
-        * of detected NUMA nodes. additionally, maximum number of segments per
-        * type is also limited by RTE_MAX_MEMSEG_PER_TYPE. this is because for
-        * smaller page sizes, it can take hundreds of thousands of segments to
-        * reach the above specified per-type memory limits.
-        *
-        * additionally, each type may have multiple memseg lists associated
-        * with it, each limited by either RTE_MAX_MEM_MB_PER_LIST for bigger
-        * page sizes, or RTE_MAX_MEMSEG_PER_LIST segments for smaller ones.
-        *
-        * the number of memseg lists per type is decided based on the above
-        * limits, and also taking number of detected NUMA nodes, to make sure
-        * that we don't run out of memseg lists before we populate all NUMA
-        * nodes with memory.
-        *
-        * we do this in three stages. first, we collect the number of types.
-        * then, we figure out memory constraints and populate the list of
-        * would-be memseg lists. then, we go ahead and allocate the memseg
-        * lists.
-        */
-
-       /* create space for mem types */
-       n_memtypes = internal_config.num_hugepage_sizes * rte_socket_count();
-       memtypes = calloc(n_memtypes, sizeof(*memtypes));
-       if (memtypes == NULL) {
-               RTE_LOG(ERR, EAL, "Cannot allocate space for memory types\n");
-               return -1;
-       }
-
-       /* populate mem types */
-       cur_type = 0;
-       for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes;
-                       hpi_idx++) {
-               struct hugepage_info *hpi;
-               uint64_t hugepage_sz;
-
-               hpi = &internal_config.hugepage_info[hpi_idx];
-               hugepage_sz = hpi->hugepage_sz;
-
-               for (i = 0; i < (int) rte_socket_count(); i++, cur_type++) {
-                       int socket_id = rte_socket_id_by_idx(i);
-
-#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
-                       if (socket_id > 0)
-                               break;
-#endif
-                       memtypes[cur_type].page_sz = hugepage_sz;
-                       memtypes[cur_type].socket_id = socket_id;
-
-                       RTE_LOG(DEBUG, EAL, "Detected memory type: "
-                               "socket_id:%u hugepage_sz:%" PRIu64 "\n",
-                               socket_id, hugepage_sz);
-               }
-       }
-       /* number of memtypes could have been lower due to no NUMA support */
-       n_memtypes = cur_type;
-
-       /* set up limits for types */
-       max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
-       max_mem_per_type = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20,
-                       max_mem / n_memtypes);
-       /*
-        * limit maximum number of segment lists per type to ensure there's
-        * space for memseg lists for all NUMA nodes with all page sizes
-        */
-       max_seglists_per_type = RTE_MAX_MEMSEG_LISTS / n_memtypes;
-
-       if (max_seglists_per_type == 0) {
-               RTE_LOG(ERR, EAL, "Cannot accommodate all memory types, please increase %s\n",
-                       RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
-               goto out;
-       }
-
-       /* go through all mem types and create segment lists */
-       msl_idx = 0;
-       for (cur_type = 0; cur_type < n_memtypes; cur_type++) {
-               unsigned int cur_seglist, n_seglists, n_segs;
-               unsigned int max_segs_per_type, max_segs_per_list;
-               struct memtype *type = &memtypes[cur_type];
-               uint64_t max_mem_per_list, pagesz;
-               int socket_id;
-
-               pagesz = type->page_sz;
-               socket_id = type->socket_id;
-
-               /*
-                * we need to create segment lists for this type. we must take
-                * into account the following things:
-                *
-                * 1. total amount of memory we can use for this memory type
-                * 2. total amount of memory per memseg list allowed
-                * 3. number of segments needed to fit the amount of memory
-                * 4. number of segments allowed per type
-                * 5. number of segments allowed per memseg list
-                * 6. number of memseg lists we are allowed to take up
-                */
-
-               /* calculate how much segments we will need in total */
-               max_segs_per_type = max_mem_per_type / pagesz;
-               /* limit number of segments to maximum allowed per type */
-               max_segs_per_type = RTE_MIN(max_segs_per_type,
-                               (unsigned int)RTE_MAX_MEMSEG_PER_TYPE);
-               /* limit number of segments to maximum allowed per list */
-               max_segs_per_list = RTE_MIN(max_segs_per_type,
-                               (unsigned int)RTE_MAX_MEMSEG_PER_LIST);
-
-               /* calculate how much memory we can have per segment list */
-               max_mem_per_list = RTE_MIN(max_segs_per_list * pagesz,
-                               (uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20);
-
-               /* calculate how many segments each segment list will have */
-               n_segs = RTE_MIN(max_segs_per_list, max_mem_per_list / pagesz);
-
-               /* calculate how many segment lists we can have */
-               n_seglists = RTE_MIN(max_segs_per_type / n_segs,
-                               max_mem_per_type / max_mem_per_list);
-
-               /* limit number of segment lists according to our maximum */
-               n_seglists = RTE_MIN(n_seglists, max_seglists_per_type);
-
-               RTE_LOG(DEBUG, EAL, "Creating %i segment lists: "
-                               "n_segs:%i socket_id:%i hugepage_sz:%" PRIu64 "\n",
-                       n_seglists, n_segs, socket_id, pagesz);
-
-               /* create all segment lists */
-               for (cur_seglist = 0; cur_seglist < n_seglists; cur_seglist++) {
-                       if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
-                               RTE_LOG(ERR, EAL,
-                                       "No more space in memseg lists, please increase %s\n",
-                                       RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
-                               goto out;
-                       }
-                       msl = &mcfg->memsegs[msl_idx++];
-
-                       if (alloc_memseg_list(msl, pagesz, n_segs,
-                                       socket_id, cur_seglist))
-                               goto out;
-
-                       if (alloc_va_space(msl)) {
-                               RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
-                               goto out;
-                       }
-               }
-       }
-       /* we're successful */
-       ret = 0;
-out:
-       free(memtypes);
-       return ret;
-}
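
The sizing in memseg_primary_init() is a chain of min() operations over the build-time limits. A sketch with hypothetical stand-ins for the RTE_MAX_* constants (2 MB pages, 32 GB per type and per list, 32768 segments per type, 8192 per list) shows how n_segs and n_seglists fall out for one memory type:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void)
    {
            /* hypothetical stand-ins for the RTE_MAX_* build-time constants */
            uint64_t pagesz = 2ULL << 20;             /* 2 MB pages */
            uint64_t max_mem_per_type = 32ULL << 30;
            uint64_t max_mem_per_list = 32ULL << 30;
            uint64_t max_segs_per_type = 32768;
            uint64_t max_segs_per_list = 8192;

            uint64_t segs_type = MIN(max_mem_per_type / pagesz, max_segs_per_type);
            uint64_t segs_list = MIN(segs_type, max_segs_per_list);
            uint64_t mem_list = MIN(segs_list * pagesz, max_mem_per_list);
            uint64_t n_segs = MIN(segs_list, mem_list / pagesz);
            uint64_t n_seglists = MIN(segs_type / n_segs, max_mem_per_type / mem_list);

            printf("%" PRIu64 " lists of %" PRIu64 " segments each\n",
                   n_seglists, n_segs);
            return 0;
    }

With these figures the type gets 2 memseg lists of 8192 segments each; in the code above, n_seglists is further clamped by max_seglists_per_type so that every NUMA node and page size combination can still claim at least one list.
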
-
-static int
-memseg_secondary_init(void)
-{
-       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
-       int msl_idx = 0;
-       struct rte_memseg_list *msl;
-
-       for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
-
-               msl = &mcfg->memsegs[msl_idx];
-
-               /* skip empty memseg lists */
-               if (msl->memseg_arr.len == 0)
-                       continue;
-
-               if (rte_fbarray_attach(&msl->memseg_arr)) {
-                       RTE_LOG(ERR, EAL, "Cannot attach to primary process memseg lists\n");
-                       return -1;
-               }
-
-               /* preallocate VA space */
-               if (alloc_va_space(msl)) {
-                       RTE_LOG(ERR, EAL, "Cannot preallocate VA space for hugepage memory\n");
-                       return -1;
-               }
-       }
-
-       return 0;
-}
-
-int
-rte_eal_memseg_init(void)
-{
-       /* increase rlimit to maximum */
-       struct rlimit lim;
-
-       if (getrlimit(RLIMIT_NOFILE, &lim) == 0) {
-               /* set limit to maximum */
-               lim.rlim_cur = lim.rlim_max;
-
-               if (setrlimit(RLIMIT_NOFILE, &lim) < 0) {
-                       RTE_LOG(DEBUG, EAL, "Setting maximum number of open files failed: %s\n",
-                                       strerror(errno));
-               } else {
-                       RTE_LOG(DEBUG, EAL, "Setting maximum number of open files to %"
-                                       PRIu64 "\n",
-                                       (uint64_t)lim.rlim_cur);
-               }
-       } else {
-               RTE_LOG(ERR, EAL, "Cannot get current resource limits\n");
-       }
-
-       return rte_eal_process_type() == RTE_PROC_PRIMARY ?
-#ifndef RTE_ARCH_64
-                       memseg_primary_init_32() :
-#else
-                       memseg_primary_init() :
-#endif
-                       memseg_secondary_init();
-}
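
Raising RLIMIT_NOFILE before memory init matters because, in the non-legacy mode, every memory segment can be backed by its own hugepage file descriptor, so a large heap quickly runs into the default soft limit (often 1024). The same POSIX dance in isolation, as a sketch rather than the EAL code:

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/resource.h>

    /* bump the soft open-file limit to the hard maximum allowed for this process */
    static void
    raise_nofile_limit(void)
    {
            struct rlimit lim;

            if (getrlimit(RLIMIT_NOFILE, &lim) != 0) {
                    fprintf(stderr, "getrlimit: %s\n", strerror(errno));
                    return;
            }
            lim.rlim_cur = lim.rlim_max;
            if (setrlimit(RLIMIT_NOFILE, &lim) != 0)
                    fprintf(stderr, "setrlimit: %s\n", strerror(errno));
    }
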
diff --git a/lib/librte_eal/linuxapp/eal/eal_thread.c b/lib/librte_eal/linuxapp/eal/eal_thread.c
deleted file mode 100644 (file)
index 379773b..0000000
+++ /dev/null
@@ -1,188 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
- */
-
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <unistd.h>
-#include <pthread.h>
-#include <sched.h>
-#include <sys/queue.h>
-#include <sys/syscall.h>
-
-#include <rte_debug.h>
-#include <rte_atomic.h>
-#include <rte_launch.h>
-#include <rte_log.h>
-#include <rte_memory.h>
-#include <rte_per_lcore.h>
-#include <rte_eal.h>
-#include <rte_lcore.h>
-
-#include "eal_private.h"
-#include "eal_thread.h"
-
-RTE_DEFINE_PER_LCORE(unsigned, _lcore_id) = LCORE_ID_ANY;
-RTE_DEFINE_PER_LCORE(unsigned, _socket_id) = (unsigned)SOCKET_ID_ANY;
-RTE_DEFINE_PER_LCORE(rte_cpuset_t, _cpuset);
-
-/*
- * Send a message to a slave lcore identified by slave_id to call a
- * function f with argument arg. Once the execution is done, the
- * remote lcore switches to the FINISHED state.
- */
-int
-rte_eal_remote_launch(int (*f)(void *), void *arg, unsigned slave_id)
-{
-       int n;
-       char c = 0;
-       int m2s = lcore_config[slave_id].pipe_master2slave[1];
-       int s2m = lcore_config[slave_id].pipe_slave2master[0];
-
-       if (lcore_config[slave_id].state != WAIT)
-               return -EBUSY;
-
-       lcore_config[slave_id].f = f;
-       lcore_config[slave_id].arg = arg;
-
-       /* send message */
-       n = 0;
-       while (n == 0 || (n < 0 && errno == EINTR))
-               n = write(m2s, &c, 1);
-       if (n < 0)
-               rte_panic("cannot write on configuration pipe\n");
-
-       /* wait ack */
-       do {
-               n = read(s2m, &c, 1);
-       } while (n < 0 && errno == EINTR);
-
-       if (n <= 0)
-               rte_panic("cannot read on configuration pipe\n");
-
-       return 0;
-}
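
The launch request itself is nothing more than a one-byte command written on the master-to-slave pipe and a one-byte acknowledgement read back on the slave-to-master pipe, with EINTR retried on both sides. A self-contained sketch of that EINTR-safe exchange over plain file descriptors (placeholder fds, not lcore_config):

    #include <errno.h>
    #include <unistd.h>

    /* write exactly one byte, retrying if interrupted by a signal */
    static int
    write_one(int fd, char c)
    {
            ssize_t n = 0;

            while (n == 0 || (n < 0 && errno == EINTR))
                    n = write(fd, &c, 1);
            return n == 1 ? 0 : -1;
    }

    /* read exactly one byte, retrying if interrupted by a signal */
    static int
    read_one(int fd, char *c)
    {
            ssize_t n;

            do {
                    n = read(fd, c, 1);
            } while (n < 0 && errno == EINTR);
            return n == 1 ? 0 : -1;
    }

    /* the command/ack round trip performed above: m2s carries the command,
     * s2m carries the acknowledgement once the slave has woken up.
     */
    static int
    handshake(int m2s_wr, int s2m_rd)
    {
            char ack;

            if (write_one(m2s_wr, 0) < 0)
                    return -1;
            return read_one(s2m_rd, &ack);
    }
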
-
-/* set affinity for current EAL thread */
-static int
-eal_thread_set_affinity(void)
-{
-       unsigned lcore_id = rte_lcore_id();
-
-       /* acquire system unique id  */
-       rte_gettid();
-
-       /* update EAL thread core affinity */
-       return rte_thread_set_affinity(&lcore_config[lcore_id].cpuset);
-}
-
-void eal_thread_init_master(unsigned lcore_id)
-{
-       /* set the lcore ID in per-lcore memory area */
-       RTE_PER_LCORE(_lcore_id) = lcore_id;
-
-       /* set CPU affinity */
-       if (eal_thread_set_affinity() < 0)
-               rte_panic("cannot set affinity\n");
-}
-
-/* main loop of threads */
-__attribute__((noreturn)) void *
-eal_thread_loop(__attribute__((unused)) void *arg)
-{
-       char c;
-       int n, ret;
-       unsigned lcore_id;
-       pthread_t thread_id;
-       int m2s, s2m;
-       char cpuset[RTE_CPU_AFFINITY_STR_LEN];
-
-       thread_id = pthread_self();
-
-       /* retrieve our lcore_id from the configuration structure */
-       RTE_LCORE_FOREACH_SLAVE(lcore_id) {
-               if (thread_id == lcore_config[lcore_id].thread_id)
-                       break;
-       }
-       if (lcore_id == RTE_MAX_LCORE)
-               rte_panic("cannot retrieve lcore id\n");
-
-       m2s = lcore_config[lcore_id].pipe_master2slave[0];
-       s2m = lcore_config[lcore_id].pipe_slave2master[1];
-
-       /* set the lcore ID in per-lcore memory area */
-       RTE_PER_LCORE(_lcore_id) = lcore_id;
-
-       /* set CPU affinity */
-       if (eal_thread_set_affinity() < 0)
-               rte_panic("cannot set affinity\n");
-
-       ret = eal_thread_dump_affinity(cpuset, sizeof(cpuset));
-
-       RTE_LOG(DEBUG, EAL, "lcore %u is ready (tid=%zx;cpuset=[%s%s])\n",
-               lcore_id, (uintptr_t)thread_id, cpuset, ret == 0 ? "" : "...");
-
-       /* read on our pipe to get commands */
-       while (1) {
-               void *fct_arg;
-
-               /* wait command */
-               do {
-                       n = read(m2s, &c, 1);
-               } while (n < 0 && errno == EINTR);
-
-               if (n <= 0)
-                       rte_panic("cannot read on configuration pipe\n");
-
-               lcore_config[lcore_id].state = RUNNING;
-
-               /* send ack */
-               n = 0;
-               while (n == 0 || (n < 0 && errno == EINTR))
-                       n = write(s2m, &c, 1);
-               if (n < 0)
-                       rte_panic("cannot write on configuration pipe\n");
-
-               if (lcore_config[lcore_id].f == NULL)
-                       rte_panic("NULL function pointer\n");
-
-               /* call the function and store the return value */
-               fct_arg = lcore_config[lcore_id].arg;
-               ret = lcore_config[lcore_id].f(fct_arg);
-               lcore_config[lcore_id].ret = ret;
-               rte_wmb();
-
-               /* when a service core returns, it should go directly to WAIT
-                * state, because the application will not lcore_wait() for it.
-                */
-               if (lcore_config[lcore_id].core_role == ROLE_SERVICE)
-                       lcore_config[lcore_id].state = WAIT;
-               else
-                       lcore_config[lcore_id].state = FINISHED;
-       }
-
-       /* never reached */
-       /* pthread_exit(NULL); */
-       /* return NULL; */
-}
-
-/* return the tid of the calling thread, obtained via the gettid() syscall */
-int rte_sys_gettid(void)
-{
-       return (int)syscall(SYS_gettid);
-}
-
-int rte_thread_setname(pthread_t id, const char *name)
-{
-       int ret = ENOSYS;
-#if defined(__GLIBC__) && defined(__GLIBC_PREREQ)
-#if __GLIBC_PREREQ(2, 12)
-       ret = pthread_setname_np(id, name);
-#endif
-#endif
-       RTE_SET_USED(id);
-       RTE_SET_USED(name);
-       return -ret;
-}
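
From an application's point of view, the loop above is driven entirely through rte_eal_remote_launch(). A hedged usage sketch; rte_eal_init() and rte_eal_wait_lcore() are not part of this patch, so their use here is an assumption about the usual EAL entry points:

    #include <stdio.h>

    #include <rte_eal.h>
    #include <rte_launch.h>
    #include <rte_lcore.h>

    static int
    worker(void *arg)
    {
            (void)arg;
            printf("hello from lcore %u\n", rte_lcore_id());
            return 0;
    }

    int
    main(int argc, char **argv)
    {
            unsigned int lcore_id;

            if (rte_eal_init(argc, argv) < 0)
                    return -1;

            /* hand the worker to every slave lcore; each one wakes up inside
             * eal_thread_loop(), runs worker() and moves to WAIT/FINISHED.
             */
            RTE_LCORE_FOREACH_SLAVE(lcore_id)
                    rte_eal_remote_launch(worker, NULL, lcore_id);

            /* collect the return values once the slaves are done */
            RTE_LCORE_FOREACH_SLAVE(lcore_id)
                    rte_eal_wait_lcore(lcore_id);

            return 0;
    }
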
diff --git a/lib/librte_eal/linuxapp/eal/eal_timer.c b/lib/librte_eal/linuxapp/eal/eal_timer.c
deleted file mode 100644 (file)
index bc8f051..0000000
+++ /dev/null
@@ -1,266 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation.
- * Copyright(c) 2012-2013 6WIND S.A.
- */
-
-#include <string.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <inttypes.h>
-#include <sys/mman.h>
-#include <sys/queue.h>
-#include <pthread.h>
-#include <errno.h>
-
-#include <rte_common.h>
-#include <rte_log.h>
-#include <rte_cycles.h>
-#include <rte_lcore.h>
-#include <rte_memory.h>
-#include <rte_eal.h>
-#include <rte_debug.h>
-
-#include "eal_private.h"
-#include "eal_internal_cfg.h"
-
-enum timer_source eal_timer_source = EAL_TIMER_HPET;
-
-#ifdef RTE_LIBEAL_USE_HPET
-
-#define DEV_HPET "/dev/hpet"
-
-/* Maximum number of counters. */
-#define HPET_TIMER_NUM 3
-
-/* General capabilities register */
-#define CLK_PERIOD_SHIFT     32 /* Clock period shift. */
-#define CLK_PERIOD_MASK      0xffffffff00000000ULL /* Clock period mask. */
-
-/**
- * HPET timer registers. From the Intel IA-PC HPET (High Precision Event
- * Timers) Specification.
- */
-struct eal_hpet_regs {
-       /* Memory-mapped, software visible registers */
-       uint64_t capabilities;      /**< RO General Capabilities Register. */
-       uint64_t reserved0;         /**< Reserved for future use. */
-       uint64_t config;            /**< RW General Configuration Register. */
-       uint64_t reserved1;         /**< Reserved for future use. */
-       uint64_t isr;               /**< RW Clear General Interrupt Status. */
-       uint64_t reserved2[25];     /**< Reserved for future use. */
-       union {
-               uint64_t counter;   /**< RW Main Counter Value Register. */
-               struct {
-                       uint32_t counter_l; /**< RW Main Counter Low. */
-                       uint32_t counter_h; /**< RW Main Counter High. */
-               };
-       };
-       uint64_t reserved3;         /**< Reserved for future use. */
-       struct {
-               uint64_t config;    /**< RW Timer Config and Capability Reg. */
-               uint64_t comp;      /**< RW Timer Comparator Value Register. */
-               uint64_t fsb;       /**< RW FSB Interrupt Route Register. */
-               uint64_t reserved4; /**< Reserved for future use. */
-       } timers[HPET_TIMER_NUM]; /**< Set of HPET timers. */
-};
-
-/* Mmap'd hpet registers */
-static volatile struct eal_hpet_regs *eal_hpet = NULL;
-
-/* Period at which the HPET counter increments in
- * femtoseconds (10^-15 seconds). */
-static uint32_t eal_hpet_resolution_fs = 0;
-
-/* Frequency of the HPET counter in Hz */
-static uint64_t eal_hpet_resolution_hz = 0;
-
-/* Incremented 4 times during one 32bits hpet full count */
-static uint32_t eal_hpet_msb;
-
-static pthread_t msb_inc_thread_id;
-
-/*
- * This function runs on a dedicated thread and updates the global variable
- * used to track the MSB of the HPET counter (unfortunately, we need this
- * because the hpet counter is only 32 bits by default under Linux).
- */
-static void *
-hpet_msb_inc(__attribute__((unused)) void *arg)
-{
-       uint32_t t;
-
-       while (1) {
-               t = (eal_hpet->counter_l >> 30);
-               if (t != (eal_hpet_msb & 3))
-                       eal_hpet_msb++;
-               sleep(10);
-       }
-       return NULL;
-}
-
-uint64_t
-rte_get_hpet_hz(void)
-{
-       if(internal_config.no_hpet)
-               rte_panic("Error, HPET called, but no HPET present\n");
-
-       return eal_hpet_resolution_hz;
-}
-
-uint64_t
-rte_get_hpet_cycles(void)
-{
-       uint32_t t, msb;
-       uint64_t ret;
-
-       if(internal_config.no_hpet)
-               rte_panic("Error, HPET called, but no HPET present\n");
-
-       t = eal_hpet->counter_l;
-       msb = eal_hpet_msb;
-       ret = (msb + 2 - (t >> 30)) / 4;
-       ret <<= 32;
-       ret += t;
-       return ret;
-}
-
-#endif
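
rte_get_hpet_cycles() widens the 32-bit HPET counter to 64 bits by combining it with the software-maintained eal_hpet_msb, which hpet_msb_inc() bumps four times per counter wrap; the "+ 2" bias keeps the result correct even if that thread is up to a quarter-wrap ahead of or behind the hardware. The same widening trick for any free-running 32-bit counter, as a standalone sketch with placeholder names:

    #include <stdint.h>

    /* software wrap tracker: bumped whenever the counter's top two bits are
     * seen to have moved past the last recorded position (4 bumps per wrap).
     */
    static volatile uint32_t soft_msb;

    /* initialise the tracker from the counter's current position, as
     * rte_eal_hpet_init() does above.
     */
    static void
    msb_init(uint32_t counter32)
    {
            soft_msb = counter32 >> 30;
    }

    /* called periodically, well within one quarter of the wrap period */
    static void
    msb_poll(uint32_t counter32)
    {
            if ((counter32 >> 30) != (soft_msb & 3))
                    soft_msb++;
    }

    /* widen the 32-bit counter to 64 bits, tolerating a slightly stale soft_msb */
    static uint64_t
    widen_counter(uint32_t counter32)
    {
            uint64_t wraps = (soft_msb + 2 - (counter32 >> 30)) / 4;

            return (wraps << 32) + counter32;
    }
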
-
-#ifdef RTE_LIBEAL_USE_HPET
-/*
- * Open and mmap /dev/hpet (high precision event timer) that will
- * provide our time reference.
- */
-int
-rte_eal_hpet_init(int make_default)
-{
-       int fd, ret;
-
-       if (internal_config.no_hpet) {
-               RTE_LOG(NOTICE, EAL, "HPET is disabled\n");
-               return -1;
-       }
-
-       fd = open(DEV_HPET, O_RDONLY);
-       if (fd < 0) {
-               RTE_LOG(ERR, EAL, "ERROR: Cannot open "DEV_HPET": %s!\n",
-                       strerror(errno));
-               internal_config.no_hpet = 1;
-               return -1;
-       }
-       eal_hpet = mmap(NULL, 1024, PROT_READ, MAP_SHARED, fd, 0);
-       if (eal_hpet == MAP_FAILED) {
-               RTE_LOG(ERR, EAL, "ERROR: Cannot mmap "DEV_HPET"!\n"
-                               "Please enable CONFIG_HPET_MMAP in your kernel configuration "
-                               "to allow HPET support.\n"
-                               "To run without using HPET, set CONFIG_RTE_LIBEAL_USE_HPET=n "
-                               "in your build configuration or use '--no-hpet' EAL flag.\n");
-               close(fd);
-               internal_config.no_hpet = 1;
-               return -1;
-       }
-       close(fd);
-
-       eal_hpet_resolution_fs = (uint32_t)((eal_hpet->capabilities &
-                                       CLK_PERIOD_MASK) >>
-                                       CLK_PERIOD_SHIFT);
-
-       eal_hpet_resolution_hz = (1000ULL*1000ULL*1000ULL*1000ULL*1000ULL) /
-               (uint64_t)eal_hpet_resolution_fs;
-
-       RTE_LOG(INFO, EAL, "HPET frequency is ~%"PRIu64" kHz\n",
-                       eal_hpet_resolution_hz/1000);
-
-       eal_hpet_msb = (eal_hpet->counter_l >> 30);
-
-       /* create a thread that will increment a global variable for
-        * msb (hpet is 32 bits by default under linux) */
-       ret = rte_ctrl_thread_create(&msb_inc_thread_id, "hpet-msb-inc", NULL,
-                                    hpet_msb_inc, NULL);
-       if (ret != 0) {
-               RTE_LOG(ERR, EAL, "ERROR: Cannot create HPET timer thread!\n");
-               internal_config.no_hpet = 1;
-               return -1;
-       }
-
-       if (make_default)
-               eal_timer_source = EAL_TIMER_HPET;
-       return 0;
-}
-#endif
-
-static void
-check_tsc_flags(void)
-{
-       char line[512];
-       FILE *stream;
-
-       stream = fopen("/proc/cpuinfo", "r");
-       if (!stream) {
-               RTE_LOG(WARNING, EAL, "WARNING: Unable to open /proc/cpuinfo\n");
-               return;
-       }
-
-       while (fgets(line, sizeof line, stream)) {
-               char *constant_tsc;
-               char *nonstop_tsc;
-
-               if (strncmp(line, "flags", 5) != 0)
-                       continue;
-
-               constant_tsc = strstr(line, "constant_tsc");
-               nonstop_tsc = strstr(line, "nonstop_tsc");
-               if (!constant_tsc || !nonstop_tsc)
-                       RTE_LOG(WARNING, EAL,
-                               "WARNING: cpu flags "
-                               "constant_tsc=%s "
-                               "nonstop_tsc=%s "
-                               "-> using unreliable clock cycles !\n",
-                               constant_tsc ? "yes":"no",
-                               nonstop_tsc ? "yes":"no");
-               break;
-       }
-
-       fclose(stream);
-}
-
-uint64_t
-get_tsc_freq(void)
-{
-#ifdef CLOCK_MONOTONIC_RAW
-#define NS_PER_SEC 1E9
-
-       struct timespec sleeptime = {.tv_nsec = NS_PER_SEC / 10 }; /* 1/10 second */
-
-       struct timespec t_start, t_end;
-       uint64_t tsc_hz;
-
-       if (clock_gettime(CLOCK_MONOTONIC_RAW, &t_start) == 0) {
-               uint64_t ns, end, start = rte_rdtsc();
-               nanosleep(&sleeptime, NULL);
-               clock_gettime(CLOCK_MONOTONIC_RAW, &t_end);
-               end = rte_rdtsc();
-               ns = ((t_end.tv_sec - t_start.tv_sec) * NS_PER_SEC);
-               ns += (t_end.tv_nsec - t_start.tv_nsec);
-
-               double secs = (double)ns/NS_PER_SEC;
-               tsc_hz = (uint64_t)((end - start)/secs);
-               return tsc_hz;
-       }
-#endif
-       return 0;
-}
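
Once get_tsc_freq() has produced a cycles-per-second figure, converting a measured cycle delta back into wall-clock time is plain arithmetic. A small usage sketch; rte_rdtsc() is used exactly as above, and the frequency is assumed to come from the calibration in this file:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #include <rte_cycles.h>

    /* time a short region with the TSC and report it in nanoseconds */
    static void
    time_region(uint64_t tsc_hz)
    {
            uint64_t start, cycles, ns;

            start = rte_rdtsc();
            /* ... work being measured ... */
            cycles = rte_rdtsc() - start;

            /* fine for short regions; very long spans should divide first
             * to avoid overflowing the 64-bit intermediate product.
             */
            ns = cycles * 1000000000ULL / tsc_hz;
            printf("region took %" PRIu64 " ns\n", ns);
    }
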
-
-int
-rte_eal_timer_init(void)
-{
-
-       eal_timer_source = EAL_TIMER_TSC;
-
-       set_tsc_freq();
-       check_tsc_flags();
-       return 0;
-}
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
deleted file mode 100644 (file)
index c821e83..0000000
+++ /dev/null
@@ -1,2049 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2018 Intel Corporation
- */
-
-#include <inttypes.h>
-#include <string.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <sys/ioctl.h>
-
-#include <rte_errno.h>
-#include <rte_log.h>
-#include <rte_memory.h>
-#include <rte_eal_memconfig.h>
-#include <rte_vfio.h>
-
-#include "eal_filesystem.h"
-#include "eal_vfio.h"
-#include "eal_private.h"
-
-#ifdef VFIO_PRESENT
-
-#define VFIO_MEM_EVENT_CLB_NAME "vfio_mem_event_clb"
-
-/* hot plug/unplug of VFIO groups may cause all DMA maps to be dropped. we can
- * recreate the mappings for DPDK segments, but we cannot do so for memory that
- * was registered by the user themselves, so we need to store the user mappings
- * somewhere, to recreate them later.
- */
-#define VFIO_MAX_USER_MEM_MAPS 256
-struct user_mem_map {
-       uint64_t addr;
-       uint64_t iova;
-       uint64_t len;
-};
-
-struct user_mem_maps {
-       rte_spinlock_recursive_t lock;
-       int n_maps;
-       struct user_mem_map maps[VFIO_MAX_USER_MEM_MAPS];
-};
-
-struct vfio_config {
-       int vfio_enabled;
-       int vfio_container_fd;
-       int vfio_active_groups;
-       const struct vfio_iommu_type *vfio_iommu_type;
-       struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
-       struct user_mem_maps mem_maps;
-};
-
-/* per-process VFIO config */
-static struct vfio_config vfio_cfgs[VFIO_MAX_CONTAINERS];
-static struct vfio_config *default_vfio_cfg = &vfio_cfgs[0];
-
-static int vfio_type1_dma_map(int);
-static int vfio_type1_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
-static int vfio_spapr_dma_map(int);
-static int vfio_spapr_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
-static int vfio_noiommu_dma_map(int);
-static int vfio_noiommu_dma_mem_map(int, uint64_t, uint64_t, uint64_t, int);
-static int vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr,
-               uint64_t iova, uint64_t len, int do_map);
-
-/* IOMMU types we support */
-static const struct vfio_iommu_type iommu_types[] = {
-       /* x86 IOMMU, otherwise known as type 1 */
-       {
-               .type_id = RTE_VFIO_TYPE1,
-               .name = "Type 1",
-               .dma_map_func = &vfio_type1_dma_map,
-               .dma_user_map_func = &vfio_type1_dma_mem_map
-       },
-       /* ppc64 IOMMU, otherwise known as spapr */
-       {
-               .type_id = RTE_VFIO_SPAPR,
-               .name = "sPAPR",
-               .dma_map_func = &vfio_spapr_dma_map,
-               .dma_user_map_func = &vfio_spapr_dma_mem_map
-       },
-       /* IOMMU-less mode */
-       {
-               .type_id = RTE_VFIO_NOIOMMU,
-               .name = "No-IOMMU",
-               .dma_map_func = &vfio_noiommu_dma_map,
-               .dma_user_map_func = &vfio_noiommu_dma_mem_map
-       },
-};
-
-static int
-is_null_map(const struct user_mem_map *map)
-{
-       return map->addr == 0 && map->iova == 0 && map->len == 0;
-}
-
-/* we may need to merge user mem maps together in case of user mapping/unmapping
- * chunks of memory, so we'll need a comparator function to sort segments.
- */
-static int
-user_mem_map_cmp(const void *a, const void *b)
-{
-       const struct user_mem_map *umm_a = a;
-       const struct user_mem_map *umm_b = b;
-
-       /* move null entries to end */
-       if (is_null_map(umm_a))
-               return 1;
-       if (is_null_map(umm_b))
-               return -1;
-
-       /* sort by iova first */
-       if (umm_a->iova < umm_b->iova)
-               return -1;
-       if (umm_a->iova > umm_b->iova)
-               return 1;
-
-       if (umm_a->addr < umm_b->addr)
-               return -1;
-       if (umm_a->addr > umm_b->addr)
-               return 1;
-
-       if (umm_a->len < umm_b->len)
-               return -1;
-       if (umm_a->len > umm_b->len)
-               return 1;
-
-       return 0;
-}
-
-/* adjust user map entry. this may result in shortening of existing map, or in
- * splitting existing map in two pieces.
- */
-static void
-adjust_map(struct user_mem_map *src, struct user_mem_map *end,
-               uint64_t remove_va_start, uint64_t remove_len)
-{
-       /* if va start is same as start address, we're simply moving start */
-       if (remove_va_start == src->addr) {
-               src->addr += remove_len;
-               src->iova += remove_len;
-               src->len -= remove_len;
-       } else if (remove_va_start + remove_len == src->addr + src->len) {
-               /* we're shrinking mapping from the end */
-               src->len -= remove_len;
-       } else {
-               /* we're blowing a hole in the middle */
-               struct user_mem_map tmp;
-               uint64_t total_len = src->len;
-
-               /* adjust source segment length */
-               src->len = remove_va_start - src->addr;
-
-               /* create temporary segment in the middle */
-               tmp.addr = src->addr + src->len;
-               tmp.iova = src->iova + src->len;
-               tmp.len = remove_len;
-
-               /* populate end segment - this one we will be keeping */
-               end->addr = tmp.addr + tmp.len;
-               end->iova = tmp.iova + tmp.len;
-               end->len = total_len - src->len - tmp.len;
-       }
-}
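
As a concrete illustration of the third branch (the addresses here are made up for the example): removing VA 0x2000-0x3000 from a map covering VA 0x1000-0x4000 with IOVA 0x10000-0x13000 shrinks src to VA 0x1000-0x2000 / IOVA 0x10000-0x11000 and populates end with VA 0x3000-0x4000 / IOVA 0x12000-0x13000, so the caller is left with two maps and a hole where the unmapped chunk used to be.
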
-
-/* try merging two maps into one, return 1 if succeeded */
-static int
-merge_map(struct user_mem_map *left, struct user_mem_map *right)
-{
-       if (left->addr + left->len != right->addr)
-               return 0;
-       if (left->iova + left->len != right->iova)
-               return 0;
-
-       left->len += right->len;
-
-       memset(right, 0, sizeof(*right));
-
-       return 1;
-}
-
-static struct user_mem_map *
-find_user_mem_map(struct user_mem_maps *user_mem_maps, uint64_t addr,
-               uint64_t iova, uint64_t len)
-{
-       uint64_t va_end = addr + len;
-       uint64_t iova_end = iova + len;
-       int i;
-
-       for (i = 0; i < user_mem_maps->n_maps; i++) {
-               struct user_mem_map *map = &user_mem_maps->maps[i];
-               uint64_t map_va_end = map->addr + map->len;
-               uint64_t map_iova_end = map->iova + map->len;
-
-               /* check start VA */
-               if (addr < map->addr || addr >= map_va_end)
-                       continue;
-               /* check if VA end is within boundaries */
-               if (va_end <= map->addr || va_end > map_va_end)
-                       continue;
-
-               /* check start IOVA */
-               if (iova < map->iova || iova >= map_iova_end)
-                       continue;
-               /* check if IOVA end is within boundaries */
-               if (iova_end <= map->iova || iova_end > map_iova_end)
-                       continue;
-
-               /* we've found our map */
-               return map;
-       }
-       return NULL;
-}
-
-/* this will sort all user maps, and merge/compact any adjacent maps */
-static void
-compact_user_maps(struct user_mem_maps *user_mem_maps)
-{
-       int i, n_merged, cur_idx;
-
-       qsort(user_mem_maps->maps, user_mem_maps->n_maps,
-                       sizeof(user_mem_maps->maps[0]), user_mem_map_cmp);
-
-       /* we'll go over the list backwards when merging */
-       n_merged = 0;
-       for (i = user_mem_maps->n_maps - 2; i >= 0; i--) {
-               struct user_mem_map *l, *r;
-
-               l = &user_mem_maps->maps[i];
-               r = &user_mem_maps->maps[i + 1];
-
-               if (is_null_map(l) || is_null_map(r))
-                       continue;
-
-               if (merge_map(l, r))
-                       n_merged++;
-       }
-
-       /* the entries are still sorted, but now they have holes in them, so
-        * walk through the list and remove the holes
-        */
-       if (n_merged > 0) {
-               cur_idx = 0;
-               for (i = 0; i < user_mem_maps->n_maps; i++) {
-                       if (!is_null_map(&user_mem_maps->maps[i])) {
-                               struct user_mem_map *src, *dst;
-
-                               src = &user_mem_maps->maps[i];
-                               dst = &user_mem_maps->maps[cur_idx++];
-
-                               if (src != dst) {
-                                       memcpy(dst, src, sizeof(*src));
-                                       memset(src, 0, sizeof(*src));
-                               }
-                       }
-               }
-               user_mem_maps->n_maps = cur_idx;
-       }
-}
-
-static int
-vfio_open_group_fd(int iommu_group_num)
-{
-       int vfio_group_fd;
-       char filename[PATH_MAX];
-       struct rte_mp_msg mp_req, *mp_rep;
-       struct rte_mp_reply mp_reply;
-       struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
-       struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
-
-       /* if primary, try to open the group */
-       if (internal_config.process_type == RTE_PROC_PRIMARY) {
-               /* try regular group format */
-               snprintf(filename, sizeof(filename),
-                                VFIO_GROUP_FMT, iommu_group_num);
-               vfio_group_fd = open(filename, O_RDWR);
-               if (vfio_group_fd < 0) {
-                       /* if file not found, it's not an error */
-                       if (errno != ENOENT) {
-                               RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename,
-                                               strerror(errno));
-                               return -1;
-                       }
-
-                       /* special case: try no-IOMMU path as well */
-                       snprintf(filename, sizeof(filename),
-                                       VFIO_NOIOMMU_GROUP_FMT,
-                                       iommu_group_num);
-                       vfio_group_fd = open(filename, O_RDWR);
-                       if (vfio_group_fd < 0) {
-                               if (errno != ENOENT) {
-                                       RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", filename,
-                                                       strerror(errno));
-                                       return -1;
-                               }
-                               return 0;
-                       }
-                       /* noiommu group found */
-               }
-
-               return vfio_group_fd;
-       }
-       /* if we're in a secondary process, request group fd from the primary
-        * process via mp channel.
-        */
-       p->req = SOCKET_REQ_GROUP;
-       p->group_num = iommu_group_num;
-       strcpy(mp_req.name, EAL_VFIO_MP);
-       mp_req.len_param = sizeof(*p);
-       mp_req.num_fds = 0;
-
-       vfio_group_fd = -1;
-       if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
-           mp_reply.nb_received == 1) {
-               mp_rep = &mp_reply.msgs[0];
-               p = (struct vfio_mp_param *)mp_rep->param;
-               if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
-                       vfio_group_fd = mp_rep->fds[0];
-               } else if (p->result == SOCKET_NO_FD) {
-                       RTE_LOG(ERR, EAL, "  bad VFIO group fd\n");
-                       vfio_group_fd = 0;
-               }
-               free(mp_reply.msgs);
-       }
-
-       if (vfio_group_fd < 0)
-               RTE_LOG(ERR, EAL, "  cannot request group fd\n");
-       return vfio_group_fd;
-}
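
In the secondary-process branch the group descriptor itself travels back from the primary over the EAL multiprocess channel (mp_rep->fds[0] above); on Linux a file descriptor is normally passed between processes as SCM_RIGHTS ancillary data on a Unix-domain socket, which is what that channel relies on underneath. A standalone sketch of that primitive on the sending side, not the rte_mp implementation:

    #include <string.h>
    #include <sys/socket.h>
    #include <sys/uio.h>

    /* send file descriptor "fd" over the connected AF_UNIX socket "sock" */
    static int
    send_fd(int sock, int fd)
    {
            char byte = 0;
            struct iovec iov = { .iov_base = &byte, .iov_len = 1 };
            union {
                    struct cmsghdr hdr;
                    char buf[CMSG_SPACE(sizeof(int))];
            } ctrl;
            struct msghdr msg;
            struct cmsghdr *cmsg;

            memset(&msg, 0, sizeof(msg));
            memset(&ctrl, 0, sizeof(ctrl));
            msg.msg_iov = &iov;
            msg.msg_iovlen = 1;
            msg.msg_control = ctrl.buf;
            msg.msg_controllen = sizeof(ctrl.buf);

            cmsg = CMSG_FIRSTHDR(&msg);
            cmsg->cmsg_level = SOL_SOCKET;
            cmsg->cmsg_type = SCM_RIGHTS;
            cmsg->cmsg_len = CMSG_LEN(sizeof(int));
            memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));

            /* the one data byte is there because ancillary-only messages may
             * be refused; the receiver uses recvmsg() and CMSG_DATA() to pull
             * the duplicated descriptor out.
             */
            return sendmsg(sock, &msg, 0) == 1 ? 0 : -1;
    }
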
-
-static struct vfio_config *
-get_vfio_cfg_by_group_num(int iommu_group_num)
-{
-       struct vfio_config *vfio_cfg;
-       int i, j;
-
-       for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
-               vfio_cfg = &vfio_cfgs[i];
-               for (j = 0; j < VFIO_MAX_GROUPS; j++) {
-                       if (vfio_cfg->vfio_groups[j].group_num ==
-                                       iommu_group_num)
-                               return vfio_cfg;
-               }
-       }
-
-       return NULL;
-}
-
-static int
-vfio_get_group_fd(struct vfio_config *vfio_cfg,
-               int iommu_group_num)
-{
-       int i;
-       int vfio_group_fd;
-       struct vfio_group *cur_grp;
-
-       /* check if we already have the group descriptor open */
-       for (i = 0; i < VFIO_MAX_GROUPS; i++)
-               if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num)
-                       return vfio_cfg->vfio_groups[i].fd;
-
-       /* Let's first see if there is room for a new group */
-       if (vfio_cfg->vfio_active_groups == VFIO_MAX_GROUPS) {
-               RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
-               return -1;
-       }
-
-       /* Now let's get an index for the new group */
-       for (i = 0; i < VFIO_MAX_GROUPS; i++)
-               if (vfio_cfg->vfio_groups[i].group_num == -1) {
-                       cur_grp = &vfio_cfg->vfio_groups[i];
-                       break;
-               }
-
-       /* This should not happen */
-       if (i == VFIO_MAX_GROUPS) {
-               RTE_LOG(ERR, EAL, "No VFIO group free slot found\n");
-               return -1;
-       }
-
-       vfio_group_fd = vfio_open_group_fd(iommu_group_num);
-       if (vfio_group_fd < 0) {
-               RTE_LOG(ERR, EAL, "Failed to open group %d\n", iommu_group_num);
-               return -1;
-       }
-
-       cur_grp->group_num = iommu_group_num;
-       cur_grp->fd = vfio_group_fd;
-       vfio_cfg->vfio_active_groups++;
-
-       return vfio_group_fd;
-}
-
-static struct vfio_config *
-get_vfio_cfg_by_group_fd(int vfio_group_fd)
-{
-       struct vfio_config *vfio_cfg;
-       int i, j;
-
-       for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
-               vfio_cfg = &vfio_cfgs[i];
-               for (j = 0; j < VFIO_MAX_GROUPS; j++)
-                       if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
-                               return vfio_cfg;
-       }
-
-       return NULL;
-}
-
-static struct vfio_config *
-get_vfio_cfg_by_container_fd(int container_fd)
-{
-       int i;
-
-       for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
-               if (vfio_cfgs[i].vfio_container_fd == container_fd)
-                       return &vfio_cfgs[i];
-       }
-
-       return NULL;
-}
-
-int
-rte_vfio_get_group_fd(int iommu_group_num)
-{
-       struct vfio_config *vfio_cfg;
-
-       /* get the vfio_config it belongs to */
-       vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
-       vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
-
-       return vfio_get_group_fd(vfio_cfg, iommu_group_num);
-}
-
-static int
-get_vfio_group_idx(int vfio_group_fd)
-{
-       struct vfio_config *vfio_cfg;
-       int i, j;
-
-       for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
-               vfio_cfg = &vfio_cfgs[i];
-               for (j = 0; j < VFIO_MAX_GROUPS; j++)
-                       if (vfio_cfg->vfio_groups[j].fd == vfio_group_fd)
-                               return j;
-       }
-
-       return -1;
-}
-
-static void
-vfio_group_device_get(int vfio_group_fd)
-{
-       struct vfio_config *vfio_cfg;
-       int i;
-
-       vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
-       if (vfio_cfg == NULL) {
-               RTE_LOG(ERR, EAL, "  invalid group fd!\n");
-               return;
-       }
-
-       i = get_vfio_group_idx(vfio_group_fd);
-       if (i < 0 || i > (VFIO_MAX_GROUPS - 1))
-               RTE_LOG(ERR, EAL, "  wrong vfio_group index (%d)\n", i);
-       else
-               vfio_cfg->vfio_groups[i].devices++;
-}
-
-static void
-vfio_group_device_put(int vfio_group_fd)
-{
-       struct vfio_config *vfio_cfg;
-       int i;
-
-       vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
-       if (vfio_cfg == NULL) {
-               RTE_LOG(ERR, EAL, "  invalid group fd!\n");
-               return;
-       }
-
-       i = get_vfio_group_idx(vfio_group_fd);
-       if (i < 0 || i > (VFIO_MAX_GROUPS - 1))
-               RTE_LOG(ERR, EAL, "  wrong vfio_group index (%d)\n", i);
-       else
-               vfio_cfg->vfio_groups[i].devices--;
-}
-
-static int
-vfio_group_device_count(int vfio_group_fd)
-{
-       struct vfio_config *vfio_cfg;
-       int i;
-
-       vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
-       if (vfio_cfg == NULL) {
-               RTE_LOG(ERR, EAL, "  invalid group fd!\n");
-               return -1;
-       }
-
-       i = get_vfio_group_idx(vfio_group_fd);
-       if (i < 0 || i > (VFIO_MAX_GROUPS - 1)) {
-               RTE_LOG(ERR, EAL, "  wrong vfio_group index (%d)\n", i);
-               return -1;
-       }
-
-       return vfio_cfg->vfio_groups[i].devices;
-}
-
-static void
-vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len,
-               void *arg __rte_unused)
-{
-       struct rte_memseg_list *msl;
-       struct rte_memseg *ms;
-       size_t cur_len = 0;
-
-       msl = rte_mem_virt2memseg_list(addr);
-
-       /* for IOVA as VA mode, no need to care about IOVA addresses */
-       if (rte_eal_iova_mode() == RTE_IOVA_VA && msl->external == 0) {
-               uint64_t vfio_va = (uint64_t)(uintptr_t)addr;
-               if (type == RTE_MEM_EVENT_ALLOC)
-                       vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va,
-                                       len, 1);
-               else
-                       vfio_dma_mem_map(default_vfio_cfg, vfio_va, vfio_va,
-                                       len, 0);
-               return;
-       }
-
-       /* memsegs are contiguous in memory */
-       ms = rte_mem_virt2memseg(addr, msl);
-       while (cur_len < len) {
-               /* some memory segments may have invalid IOVA */
-               if (ms->iova == RTE_BAD_IOVA) {
-                       RTE_LOG(DEBUG, EAL, "Memory segment at %p has bad IOVA, skipping\n",
-                                       ms->addr);
-                       goto next;
-               }
-               if (type == RTE_MEM_EVENT_ALLOC)
-                       vfio_dma_mem_map(default_vfio_cfg, ms->addr_64,
-                                       ms->iova, ms->len, 1);
-               else
-                       vfio_dma_mem_map(default_vfio_cfg, ms->addr_64,
-                                       ms->iova, ms->len, 0);
-next:
-               cur_len += ms->len;
-               ++ms;
-       }
-}
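
For context, the callback above is wired into the EAL memory subsystem from rte_vfio_setup_device() below via rte_mem_event_callback_register(). A hedged sketch of that registration pattern, assuming the callback API declared in rte_memory.h; the callback name string here is arbitrary (the real code uses VFIO_MEM_EVENT_CLB_NAME):

#include <stdio.h>
#include <rte_common.h>
#include <rte_memory.h>

/* hypothetical callback: logs alloc/free events in the same places where the
 * VFIO callback maps/unmaps memory for DMA */
static void
example_mem_event_cb(enum rte_mem_event type, const void *addr, size_t len,
		void *arg __rte_unused)
{
	printf("%s: %zu bytes at %p\n",
			type == RTE_MEM_EVENT_ALLOC ? "alloc" : "free",
			len, addr);
}

static int
example_register(void)
{
	/* the name string is arbitrary for this sketch */
	return rte_mem_event_callback_register("example-clb",
			example_mem_event_cb, NULL);
}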
-
-static int
-vfio_sync_default_container(void)
-{
-       struct rte_mp_msg mp_req, *mp_rep;
-       struct rte_mp_reply mp_reply;
-       struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
-       struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
-       int iommu_type_id;
-       unsigned int i;
-
-       /* cannot be called from primary */
-       if (rte_eal_process_type() != RTE_PROC_SECONDARY)
-               return -1;
-
-       /* default container fd should have been opened in rte_vfio_enable() */
-       if (!default_vfio_cfg->vfio_enabled ||
-                       default_vfio_cfg->vfio_container_fd < 0) {
-               RTE_LOG(ERR, EAL, "VFIO support is not initialized\n");
-               return -1;
-       }
-
-       /* find default container's IOMMU type */
-       p->req = SOCKET_REQ_IOMMU_TYPE;
-       strcpy(mp_req.name, EAL_VFIO_MP);
-       mp_req.len_param = sizeof(*p);
-       mp_req.num_fds = 0;
-
-       iommu_type_id = -1;
-       if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
-                       mp_reply.nb_received == 1) {
-               mp_rep = &mp_reply.msgs[0];
-               p = (struct vfio_mp_param *)mp_rep->param;
-               if (p->result == SOCKET_OK)
-                       iommu_type_id = p->iommu_type_id;
-               free(mp_reply.msgs);
-       }
-       if (iommu_type_id < 0) {
-               RTE_LOG(ERR, EAL, "Could not get IOMMU type for default container\n");
-               return -1;
-       }
-
-       /* we now have an fd for default container, as well as its IOMMU type.
-        * now, set up default VFIO container config to match.
-        */
-       for (i = 0; i < RTE_DIM(iommu_types); i++) {
-               const struct vfio_iommu_type *t = &iommu_types[i];
-               if (t->type_id != iommu_type_id)
-                       continue;
-
-               /* we found our IOMMU type */
-               default_vfio_cfg->vfio_iommu_type = t;
-
-               return 0;
-       }
-       RTE_LOG(ERR, EAL, "Could not find IOMMU type id (%i)\n",
-                       iommu_type_id);
-       return -1;
-}
-
-int
-rte_vfio_clear_group(int vfio_group_fd)
-{
-       int i;
-       struct vfio_config *vfio_cfg;
-
-       vfio_cfg = get_vfio_cfg_by_group_fd(vfio_group_fd);
-       if (vfio_cfg == NULL) {
-               RTE_LOG(ERR, EAL, "  invalid group fd!\n");
-               return -1;
-       }
-
-       i = get_vfio_group_idx(vfio_group_fd);
-       if (i < 0)
-               return -1;
-       vfio_cfg->vfio_groups[i].group_num = -1;
-       vfio_cfg->vfio_groups[i].fd = -1;
-       vfio_cfg->vfio_groups[i].devices = 0;
-       vfio_cfg->vfio_active_groups--;
-
-       return 0;
-}
-
-int
-rte_vfio_setup_device(const char *sysfs_base, const char *dev_addr,
-               int *vfio_dev_fd, struct vfio_device_info *device_info)
-{
-       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
-       rte_rwlock_t *mem_lock = &mcfg->memory_hotplug_lock;
-       struct vfio_group_status group_status = {
-                       .argsz = sizeof(group_status)
-       };
-       struct vfio_config *vfio_cfg;
-       struct user_mem_maps *user_mem_maps;
-       int vfio_container_fd;
-       int vfio_group_fd;
-       int iommu_group_num;
-       int i, ret;
-
-       /* get group number */
-       ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num);
-       if (ret == 0) {
-               RTE_LOG(WARNING, EAL, "  %s not managed by VFIO driver, skipping\n",
-                       dev_addr);
-               return 1;
-       }
-
-       /* if negative, something failed */
-       if (ret < 0)
-               return -1;
-
-       /* get the actual group fd */
-       vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num);
-       if (vfio_group_fd < 0)
-               return -1;
-
-       /* if group_fd == 0, that means the device isn't managed by VFIO */
-       if (vfio_group_fd == 0) {
-               RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n",
-                               dev_addr);
-               return 1;
-       }
-
-       /*
-        * check whether the group is viable: a group is viable only if all
-        * devices in it are either bound to VFIO or not bound to anything
-        */
-       ret = ioctl(vfio_group_fd, VFIO_GROUP_GET_STATUS, &group_status);
-       if (ret) {
-               RTE_LOG(ERR, EAL, "  %s cannot get group status, "
-                               "error %i (%s)\n", dev_addr, errno, strerror(errno));
-               close(vfio_group_fd);
-               rte_vfio_clear_group(vfio_group_fd);
-               return -1;
-       } else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
-               RTE_LOG(ERR, EAL, "  %s VFIO group is not viable!\n", dev_addr);
-               close(vfio_group_fd);
-               rte_vfio_clear_group(vfio_group_fd);
-               return -1;
-       }
-
-       /* get the vfio_config it belongs to */
-       vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
-       vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
-       vfio_container_fd = vfio_cfg->vfio_container_fd;
-       user_mem_maps = &vfio_cfg->mem_maps;
-
-       /* check if group does not have a container yet */
-       if (!(group_status.flags & VFIO_GROUP_FLAGS_CONTAINER_SET)) {
-
-               /* add group to a container */
-               ret = ioctl(vfio_group_fd, VFIO_GROUP_SET_CONTAINER,
-                               &vfio_container_fd);
-               if (ret) {
-                       RTE_LOG(ERR, EAL, "  %s cannot add VFIO group to container, "
-                                       "error %i (%s)\n", dev_addr, errno, strerror(errno));
-                       close(vfio_group_fd);
-                       rte_vfio_clear_group(vfio_group_fd);
-                       return -1;
-               }
-
-               /*
-                * pick an IOMMU type and set up DMA mappings for container
-                *
-                * needs to be done only once, only when first group is
-                * assigned to a container and only in primary process.
-                * Note this can happen several times with the hotplug
-                * functionality.
-                */
-               if (internal_config.process_type == RTE_PROC_PRIMARY &&
-                               vfio_cfg->vfio_active_groups == 1 &&
-                               vfio_group_device_count(vfio_group_fd) == 0) {
-                       const struct vfio_iommu_type *t;
-
-                       /* select an IOMMU type which we will be using */
-                       t = vfio_set_iommu_type(vfio_container_fd);
-                       if (!t) {
-                               RTE_LOG(ERR, EAL,
-                                       "  %s failed to select IOMMU type\n",
-                                       dev_addr);
-                               close(vfio_group_fd);
-                               rte_vfio_clear_group(vfio_group_fd);
-                               return -1;
-                       }
-                       /* lock memory hotplug before mapping and release it
-                        * after registering callback, to prevent races
-                        */
-                       rte_rwlock_read_lock(mem_lock);
-                       if (vfio_cfg == default_vfio_cfg)
-                               ret = t->dma_map_func(vfio_container_fd);
-                       else
-                               ret = 0;
-                       if (ret) {
-                               RTE_LOG(ERR, EAL,
-                                       "  %s DMA remapping failed, error %i (%s)\n",
-                                       dev_addr, errno, strerror(errno));
-                               close(vfio_group_fd);
-                               rte_vfio_clear_group(vfio_group_fd);
-                               rte_rwlock_read_unlock(mem_lock);
-                               return -1;
-                       }
-
-                       vfio_cfg->vfio_iommu_type = t;
-
-                       /* re-map all user-mapped segments */
-                       rte_spinlock_recursive_lock(&user_mem_maps->lock);
-
-                       /* this IOMMU type may not support DMA mapping, but
-                        * if we have mappings in the list - that means we have
-                        * previously mapped something successfully, so we can
-                        * be sure that DMA mapping is supported.
-                        */
-                       for (i = 0; i < user_mem_maps->n_maps; i++) {
-                               struct user_mem_map *map;
-                               map = &user_mem_maps->maps[i];
-
-                               ret = t->dma_user_map_func(
-                                               vfio_container_fd,
-                                               map->addr, map->iova, map->len,
-                                               1);
-                               if (ret) {
-                                       RTE_LOG(ERR, EAL, "Couldn't map user memory for DMA: "
-                                                       "va: 0x%" PRIx64 " "
-                                                       "iova: 0x%" PRIx64 " "
-                                                       "len: 0x%" PRIx64 "\n",
-                                                       map->addr, map->iova,
-                                                       map->len);
-                                       rte_spinlock_recursive_unlock(
-                                                       &user_mem_maps->lock);
-                                       rte_rwlock_read_unlock(mem_lock);
-                                       return -1;
-                               }
-                       }
-                       rte_spinlock_recursive_unlock(&user_mem_maps->lock);
-
-                       /* register callback for mem events */
-                       if (vfio_cfg == default_vfio_cfg)
-                               ret = rte_mem_event_callback_register(
-                                       VFIO_MEM_EVENT_CLB_NAME,
-                                       vfio_mem_event_callback, NULL);
-                       else
-                               ret = 0;
-                       /* unlock memory hotplug */
-                       rte_rwlock_read_unlock(mem_lock);
-
-                       if (ret && rte_errno != ENOTSUP) {
-                               RTE_LOG(ERR, EAL, "Could not install memory event callback for VFIO\n");
-                               return -1;
-                       }
-                       if (ret)
-                               RTE_LOG(DEBUG, EAL, "Memory event callbacks not supported\n");
-                       else
-                               RTE_LOG(DEBUG, EAL, "Installed memory event callback for VFIO\n");
-               }
-       } else if (rte_eal_process_type() != RTE_PROC_PRIMARY &&
-                       vfio_cfg == default_vfio_cfg &&
-                       vfio_cfg->vfio_iommu_type == NULL) {
-               /* if we're not a primary process, we do not set up the VFIO
-                * container because it's already been set up by the primary
-                * process. instead, we simply ask the primary about VFIO type
-                * we are using, and set the VFIO config up appropriately.
-                */
-               ret = vfio_sync_default_container();
-               if (ret < 0) {
-                       RTE_LOG(ERR, EAL, "Could not sync default VFIO container\n");
-                       close(vfio_group_fd);
-                       rte_vfio_clear_group(vfio_group_fd);
-                       return -1;
-               }
-               /* we have successfully initialized VFIO, notify user */
-               const struct vfio_iommu_type *t =
-                               default_vfio_cfg->vfio_iommu_type;
-               RTE_LOG(NOTICE, EAL, "  using IOMMU type %d (%s)\n",
-                               t->type_id, t->name);
-       }
-
-       /* get a file descriptor for the device */
-       *vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr);
-       if (*vfio_dev_fd < 0) {
-               /* if we cannot get a device fd, this implies a problem with the
-                * VFIO group or with the container not having an IOMMU
-                * configured.
-                */
-
-               RTE_LOG(WARNING, EAL, "Getting a vfio_dev_fd for %s failed\n",
-                               dev_addr);
-               close(vfio_group_fd);
-               rte_vfio_clear_group(vfio_group_fd);
-               return -1;
-       }
-
-       /* test and setup the device */
-       ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info);
-       if (ret) {
-               RTE_LOG(ERR, EAL, "  %s cannot get device info, "
-                               "error %i (%s)\n", dev_addr, errno,
-                               strerror(errno));
-               close(*vfio_dev_fd);
-               close(vfio_group_fd);
-               rte_vfio_clear_group(vfio_group_fd);
-               return -1;
-       }
-       vfio_group_device_get(vfio_group_fd);
-
-       return 0;
-}
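
For orientation, a hedged sketch of how a bus driver consumes this function; the sysfs base and PCI address below are illustrative placeholders, and the returned device fd is then used with the standard VFIO_DEVICE_* ioctls:

#include <rte_vfio.h>
#include <linux/vfio.h>

/* sketch: attach one PCI device; pci_addr such as "0000:03:00.0" is hypothetical */
static int
example_attach(const char *pci_addr)
{
	struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
	int dev_fd;
	int ret;

	ret = rte_vfio_setup_device("/sys/bus/pci/devices", pci_addr,
			&dev_fd, &dev_info);
	if (ret < 0)
		return -1;	/* hard failure */
	if (ret > 0)
		return 0;	/* device not bound to a VFIO driver, skip it */

	/* dev_fd is now valid for VFIO_DEVICE_GET_REGION_INFO etc. */
	return dev_fd;
}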
-
-int
-rte_vfio_release_device(const char *sysfs_base, const char *dev_addr,
-                   int vfio_dev_fd)
-{
-       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
-       rte_rwlock_t *mem_lock = &mcfg->memory_hotplug_lock;
-       struct vfio_group_status group_status = {
-                       .argsz = sizeof(group_status)
-       };
-       struct vfio_config *vfio_cfg;
-       int vfio_group_fd;
-       int iommu_group_num;
-       int ret;
-
-       /* we don't want any DMA mapping messages to come while we're detaching
-        * VFIO device, because this might be the last device and we might need
-        * to unregister the callback.
-        */
-       rte_rwlock_read_lock(mem_lock);
-
-       /* get group number */
-       ret = rte_vfio_get_group_num(sysfs_base, dev_addr, &iommu_group_num);
-       if (ret <= 0) {
-               RTE_LOG(WARNING, EAL, "  %s not managed by VFIO driver\n",
-                       dev_addr);
-               /* This is an error at this point. */
-               ret = -1;
-               goto out;
-       }
-
-       /* get the actual group fd */
-       vfio_group_fd = rte_vfio_get_group_fd(iommu_group_num);
-       if (vfio_group_fd <= 0) {
-               RTE_LOG(INFO, EAL, "rte_vfio_get_group_fd failed for %s\n",
-                                  dev_addr);
-               ret = -1;
-               goto out;
-       }
-
-       /* get the vfio_config it belongs to */
-       vfio_cfg = get_vfio_cfg_by_group_num(iommu_group_num);
-       vfio_cfg = vfio_cfg ? vfio_cfg : default_vfio_cfg;
-
-       /* At this point we have an active group. Closing it will trigger
-        * detachment from the container. If this is the last active group, the
-        * VFIO kernel code will unset the container and tear down the IOMMU
-        * mappings.
-        */
-
-       /* Closing a device */
-       if (close(vfio_dev_fd) < 0) {
-               RTE_LOG(INFO, EAL, "Error when closing vfio_dev_fd for %s\n",
-                                  dev_addr);
-               ret = -1;
-               goto out;
-       }
-
-       /* A VFIO group can have several devices attached. Only when no devices
-        * remain should the group be closed.
-        */
-       vfio_group_device_put(vfio_group_fd);
-       if (!vfio_group_device_count(vfio_group_fd)) {
-
-               if (close(vfio_group_fd) < 0) {
-                       RTE_LOG(INFO, EAL, "Error when closing vfio_group_fd for %s\n",
-                               dev_addr);
-                       ret = -1;
-                       goto out;
-               }
-
-               if (rte_vfio_clear_group(vfio_group_fd) < 0) {
-                       RTE_LOG(INFO, EAL, "Error when clearing group for %s\n",
-                                          dev_addr);
-                       ret = -1;
-                       goto out;
-               }
-       }
-
-       /* if there are no active device groups, unregister the callback to
-        * avoid spurious attempts to map/unmap memory from VFIO.
-        */
-       if (vfio_cfg == default_vfio_cfg && vfio_cfg->vfio_active_groups == 0 &&
-                       rte_eal_process_type() != RTE_PROC_SECONDARY)
-               rte_mem_event_callback_unregister(VFIO_MEM_EVENT_CLB_NAME,
-                               NULL);
-
-       /* success */
-       ret = 0;
-
-out:
-       rte_rwlock_read_unlock(mem_lock);
-       return ret;
-}
-
-int
-rte_vfio_enable(const char *modname)
-{
-       /* initialize group list */
-       int i, j;
-       int vfio_available;
-
-       rte_spinlock_recursive_t lock = RTE_SPINLOCK_RECURSIVE_INITIALIZER;
-
-       for (i = 0; i < VFIO_MAX_CONTAINERS; i++) {
-               vfio_cfgs[i].vfio_container_fd = -1;
-               vfio_cfgs[i].vfio_active_groups = 0;
-               vfio_cfgs[i].vfio_iommu_type = NULL;
-               vfio_cfgs[i].mem_maps.lock = lock;
-
-               for (j = 0; j < VFIO_MAX_GROUPS; j++) {
-                       vfio_cfgs[i].vfio_groups[j].fd = -1;
-                       vfio_cfgs[i].vfio_groups[j].group_num = -1;
-                       vfio_cfgs[i].vfio_groups[j].devices = 0;
-               }
-       }
-
-       /* inform the user that we are probing for VFIO */
-       RTE_LOG(INFO, EAL, "Probing VFIO support...\n");
-
-       /* check if vfio module is loaded */
-       vfio_available = rte_eal_check_module(modname);
-
-       /* return error directly */
-       if (vfio_available == -1) {
-               RTE_LOG(INFO, EAL, "Could not get loaded module details!\n");
-               return -1;
-       }
-
-       /* return 0 if VFIO modules not loaded */
-       if (vfio_available == 0) {
-               RTE_LOG(DEBUG, EAL, "VFIO modules not loaded, "
-                       "skipping VFIO support...\n");
-               return 0;
-       }
-
-       if (internal_config.process_type == RTE_PROC_PRIMARY) {
-               /* open a new container */
-               default_vfio_cfg->vfio_container_fd =
-                               rte_vfio_get_container_fd();
-       } else {
-               /* get the default container from the primary process */
-               default_vfio_cfg->vfio_container_fd =
-                               vfio_get_default_container_fd();
-       }
-
-       /* check if we have VFIO driver enabled */
-       if (default_vfio_cfg->vfio_container_fd != -1) {
-               RTE_LOG(NOTICE, EAL, "VFIO support initialized\n");
-               default_vfio_cfg->vfio_enabled = 1;
-       } else {
-               RTE_LOG(NOTICE, EAL, "VFIO support could not be initialized\n");
-       }
-
-       return 0;
-}
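
A hedged sketch of the usual probe sequence; in practice this is driven by the bus code during EAL initialization, and the module name passed here is an assumption:

#include <rte_vfio.h>

/* sketch: probe for VFIO once, early in initialization */
static void
example_probe_vfio(void)
{
	if (rte_vfio_enable("vfio") < 0)
		return;		/* could not query loaded kernel modules */

	if (rte_vfio_is_enabled("vfio")) {
		/* the default container is open; devices bound to a VFIO
		 * driver can now be set up via rte_vfio_setup_device() */
	}
}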
-
-int
-rte_vfio_is_enabled(const char *modname)
-{
-       const int mod_available = rte_eal_check_module(modname) > 0;
-       return default_vfio_cfg->vfio_enabled && mod_available;
-}
-
-int
-vfio_get_default_container_fd(void)
-{
-       struct rte_mp_msg mp_req, *mp_rep;
-       struct rte_mp_reply mp_reply;
-       struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
-       struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
-
-       if (default_vfio_cfg->vfio_enabled)
-               return default_vfio_cfg->vfio_container_fd;
-
-       if (internal_config.process_type == RTE_PROC_PRIMARY) {
-               /* if we were a secondary process we would request the
-                * container fd from the primary, but we are the primary
-                * process, so just exit here
-                */
-               return -1;
-       }
-
-       p->req = SOCKET_REQ_DEFAULT_CONTAINER;
-       strcpy(mp_req.name, EAL_VFIO_MP);
-       mp_req.len_param = sizeof(*p);
-       mp_req.num_fds = 0;
-
-       if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
-           mp_reply.nb_received == 1) {
-               mp_rep = &mp_reply.msgs[0];
-               p = (struct vfio_mp_param *)mp_rep->param;
-               if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
-                       free(mp_reply.msgs);
-                       return mp_rep->fds[0];
-               }
-               free(mp_reply.msgs);
-       }
-
-       RTE_LOG(ERR, EAL, "  cannot request default container fd\n");
-       return -1;
-}
-
-int
-vfio_get_iommu_type(void)
-{
-       if (default_vfio_cfg->vfio_iommu_type == NULL)
-               return -1;
-
-       return default_vfio_cfg->vfio_iommu_type->type_id;
-}
-
-const struct vfio_iommu_type *
-vfio_set_iommu_type(int vfio_container_fd)
-{
-       unsigned idx;
-       for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
-               const struct vfio_iommu_type *t = &iommu_types[idx];
-
-               int ret = ioctl(vfio_container_fd, VFIO_SET_IOMMU,
-                               t->type_id);
-               if (!ret) {
-                       RTE_LOG(NOTICE, EAL, "  using IOMMU type %d (%s)\n",
-                                       t->type_id, t->name);
-                       return t;
-               }
-               /* not an error, there may be more supported IOMMU types */
-               RTE_LOG(DEBUG, EAL, "  set IOMMU type %d (%s) failed, "
-                               "error %i (%s)\n", t->type_id, t->name, errno,
-                               strerror(errno));
-       }
-       /* if we didn't find a suitable IOMMU type, fail */
-       return NULL;
-}
-
-int
-vfio_has_supported_extensions(int vfio_container_fd)
-{
-       int ret;
-       unsigned idx, n_extensions = 0;
-       for (idx = 0; idx < RTE_DIM(iommu_types); idx++) {
-               const struct vfio_iommu_type *t = &iommu_types[idx];
-
-               ret = ioctl(vfio_container_fd, VFIO_CHECK_EXTENSION,
-                               t->type_id);
-               if (ret < 0) {
-                       RTE_LOG(ERR, EAL, "  could not get IOMMU type, "
-                               "error %i (%s)\n", errno,
-                               strerror(errno));
-                       close(vfio_container_fd);
-                       return -1;
-               } else if (ret == 1) {
-                       /* we found a supported extension */
-                       n_extensions++;
-               }
-               RTE_LOG(DEBUG, EAL, "  IOMMU type %d (%s) is %s\n",
-                               t->type_id, t->name,
-                               ret ? "supported" : "not supported");
-       }
-
-       /* if we didn't find any supported IOMMU types, fail */
-       if (!n_extensions) {
-               close(vfio_container_fd);
-               return -1;
-       }
-
-       return 0;
-}
-
-int
-rte_vfio_get_container_fd(void)
-{
-       int ret, vfio_container_fd;
-       struct rte_mp_msg mp_req, *mp_rep;
-       struct rte_mp_reply mp_reply;
-       struct timespec ts = {.tv_sec = 5, .tv_nsec = 0};
-       struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param;
-
-
-       /* if we're in a primary process, try to open the container */
-       if (internal_config.process_type == RTE_PROC_PRIMARY) {
-               vfio_container_fd = open(VFIO_CONTAINER_PATH, O_RDWR);
-               if (vfio_container_fd < 0) {
-                       RTE_LOG(ERR, EAL, "  cannot open VFIO container, "
-                                       "error %i (%s)\n", errno, strerror(errno));
-                       return -1;
-               }
-
-               /* check VFIO API version */
-               ret = ioctl(vfio_container_fd, VFIO_GET_API_VERSION);
-               if (ret != VFIO_API_VERSION) {
-                       if (ret < 0)
-                               RTE_LOG(ERR, EAL, "  could not get VFIO API version, "
-                                               "error %i (%s)\n", errno, strerror(errno));
-                       else
-                               RTE_LOG(ERR, EAL, "  unsupported VFIO API version!\n");
-                       close(vfio_container_fd);
-                       return -1;
-               }
-
-               ret = vfio_has_supported_extensions(vfio_container_fd);
-               if (ret) {
-                       RTE_LOG(ERR, EAL, "  no supported IOMMU "
-                                       "extensions found!\n");
-                       return -1;
-               }
-
-               return vfio_container_fd;
-       }
-       /*
-        * if we're in a secondary process, request container fd from the
-        * primary process via mp channel
-        */
-       p->req = SOCKET_REQ_CONTAINER;
-       strcpy(mp_req.name, EAL_VFIO_MP);
-       mp_req.len_param = sizeof(*p);
-       mp_req.num_fds = 0;
-
-       vfio_container_fd = -1;
-       if (rte_mp_request_sync(&mp_req, &mp_reply, &ts) == 0 &&
-           mp_reply.nb_received == 1) {
-               mp_rep = &mp_reply.msgs[0];
-               p = (struct vfio_mp_param *)mp_rep->param;
-               if (p->result == SOCKET_OK && mp_rep->num_fds == 1) {
-                       vfio_container_fd = mp_rep->fds[0];
-                       free(mp_reply.msgs);
-                       return vfio_container_fd;
-               }
-               free(mp_reply.msgs);
-       }
-
-       RTE_LOG(ERR, EAL, "  cannot request container fd\n");
-       return -1;
-}
-
-int
-rte_vfio_get_group_num(const char *sysfs_base,
-               const char *dev_addr, int *iommu_group_num)
-{
-       char linkname[PATH_MAX];
-       char filename[PATH_MAX];
-       char *tok[16], *group_tok, *end;
-       int ret;
-
-       memset(linkname, 0, sizeof(linkname));
-       memset(filename, 0, sizeof(filename));
-
-       /* try to find out IOMMU group for this device */
-       snprintf(linkname, sizeof(linkname),
-                        "%s/%s/iommu_group", sysfs_base, dev_addr);
-
-       ret = readlink(linkname, filename, sizeof(filename));
-
-       /* if the link doesn't exist, no VFIO for us */
-       if (ret < 0)
-               return 0;
-
-       ret = rte_strsplit(filename, sizeof(filename),
-                       tok, RTE_DIM(tok), '/');
-
-       if (ret <= 0) {
-               RTE_LOG(ERR, EAL, "  %s cannot get IOMMU group\n", dev_addr);
-               return -1;
-       }
-
-       /* IOMMU group is always the last token */
-       errno = 0;
-       group_tok = tok[ret - 1];
-       end = group_tok;
-       *iommu_group_num = strtol(group_tok, &end, 10);
-       if ((end != group_tok && *end != '\0') || errno != 0) {
-               RTE_LOG(ERR, EAL, "  %s error parsing IOMMU number!\n", dev_addr);
-               return -1;
-       }
-
-       return 1;
-}
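
Return values here: 1 with *iommu_group_num filled in, 0 when the iommu_group sysfs link is absent (device not usable with VFIO), -1 on parse errors. A small hedged usage sketch with an illustrative sysfs base and PCI address:

#include <rte_vfio.h>

static int
example_group_of(const char *pci_addr)	/* e.g. "0000:03:00.0", illustrative */
{
	int group_num;
	int ret = rte_vfio_get_group_num("/sys/bus/pci/devices", pci_addr,
			&group_num);

	if (ret < 0)
		return -1;	/* sysfs parsing failed */
	if (ret == 0)
		return -1;	/* no IOMMU group: not usable with VFIO */
	return group_num;
}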
-
-static int
-type1_map(const struct rte_memseg_list *msl, const struct rte_memseg *ms,
-               void *arg)
-{
-       int *vfio_container_fd = arg;
-
-       if (msl->external)
-               return 0;
-
-       return vfio_type1_dma_mem_map(*vfio_container_fd, ms->addr_64, ms->iova,
-                       ms->len, 1);
-}
-
-static int
-vfio_type1_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
-               uint64_t len, int do_map)
-{
-       struct vfio_iommu_type1_dma_map dma_map;
-       struct vfio_iommu_type1_dma_unmap dma_unmap;
-       int ret;
-
-       if (do_map != 0) {
-               memset(&dma_map, 0, sizeof(dma_map));
-               dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
-               dma_map.vaddr = vaddr;
-               dma_map.size = len;
-               dma_map.iova = iova;
-               dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
-                               VFIO_DMA_MAP_FLAG_WRITE;
-
-               ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
-               if (ret) {
-                       RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, error %i (%s)\n",
-                               errno, strerror(errno));
-                               return -1;
-               }
-       } else {
-               memset(&dma_unmap, 0, sizeof(dma_unmap));
-               dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
-               dma_unmap.size = len;
-               dma_unmap.iova = iova;
-
-               ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
-                               &dma_unmap);
-               if (ret) {
-                       RTE_LOG(ERR, EAL, "  cannot clear DMA remapping, error %i (%s)\n",
-                                       errno, strerror(errno));
-                       return -1;
-               }
-       }
-
-       return 0;
-}
-
-static int
-vfio_type1_dma_map(int vfio_container_fd)
-{
-       return rte_memseg_walk(type1_map, &vfio_container_fd);
-}
-
-static int
-vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
-               uint64_t len, int do_map)
-{
-       struct vfio_iommu_type1_dma_map dma_map;
-       struct vfio_iommu_type1_dma_unmap dma_unmap;
-       int ret;
-       struct vfio_iommu_spapr_register_memory reg = {
-               .argsz = sizeof(reg),
-               .flags = 0
-       };
-       reg.vaddr = (uintptr_t) vaddr;
-       reg.size = len;
-
-       if (do_map != 0) {
-               ret = ioctl(vfio_container_fd,
-                               VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
-               if (ret) {
-                       RTE_LOG(ERR, EAL, "  cannot register vaddr for IOMMU, "
-                               "error %i (%s)\n", errno, strerror(errno));
-                       return -1;
-               }
-
-               memset(&dma_map, 0, sizeof(dma_map));
-               dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
-               dma_map.vaddr = vaddr;
-               dma_map.size = len;
-               dma_map.iova = iova;
-               dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
-                               VFIO_DMA_MAP_FLAG_WRITE;
-
-               ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
-               if (ret) {
-                       RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, error %i (%s)\n",
-                               errno, strerror(errno));
-                               return -1;
-               }
-
-       } else {
-               ret = ioctl(vfio_container_fd,
-                               VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
-               if (ret) {
-                       RTE_LOG(ERR, EAL, "  cannot unregister vaddr for IOMMU, error %i (%s)\n",
-                                       errno, strerror(errno));
-                       return -1;
-               }
-
-               memset(&dma_unmap, 0, sizeof(dma_unmap));
-               dma_unmap.argsz = sizeof(struct vfio_iommu_type1_dma_unmap);
-               dma_unmap.size = len;
-               dma_unmap.iova = iova;
-
-               ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
-                               &dma_unmap);
-               if (ret) {
-                       RTE_LOG(ERR, EAL, "  cannot clear DMA remapping, error %i (%s)\n",
-                                       errno, strerror(errno));
-                       return -1;
-               }
-       }
-
-       return 0;
-}
-
-static int
-vfio_spapr_map_walk(const struct rte_memseg_list *msl,
-               const struct rte_memseg *ms, void *arg)
-{
-       int *vfio_container_fd = arg;
-
-       if (msl->external)
-               return 0;
-
-       return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova,
-                       ms->len, 1);
-}
-
-struct spapr_walk_param {
-       uint64_t window_size;
-       uint64_t hugepage_sz;
-};
-static int
-vfio_spapr_window_size_walk(const struct rte_memseg_list *msl,
-               const struct rte_memseg *ms, void *arg)
-{
-       struct spapr_walk_param *param = arg;
-       uint64_t max = ms->iova + ms->len;
-
-       if (msl->external)
-               return 0;
-
-       if (max > param->window_size) {
-               param->hugepage_sz = ms->hugepage_sz;
-               param->window_size = max;
-       }
-
-       return 0;
-}
-
-static int
-vfio_spapr_create_new_dma_window(int vfio_container_fd,
-               struct vfio_iommu_spapr_tce_create *create) {
-       struct vfio_iommu_spapr_tce_remove remove = {
-               .argsz = sizeof(remove),
-       };
-       struct vfio_iommu_spapr_tce_info info = {
-               .argsz = sizeof(info),
-       };
-       int ret;
-
-       /* query spapr iommu info */
-       ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
-       if (ret) {
-               RTE_LOG(ERR, EAL, "  cannot get iommu info, "
-                               "error %i (%s)\n", errno, strerror(errno));
-               return -1;
-       }
-
-       /* remove default DMA of 32 bit window */
-       remove.start_addr = info.dma32_window_start;
-       ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
-       if (ret) {
-               RTE_LOG(ERR, EAL, "  cannot remove default DMA window, "
-                               "error %i (%s)\n", errno, strerror(errno));
-               return -1;
-       }
-
-       /* create new DMA window */
-       ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, create);
-       if (ret) {
-               RTE_LOG(ERR, EAL, "  cannot create new DMA window, "
-                               "error %i (%s)\n", errno, strerror(errno));
-               return -1;
-       }
-
-       if (create->start_addr != 0) {
-               RTE_LOG(ERR, EAL, "  DMA window start address != 0\n");
-               return -1;
-       }
-
-       return 0;
-}
-
-static int
-vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
-               uint64_t len, int do_map)
-{
-       struct spapr_walk_param param;
-       struct vfio_iommu_spapr_tce_create create = {
-               .argsz = sizeof(create),
-       };
-       struct vfio_config *vfio_cfg;
-       struct user_mem_maps *user_mem_maps;
-       int i, ret = 0;
-
-       vfio_cfg = get_vfio_cfg_by_container_fd(vfio_container_fd);
-       if (vfio_cfg == NULL) {
-               RTE_LOG(ERR, EAL, "  invalid container fd!\n");
-               return -1;
-       }
-
-       user_mem_maps = &vfio_cfg->mem_maps;
-       rte_spinlock_recursive_lock(&user_mem_maps->lock);
-
-       /* check if window size needs to be adjusted */
-       memset(&param, 0, sizeof(param));
-
-       /* we're inside a callback so use thread-unsafe version */
-       if (rte_memseg_walk_thread_unsafe(vfio_spapr_window_size_walk,
-                               &param) < 0) {
-               RTE_LOG(ERR, EAL, "Could not get window size\n");
-               ret = -1;
-               goto out;
-       }
-
-       /* also check user maps */
-       for (i = 0; i < user_mem_maps->n_maps; i++) {
-               uint64_t max = user_mem_maps->maps[i].iova +
-                               user_mem_maps->maps[i].len;
-               create.window_size = RTE_MAX(create.window_size, max);
-       }
-
-       /* sPAPR requires window size to be a power of 2 */
-       create.window_size = rte_align64pow2(param.window_size);
-       create.page_shift = __builtin_ctzll(param.hugepage_sz);
-       create.levels = 1;
-
-       if (do_map) {
-               void *addr;
-               /* re-create window and remap the entire memory */
-               if (iova > create.window_size) {
-                       if (vfio_spapr_create_new_dma_window(vfio_container_fd,
-                                       &create) < 0) {
-                               RTE_LOG(ERR, EAL, "Could not create new DMA window\n");
-                               ret = -1;
-                               goto out;
-                       }
-                       /* we're inside a callback, so use thread-unsafe version
-                        */
-                       if (rte_memseg_walk_thread_unsafe(vfio_spapr_map_walk,
-                                       &vfio_container_fd) < 0) {
-                               RTE_LOG(ERR, EAL, "Could not recreate DMA maps\n");
-                               ret = -1;
-                               goto out;
-                       }
-                       /* remap all user maps */
-                       for (i = 0; i < user_mem_maps->n_maps; i++) {
-                               struct user_mem_map *map =
-                                               &user_mem_maps->maps[i];
-                               if (vfio_spapr_dma_do_map(vfio_container_fd,
-                                               map->addr, map->iova, map->len,
-                                               1)) {
-                                       RTE_LOG(ERR, EAL, "Could not recreate user DMA maps\n");
-                                       ret = -1;
-                                       goto out;
-                               }
-                       }
-               }
-
-               /* now that we've remapped all of the memory that was present
-                * before, map the segment that we were requested to map.
-                *
-                * however, if we were called by the callback, the memory we
-                * were called with was already in the memseg list, so previous
-                * mapping should've mapped that segment already.
-                *
-                * virt2memseg_list is a relatively cheap check, so use that. if
-                * memory is within any memseg list, it's a memseg, so it's
-                * already mapped.
-                */
-               addr = (void *)(uintptr_t)vaddr;
-               if (rte_mem_virt2memseg_list(addr) == NULL &&
-                               vfio_spapr_dma_do_map(vfio_container_fd,
-                                       vaddr, iova, len, 1) < 0) {
-                       RTE_LOG(ERR, EAL, "Could not map segment\n");
-                       ret = -1;
-                       goto out;
-               }
-       } else {
-               /* for unmap, check if iova within DMA window */
-               if (iova > create.window_size) {
-                       RTE_LOG(ERR, EAL, "iova beyond DMA window for unmap\n");
-                       ret = -1;
-                       goto out;
-               }
-
-               vfio_spapr_dma_do_map(vfio_container_fd, vaddr, iova, len, 0);
-       }
-out:
-       rte_spinlock_recursive_unlock(&user_mem_maps->lock);
-       return ret;
-}
-
-static int
-vfio_spapr_dma_map(int vfio_container_fd)
-{
-       struct vfio_iommu_spapr_tce_create create = {
-               .argsz = sizeof(create),
-       };
-       struct spapr_walk_param param;
-
-       memset(&param, 0, sizeof(param));
-
-       /* create DMA window from 0 to max(phys_addr + len) */
-       rte_memseg_walk(vfio_spapr_window_size_walk, &param);
-
-       /* sPAPR requires window size to be a power of 2 */
-       create.window_size = rte_align64pow2(param.window_size);
-       create.page_shift = __builtin_ctzll(param.hugepage_sz);
-       create.levels = 1;
-
-       if (vfio_spapr_create_new_dma_window(vfio_container_fd, &create) < 0) {
-               RTE_LOG(ERR, EAL, "Could not create new DMA window\n");
-               return -1;
-       }
-
-       /* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
-       if (rte_memseg_walk(vfio_spapr_map_walk, &vfio_container_fd) < 0)
-               return -1;
-
-       return 0;
-}
-
-static int
-vfio_noiommu_dma_map(int __rte_unused vfio_container_fd)
-{
-       /* No-IOMMU mode does not need DMA mapping */
-       return 0;
-}
-
-static int
-vfio_noiommu_dma_mem_map(int __rte_unused vfio_container_fd,
-                        uint64_t __rte_unused vaddr,
-                        uint64_t __rte_unused iova, uint64_t __rte_unused len,
-                        int __rte_unused do_map)
-{
-       /* No-IOMMU mode does not need DMA mapping */
-       return 0;
-}
-
-static int
-vfio_dma_mem_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
-               uint64_t len, int do_map)
-{
-       const struct vfio_iommu_type *t = vfio_cfg->vfio_iommu_type;
-
-       if (!t) {
-               RTE_LOG(ERR, EAL, "  VFIO support not initialized\n");
-               rte_errno = ENODEV;
-               return -1;
-       }
-
-       if (!t->dma_user_map_func) {
-               RTE_LOG(ERR, EAL,
-                       "  VFIO custom DMA region mapping not supported by IOMMU %s\n",
-                       t->name);
-               rte_errno = ENOTSUP;
-               return -1;
-       }
-
-       return t->dma_user_map_func(vfio_cfg->vfio_container_fd, vaddr, iova,
-                       len, do_map);
-}
-
-static int
-container_dma_map(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
-               uint64_t len)
-{
-       struct user_mem_map *new_map;
-       struct user_mem_maps *user_mem_maps;
-       int ret = 0;
-
-       user_mem_maps = &vfio_cfg->mem_maps;
-       rte_spinlock_recursive_lock(&user_mem_maps->lock);
-       if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
-               RTE_LOG(ERR, EAL, "No more space for user mem maps\n");
-               rte_errno = ENOMEM;
-               ret = -1;
-               goto out;
-       }
-       /* map the entry */
-       if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 1)) {
-               /* technically, this will fail only because there are currently
-                * no devices plugged in; had a device been attached, the mapping
-                * might have succeeded. However, since we cannot verify that the
-                * mapping is valid without a device attached, treat it as
-                * unsupported rather than storing an unverified mapping in the
-                * list of active mappings.
-                */
-               RTE_LOG(ERR, EAL, "Couldn't map new region for DMA\n");
-               ret = -1;
-               goto out;
-       }
-       /* create new user mem map entry */
-       new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
-       new_map->addr = vaddr;
-       new_map->iova = iova;
-       new_map->len = len;
-
-       compact_user_maps(user_mem_maps);
-out:
-       rte_spinlock_recursive_unlock(&user_mem_maps->lock);
-       return ret;
-}
-
-static int
-container_dma_unmap(struct vfio_config *vfio_cfg, uint64_t vaddr, uint64_t iova,
-               uint64_t len)
-{
-       struct user_mem_map *map, *new_map = NULL;
-       struct user_mem_maps *user_mem_maps;
-       int ret = 0;
-
-       user_mem_maps = &vfio_cfg->mem_maps;
-       rte_spinlock_recursive_lock(&user_mem_maps->lock);
-
-       /* find our mapping */
-       map = find_user_mem_map(user_mem_maps, vaddr, iova, len);
-       if (!map) {
-               RTE_LOG(ERR, EAL, "Couldn't find previously mapped region\n");
-               rte_errno = EINVAL;
-               ret = -1;
-               goto out;
-       }
-       if (map->addr != vaddr || map->iova != iova || map->len != len) {
-               /* we're partially unmapping a previously mapped region, so we
-                * need to split entry into two.
-                */
-               if (user_mem_maps->n_maps == VFIO_MAX_USER_MEM_MAPS) {
-                       RTE_LOG(ERR, EAL, "Not enough space to store partial mapping\n");
-                       rte_errno = ENOMEM;
-                       ret = -1;
-                       goto out;
-               }
-               new_map = &user_mem_maps->maps[user_mem_maps->n_maps++];
-       }
-
-       /* unmap the entry */
-       if (vfio_dma_mem_map(vfio_cfg, vaddr, iova, len, 0)) {
-               /* there may not be any devices plugged in, so unmapping will
-                * fail with ENODEV/ENOTSUP rte_errno values, but that doesn't
-                * stop us from removing the mapping, as the assumption is we
-                * won't be needing this memory any more and thus will want to
-                * prevent it from being remapped again on hotplug. so, only
-                * fail if we indeed failed to unmap (e.g. if the mapping was
-                * within our mapped range but had invalid alignment).
-                */
-               if (rte_errno != ENODEV && rte_errno != ENOTSUP) {
-                       RTE_LOG(ERR, EAL, "Couldn't unmap region for DMA\n");
-                       ret = -1;
-                       goto out;
-               } else {
-                       RTE_LOG(DEBUG, EAL, "DMA unmapping failed, but removing mappings anyway\n");
-               }
-       }
-       /* remove map from the list of active mappings */
-       if (new_map != NULL) {
-               adjust_map(map, new_map, vaddr, len);
-
-               /* if we've created a new map by splitting, sort everything */
-               if (!is_null_map(new_map)) {
-                       compact_user_maps(user_mem_maps);
-               } else {
-                       /* we've created a new mapping, but it was unused */
-                       user_mem_maps->n_maps--;
-               }
-       } else {
-               memset(map, 0, sizeof(*map));
-               compact_user_maps(user_mem_maps);
-               user_mem_maps->n_maps--;
-       }
-
-out:
-       rte_spinlock_recursive_unlock(&user_mem_maps->lock);
-       return ret;
-}
-
-int
-rte_vfio_dma_map(uint64_t vaddr, uint64_t iova, uint64_t len)
-{
-       if (len == 0) {
-               rte_errno = EINVAL;
-               return -1;
-       }
-
-       return container_dma_map(default_vfio_cfg, vaddr, iova, len);
-}
-
-int
-rte_vfio_dma_unmap(uint64_t vaddr, uint64_t iova, uint64_t len)
-{
-       if (len == 0) {
-               rte_errno = EINVAL;
-               return -1;
-       }
-
-       return container_dma_unmap(default_vfio_cfg, vaddr, iova, len);
-}
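
A hedged sketch of the intended use of these two wrappers: registering an externally allocated, DMA-capable buffer with the default container. The buffer and IOVA are placeholders supplied by the caller; in IOVA-as-VA mode the IOVA is commonly just the virtual address:

#include <stdint.h>
#include <rte_vfio.h>
#include <rte_errno.h>

/* sketch: 'buf' must be backed by memory the IOMMU can map */
static int
example_map_external(void *buf, uint64_t iova, uint64_t len)
{
	uint64_t va = (uint64_t)(uintptr_t)buf;

	if (rte_vfio_dma_map(va, iova, len) < 0)
		return -rte_errno; /* ENODEV/ENOTSUP if no device is attached yet */

	/* ... device DMA to/from buf ... */

	return rte_vfio_dma_unmap(va, iova, len);
}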
-
-int
-rte_vfio_noiommu_is_enabled(void)
-{
-       int fd;
-       ssize_t cnt;
-       char c;
-
-       fd = open(VFIO_NOIOMMU_MODE, O_RDONLY);
-       if (fd < 0) {
-               if (errno != ENOENT) {
-                       RTE_LOG(ERR, EAL, "  cannot open VFIO noiommu file, error %i (%s)\n",
-                                       errno, strerror(errno));
-                       return -1;
-               }
-               /*
-                * otherwise the file does not exist,
-                * i.e. noiommu is not enabled
-                */
-               return 0;
-       }
-
-       cnt = read(fd, &c, 1);
-       close(fd);
-       if (cnt != 1) {
-               RTE_LOG(ERR, EAL, "  unable to read from VFIO noiommu "
-                               "file, error %i (%s)\n", errno, strerror(errno));
-               return -1;
-       }
-
-       return c == 'Y';
-}
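
The tri-state return above (1 enabled, 0 disabled, -1 error) is easy to misuse as a plain boolean; a hedged sketch of careful handling:

#include <rte_vfio.h>

/* sketch: check unsafe no-IOMMU mode before deciding how to program IOVAs */
static void
example_check_noiommu(void)
{
	int ret = rte_vfio_noiommu_is_enabled();

	if (ret < 0) {
		/* could not read the noiommu parameter file (VFIO_NOIOMMU_MODE) */
	} else if (ret == 1) {
		/* unsafe no-IOMMU mode: no IOMMU translation is performed */
	} else {
		/* regular IOMMU-backed VFIO */
	}
}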
-
-int
-rte_vfio_container_create(void)
-{
-       int i;
-
-       /* Find an empty slot to store new vfio config */
-       for (i = 1; i < VFIO_MAX_CONTAINERS; i++) {
-               if (vfio_cfgs[i].vfio_container_fd == -1)
-                       break;
-       }
-
-       if (i == VFIO_MAX_CONTAINERS) {
-               RTE_LOG(ERR, EAL, "exceeded max VFIO container limit\n");
-               return -1;
-       }
-
-       vfio_cfgs[i].vfio_container_fd = rte_vfio_get_container_fd();
-       if (vfio_cfgs[i].vfio_container_fd < 0) {
-               RTE_LOG(NOTICE, EAL, "failed to create a new container\n");
-               return -1;
-       }
-
-       return vfio_cfgs[i].vfio_container_fd;
-}
-
-int __rte_experimental
-rte_vfio_container_destroy(int container_fd)
-{
-       struct vfio_config *vfio_cfg;
-       int i;
-
-       vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
-       if (vfio_cfg == NULL) {
-               RTE_LOG(ERR, EAL, "Invalid container fd\n");
-               return -1;
-       }
-
-       for (i = 0; i < VFIO_MAX_GROUPS; i++)
-               if (vfio_cfg->vfio_groups[i].group_num != -1)
-                       rte_vfio_container_group_unbind(container_fd,
-                               vfio_cfg->vfio_groups[i].group_num);
-
-       close(container_fd);
-       vfio_cfg->vfio_container_fd = -1;
-       vfio_cfg->vfio_active_groups = 0;
-       vfio_cfg->vfio_iommu_type = NULL;
-
-       return 0;
-}
-
-int
-rte_vfio_container_group_bind(int container_fd, int iommu_group_num)
-{
-       struct vfio_config *vfio_cfg;
-
-       vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
-       if (vfio_cfg == NULL) {
-               RTE_LOG(ERR, EAL, "Invalid container fd\n");
-               return -1;
-       }
-
-       return vfio_get_group_fd(vfio_cfg, iommu_group_num);
-}
-
-int
-rte_vfio_container_group_unbind(int container_fd, int iommu_group_num)
-{
-       struct vfio_config *vfio_cfg;
-       struct vfio_group *cur_grp = NULL;
-       int i;
-
-       vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
-       if (vfio_cfg == NULL) {
-               RTE_LOG(ERR, EAL, "Invalid container fd\n");
-               return -1;
-       }
-
-       for (i = 0; i < VFIO_MAX_GROUPS; i++) {
-               if (vfio_cfg->vfio_groups[i].group_num == iommu_group_num) {
-                       cur_grp = &vfio_cfg->vfio_groups[i];
-                       break;
-               }
-       }
-
-       /* This should not happen */
-       if (i == VFIO_MAX_GROUPS || cur_grp == NULL) {
-               RTE_LOG(ERR, EAL, "Specified group number not found\n");
-               return -1;
-       }
-
-       if (cur_grp->fd >= 0 && close(cur_grp->fd) < 0) {
-               RTE_LOG(ERR, EAL, "Error when closing vfio_group_fd for"
-                       " iommu_group_num %d\n", iommu_group_num);
-               return -1;
-       }
-       cur_grp->group_num = -1;
-       cur_grp->fd = -1;
-       cur_grp->devices = 0;
-       vfio_cfg->vfio_active_groups--;
-
-       return 0;
-}
-
-int
-rte_vfio_container_dma_map(int container_fd, uint64_t vaddr, uint64_t iova,
-               uint64_t len)
-{
-       struct vfio_config *vfio_cfg;
-
-       if (len == 0) {
-               rte_errno = EINVAL;
-               return -1;
-       }
-
-       vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
-       if (vfio_cfg == NULL) {
-               RTE_LOG(ERR, EAL, "Invalid container fd\n");
-               return -1;
-       }
-
-       return container_dma_map(vfio_cfg, vaddr, iova, len);
-}
-
-int
-rte_vfio_container_dma_unmap(int container_fd, uint64_t vaddr, uint64_t iova,
-               uint64_t len)
-{
-       struct vfio_config *vfio_cfg;
-
-       if (len == 0) {
-               rte_errno = EINVAL;
-               return -1;
-       }
-
-       vfio_cfg = get_vfio_cfg_by_container_fd(container_fd);
-       if (vfio_cfg == NULL) {
-               RTE_LOG(ERR, EAL, "Invalid container fd\n");
-               return -1;
-       }
-
-       return container_dma_unmap(vfio_cfg, vaddr, iova, len);
-}
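
Taken together, the container API above supports giving a device its own IOMMU domain instead of the default container. A hedged sketch of that workflow; group_num, vaddr, iova and len are placeholders supplied by the caller (group_num would normally come from rte_vfio_get_group_num()):

#include <stdint.h>
#include <rte_vfio.h>

static int
example_custom_container(int group_num, uint64_t vaddr, uint64_t iova,
		uint64_t len)
{
	int container_fd = rte_vfio_container_create();

	if (container_fd < 0)
		return -1;

	if (rte_vfio_container_group_bind(container_fd, group_num) < 0)
		goto err;
	if (rte_vfio_container_dma_map(container_fd, vaddr, iova, len) < 0)
		goto err;

	/* ... device I/O using this container ... */

	rte_vfio_container_dma_unmap(container_fd, vaddr, iova, len);
	rte_vfio_container_destroy(container_fd);
	return 0;
err:
	rte_vfio_container_destroy(container_fd);
	return -1;
}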
-
-#else
-
-int
-rte_vfio_dma_map(uint64_t __rte_unused vaddr, __rte_unused uint64_t iova,
-                 __rte_unused uint64_t len)
-{
-       return -1;
-}
-
-int
-rte_vfio_dma_unmap(uint64_t __rte_unused vaddr, uint64_t __rte_unused iova,
-                   __rte_unused uint64_t len)
-{
-       return -1;
-}
-
-int
-rte_vfio_setup_device(__rte_unused const char *sysfs_base,
-               __rte_unused const char *dev_addr,
-               __rte_unused int *vfio_dev_fd,
-               __rte_unused struct vfio_device_info *device_info)
-{
-       return -1;
-}
-
-int
-rte_vfio_release_device(__rte_unused const char *sysfs_base,
-               __rte_unused const char *dev_addr, __rte_unused int fd)
-{
-       return -1;
-}
-
-int
-rte_vfio_enable(__rte_unused const char *modname)
-{
-       return -1;
-}
-
-int
-rte_vfio_is_enabled(__rte_unused const char *modname)
-{
-       return -1;
-}
-
-int
-rte_vfio_noiommu_is_enabled(void)
-{
-       return -1;
-}
-
-int
-rte_vfio_clear_group(__rte_unused int vfio_group_fd)
-{
-       return -1;
-}
-
-int
-rte_vfio_get_group_num(__rte_unused const char *sysfs_base,
-               __rte_unused const char *dev_addr,
-               __rte_unused int *iommu_group_num)
-{
-       return -1;
-}
-
-int
-rte_vfio_get_container_fd(void)
-{
-       return -1;
-}
-
-int
-rte_vfio_get_group_fd(__rte_unused int iommu_group_num)
-{
-       return -1;
-}
-
-int
-rte_vfio_container_create(void)
-{
-       return -1;
-}
-
-int
-rte_vfio_container_destroy(__rte_unused int container_fd)
-{
-       return -1;
-}
-
-int
-rte_vfio_container_group_bind(__rte_unused int container_fd,
-               __rte_unused int iommu_group_num)
-{
-       return -1;
-}
-
-int
-rte_vfio_container_group_unbind(__rte_unused int container_fd,
-               __rte_unused int iommu_group_num)
-{
-       return -1;
-}
-
-int
-rte_vfio_container_dma_map(__rte_unused int container_fd,
-               __rte_unused uint64_t vaddr,
-               __rte_unused uint64_t iova,
-               __rte_unused uint64_t len)
-{
-       return -1;
-}
-
-int
-rte_vfio_container_dma_unmap(__rte_unused int container_fd,
-               __rte_unused uint64_t vaddr,
-               __rte_unused uint64_t iova,
-               __rte_unused uint64_t len)
-{
-       return -1;
-}
-
-#endif /* VFIO_PRESENT */
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
deleted file mode 100644 (file)
index cb2d35f..0000000
+++ /dev/null
@@ -1,158 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2014 Intel Corporation
- */
-
-#ifndef EAL_VFIO_H_
-#define EAL_VFIO_H_
-
-#include <rte_common.h>
-
-/*
- * determine if VFIO is present on the system
- */
-#if !defined(VFIO_PRESENT) && defined(RTE_EAL_VFIO)
-#include <linux/version.h>
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 6, 0)
-#define VFIO_PRESENT
-#else
-#pragma message("VFIO configured but not supported by this kernel, disabling.")
-#endif /* kernel version >= 3.6.0 */
-#endif /* RTE_EAL_VFIO */
-
-#ifdef VFIO_PRESENT
-
-#include <stdint.h>
-#include <linux/vfio.h>
-
-#define RTE_VFIO_TYPE1 VFIO_TYPE1_IOMMU
-
-#ifndef VFIO_SPAPR_TCE_v2_IOMMU
-#define RTE_VFIO_SPAPR 7
-#define VFIO_IOMMU_SPAPR_REGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 17)
-#define VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 18)
-#define VFIO_IOMMU_SPAPR_TCE_CREATE _IO(VFIO_TYPE, VFIO_BASE + 19)
-#define VFIO_IOMMU_SPAPR_TCE_REMOVE _IO(VFIO_TYPE, VFIO_BASE + 20)
-
-struct vfio_iommu_spapr_register_memory {
-       uint32_t argsz;
-       uint32_t flags;
-       uint64_t vaddr;
-       uint64_t size;
-};
-
-struct vfio_iommu_spapr_tce_create {
-       uint32_t argsz;
-       uint32_t flags;
-       /* in */
-       uint32_t page_shift;
-       uint32_t __resv1;
-       uint64_t window_size;
-       uint32_t levels;
-       uint32_t __resv2;
-       /* out */
-       uint64_t start_addr;
-};
-
-struct vfio_iommu_spapr_tce_remove {
-       uint32_t argsz;
-       uint32_t flags;
-       /* in */
-       uint64_t start_addr;
-};
-
-struct vfio_iommu_spapr_tce_ddw_info {
-       uint64_t pgsizes;
-       uint32_t max_dynamic_windows_supported;
-       uint32_t levels;
-};
-
-/* SPAPR_v2 is not present, but SPAPR might be */
-#ifndef VFIO_SPAPR_TCE_IOMMU
-#define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
-
-struct vfio_iommu_spapr_tce_info {
-       uint32_t argsz;
-       uint32_t flags;
-       uint32_t dma32_window_start;
-       uint32_t dma32_window_size;
-       struct vfio_iommu_spapr_tce_ddw_info ddw;
-};
-#endif /* VFIO_SPAPR_TCE_IOMMU */
-
-#else /* VFIO_SPAPR_TCE_v2_IOMMU */
-#define RTE_VFIO_SPAPR VFIO_SPAPR_TCE_v2_IOMMU
-#endif
-
-#define VFIO_MAX_GROUPS RTE_MAX_VFIO_GROUPS
-#define VFIO_MAX_CONTAINERS RTE_MAX_VFIO_CONTAINERS
-
-/*
- * We don't need to store device fds anywhere since they can be obtained from
- * the group fd via an ioctl() call.
- */
-struct vfio_group {
-       int group_num;
-       int fd;
-       int devices;
-};
-
-/* DMA mapping function prototype.
- * Takes VFIO container fd as a parameter.
- * Returns 0 on success, -1 on error.
- */
-typedef int (*vfio_dma_func_t)(int);
-
-/* Custom memory region DMA mapping function prototype.
- * Takes VFIO container fd, virtual address, physical address, length and
- * operation type (0 to unmap, 1 to map) as parameters.
- * Returns 0 on success, -1 on error.
- */
-typedef int (*vfio_dma_user_func_t)(int fd, uint64_t vaddr, uint64_t iova,
-               uint64_t len, int do_map);
-
-struct vfio_iommu_type {
-       int type_id;
-       const char *name;
-       vfio_dma_user_func_t dma_user_map_func;
-       vfio_dma_func_t dma_map_func;
-};
-
-/* get the vfio container that devices are bound to by default */
-int vfio_get_default_container_fd(void);
-
-/* pick IOMMU type. returns a pointer to vfio_iommu_type or NULL for error */
-const struct vfio_iommu_type *
-vfio_set_iommu_type(int vfio_container_fd);
-
-int
-vfio_get_iommu_type(void);
-
-/* check if we have any supported extensions */
-int
-vfio_has_supported_extensions(int vfio_container_fd);
-
-int vfio_mp_sync_setup(void);
-
-#define EAL_VFIO_MP "eal_vfio_mp_sync"
-
-#define SOCKET_REQ_CONTAINER 0x100
-#define SOCKET_REQ_GROUP 0x200
-#define SOCKET_REQ_DEFAULT_CONTAINER 0x400
-#define SOCKET_REQ_IOMMU_TYPE 0x800
-#define SOCKET_OK 0x0
-#define SOCKET_NO_FD 0x1
-#define SOCKET_ERR 0xFF
-
-struct vfio_mp_param {
-       int req;
-       int result;
-       RTE_STD_C11
-       union {
-               int group_num;
-               int iommu_type_id;
-       };
-};
-
-#endif /* VFIO_PRESENT */
-
-#endif /* EAL_VFIO_H_ */
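
As context for the header above: the vfio_dma_user_func_t callback is implemented per IOMMU type in eal_vfio.c. A rough sketch of what such a callback looks like for the type1 IOMMU, using only the standard ioctls and structures from <linux/vfio.h> (the function name is illustrative, not the removed code):

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Illustrative vfio_dma_user_func_t for the type1 IOMMU: map or unmap a
 * contiguous region in the container's IOMMU, depending on do_map. */
static int
type1_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
                uint64_t len, int do_map)
{
        int ret;

        if (do_map) {
                struct vfio_iommu_type1_dma_map dma_map;

                memset(&dma_map, 0, sizeof(dma_map));
                dma_map.argsz = sizeof(dma_map);
                dma_map.vaddr = vaddr;
                dma_map.iova = iova;
                dma_map.size = len;
                dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
                                VFIO_DMA_MAP_FLAG_WRITE;
                ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
        } else {
                struct vfio_iommu_type1_dma_unmap dma_unmap;

                memset(&dma_unmap, 0, sizeof(dma_unmap));
                dma_unmap.argsz = sizeof(dma_unmap);
                dma_unmap.iova = iova;
                dma_unmap.size = len;
                ret = ioctl(vfio_container_fd, VFIO_IOMMU_UNMAP_DMA,
                                &dma_unmap);
        }
        return ret == 0 ? 0 : -1;
}

A struct vfio_iommu_type entry would then reference such a function through its dma_user_map_func member.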
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
deleted file mode 100644 (file)
index 2a47f29..0000000
+++ /dev/null
@@ -1,119 +0,0 @@
-/* SPDX-License-Identifier: BSD-3-Clause
- * Copyright(c) 2010-2018 Intel Corporation
- */
-
-#include <unistd.h>
-#include <string.h>
-
-#include <rte_compat.h>
-#include <rte_log.h>
-#include <rte_vfio.h>
-#include <rte_eal.h>
-
-#include "eal_vfio.h"
-
-/**
- * @file
- * VFIO socket for communication between primary and secondary processes.
- *
- * This file is only compiled if CONFIG_RTE_EAL_VFIO is set to "y".
- */
-
-#ifdef VFIO_PRESENT
-
-static int
-vfio_mp_primary(const struct rte_mp_msg *msg, const void *peer)
-{
-       int fd = -1;
-       int ret;
-       struct rte_mp_msg reply;
-       struct vfio_mp_param *r = (struct vfio_mp_param *)reply.param;
-       const struct vfio_mp_param *m =
-               (const struct vfio_mp_param *)msg->param;
-
-       if (msg->len_param != sizeof(*m)) {
-               RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
-               return -1;
-       }
-
-       memset(&reply, 0, sizeof(reply));
-
-       switch (m->req) {
-       case SOCKET_REQ_GROUP:
-               r->req = SOCKET_REQ_GROUP;
-               r->group_num = m->group_num;
-               fd = rte_vfio_get_group_fd(m->group_num);
-               if (fd < 0)
-                       r->result = SOCKET_ERR;
-               else if (fd == 0)
-                       /* if VFIO group exists but isn't bound to VFIO driver */
-                       r->result = SOCKET_NO_FD;
-               else {
-                       /* if group exists and is bound to VFIO driver */
-                       r->result = SOCKET_OK;
-                       reply.num_fds = 1;
-                       reply.fds[0] = fd;
-               }
-               break;
-       case SOCKET_REQ_CONTAINER:
-               r->req = SOCKET_REQ_CONTAINER;
-               fd = rte_vfio_get_container_fd();
-               if (fd < 0)
-                       r->result = SOCKET_ERR;
-               else {
-                       r->result = SOCKET_OK;
-                       reply.num_fds = 1;
-                       reply.fds[0] = fd;
-               }
-               break;
-       case SOCKET_REQ_DEFAULT_CONTAINER:
-               r->req = SOCKET_REQ_DEFAULT_CONTAINER;
-               fd = vfio_get_default_container_fd();
-               if (fd < 0)
-                       r->result = SOCKET_ERR;
-               else {
-                       r->result = SOCKET_OK;
-                       reply.num_fds = 1;
-                       reply.fds[0] = fd;
-               }
-               break;
-       case SOCKET_REQ_IOMMU_TYPE:
-       {
-               int iommu_type_id;
-
-               r->req = SOCKET_REQ_IOMMU_TYPE;
-
-               iommu_type_id = vfio_get_iommu_type();
-
-               if (iommu_type_id < 0)
-                       r->result = SOCKET_ERR;
-               else {
-                       r->iommu_type_id = iommu_type_id;
-                       r->result = SOCKET_OK;
-               }
-               break;
-       }
-       default:
-               RTE_LOG(ERR, EAL, "vfio received invalid message!\n");
-               return -1;
-       }
-
-       strcpy(reply.name, EAL_VFIO_MP);
-       reply.len_param = sizeof(*r);
-
-       ret = rte_mp_reply(&reply, peer);
-       if (m->req == SOCKET_REQ_CONTAINER && fd >= 0)
-               close(fd);
-       return ret;
-}
-
-int
-vfio_mp_sync_setup(void)
-{
-       if (rte_eal_process_type() == RTE_PROC_PRIMARY)
-               return rte_mp_action_register(EAL_VFIO_MP, vfio_mp_primary);
-
-       return 0;
-}
-
-#endif
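
The handler above serves the primary-process side of the protocol; the requesting side (in eal_vfio.c) drives it over the rte_mp_* IPC API. A simplified sketch of such a request from a secondary process, asking for a group fd (the helper name and the 5-second timeout are illustrative):

#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <rte_eal.h>

#include "eal_vfio.h"

/* Sketch of the secondary-process side of SOCKET_REQ_GROUP handled above. */
static int
request_group_fd(int iommu_group_num)
{
        struct rte_mp_msg req;
        struct rte_mp_reply reply;
        struct vfio_mp_param *p = (struct vfio_mp_param *)req.param;
        struct timespec ts = { .tv_sec = 5, .tv_nsec = 0 };
        int fd = -1;

        memset(&req, 0, sizeof(req));
        strcpy(req.name, EAL_VFIO_MP);
        req.len_param = sizeof(*p);
        p->req = SOCKET_REQ_GROUP;
        p->group_num = iommu_group_num;

        if (rte_mp_request_sync(&req, &reply, &ts) == 0 &&
                        reply.nb_received == 1) {
                struct vfio_mp_param *r =
                                (struct vfio_mp_param *)reply.msgs[0].param;

                if (r->result == SOCKET_OK && reply.msgs[0].num_fds == 1)
                        fd = reply.msgs[0].fds[0];
                free(reply.msgs);
        }
        return fd;
}

This mirrors how vfio_mp_primary() replies: the result code travels in vfio_mp_param, while the file descriptor itself is carried in reply.fds[0].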
diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
deleted file mode 100644 (file)
index 5afa087..0000000
+++ /dev/null
@@ -1,139 +0,0 @@
-/* SPDX-License-Identifier: (BSD-3-Clause OR LGPL-2.1) */
-/*
- * Copyright(c) 2007-2014 Intel Corporation.
- */
-
-#ifndef _RTE_KNI_COMMON_H_
-#define _RTE_KNI_COMMON_H_
-
-#ifdef __KERNEL__
-#include <linux/if.h>
-#include <asm/barrier.h>
-#define RTE_STD_C11
-#else
-#include <rte_common.h>
-#include <rte_config.h>
-#endif
-
-/**
- * KNI name is part of memzone name.
- */
-#define RTE_KNI_NAMESIZE 32
-
-#define RTE_CACHE_LINE_MIN_SIZE 64
-
-/*
- * Request id.
- */
-enum rte_kni_req_id {
-       RTE_KNI_REQ_UNKNOWN = 0,
-       RTE_KNI_REQ_CHANGE_MTU,
-       RTE_KNI_REQ_CFG_NETWORK_IF,
-       RTE_KNI_REQ_CHANGE_MAC_ADDR,
-       RTE_KNI_REQ_CHANGE_PROMISC,
-       RTE_KNI_REQ_MAX,
-};
-
-/*
- * Structure for KNI request.
- */
-struct rte_kni_request {
-       uint32_t req_id;             /**< Request id */
-       RTE_STD_C11
-       union {
-               uint32_t new_mtu;    /**< New MTU */
-               uint8_t if_up;       /**< 1: interface up, 0: interface down */
-               uint8_t mac_addr[6]; /**< MAC address for interface */
-               uint8_t promiscusity;/**< 1: promisc mode enable, 0: disable */
-       };
-       int32_t result;               /**< Result for processing request */
-} __attribute__((__packed__));
-
-/*
- * Fifo struct mapped in shared memory. It describes a circular buffer FIFO.
- * Write and read should wrap around. The FIFO is empty when write == read.
- * Writing should never overwrite the read position.
- */
-struct rte_kni_fifo {
-#ifdef RTE_USE_C11_MEM_MODEL
-       unsigned write;              /**< Next position to be written*/
-       unsigned read;               /**< Next position to be read */
-#else
-       volatile unsigned write;     /**< Next position to be written*/
-       volatile unsigned read;      /**< Next position to be read */
-#endif
-       unsigned len;                /**< Circular buffer length */
-       unsigned elem_size;          /**< Pointer size - for 32/64 bit OS */
-       void *volatile buffer[];     /**< The buffer contains mbuf pointers */
-};
-
-/*
- * The kernel image of the rte_mbuf struct, with only the relevant fields.
- * Padding is necessary to preserve the offsets of these fields.
- */
-struct rte_kni_mbuf {
-       void *buf_addr __attribute__((__aligned__(RTE_CACHE_LINE_SIZE)));
-       uint64_t buf_physaddr;
-       uint16_t data_off;      /**< Start address of data in segment buffer. */
-       char pad1[2];
-       uint16_t nb_segs;       /**< Number of segments. */
-       char pad4[2];
-       uint64_t ol_flags;      /**< Offload features. */
-       char pad2[4];
-       uint32_t pkt_len;       /**< Total pkt len: sum of all segment data_len. */
-       uint16_t data_len;      /**< Amount of data in segment buffer. */
-
-       /* fields on second cache line */
-       char pad3[8] __attribute__((__aligned__(RTE_CACHE_LINE_MIN_SIZE)));
-       void *pool;
-       void *next;
-};
-
-/*
- * Struct used to create a KNI device. Passed to the kernel in an IOCTL call.
- */
-
-struct rte_kni_device_info {
-       char name[RTE_KNI_NAMESIZE];  /**< Network device name for KNI */
-
-       phys_addr_t tx_phys;
-       phys_addr_t rx_phys;
-       phys_addr_t alloc_phys;
-       phys_addr_t free_phys;
-
-       /* Used by Ethtool */
-       phys_addr_t req_phys;
-       phys_addr_t resp_phys;
-       phys_addr_t sync_phys;
-       void * sync_va;
-
-       /* mbuf mempool */
-       void * mbuf_va;
-       phys_addr_t mbuf_phys;
-
-       /* PCI info */
-       uint16_t vendor_id;           /**< Vendor ID or PCI_ANY_ID. */
-       uint16_t device_id;           /**< Device ID or PCI_ANY_ID. */
-       uint8_t bus;                  /**< Device bus */
-       uint8_t devid;                /**< Device ID */
-       uint8_t function;             /**< Device function. */
-
-       uint16_t group_id;            /**< Group ID */
-       uint32_t core_id;             /**< core ID to bind for kernel thread */
-
-       __extension__
-       uint8_t force_bind : 1;       /**< Flag for kernel thread binding */
-
-       /* mbuf size */
-       unsigned mbuf_size;
-       unsigned int mtu;
-       char mac_addr[6];
-};
-
-#define KNI_DEVICE "kni"
-
-#define RTE_KNI_IOCTL_TEST    _IOWR(0, 1, int)
-#define RTE_KNI_IOCTL_CREATE  _IOWR(0, 2, struct rte_kni_device_info)
-#define RTE_KNI_IOCTL_RELEASE _IOWR(0, 3, struct rte_kni_device_info)
-
-#endif /* _RTE_KNI_COMMON_H_ */
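
Since the rte_kni_fifo comment above only states the invariants (the FIFO is empty when write == read, and a write must never overwrite an unread slot), a small sketch of a single-producer enqueue that respects them may help. The real helpers in librte_kni and the kernel module also insert the required memory barriers, which are omitted here, and the function name is invented:

/* Illustrative single-producer enqueue for the rte_kni_fifo layout above. */
static unsigned
fifo_put(struct rte_kni_fifo *fifo, void **data, unsigned num)
{
        unsigned i;
        unsigned fifo_write = fifo->write;
        unsigned fifo_read = fifo->read;

        for (i = 0; i < num; i++) {
                unsigned new_write = (fifo_write + 1) % fifo->len;

                if (new_write == fifo_read)
                        break;  /* full: advancing would overwrite unread data */
                fifo->buffer[fifo_write] = data[i];
                fifo_write = new_write;
        }
        fifo->write = fifo_write;
        return i;  /* number of pointers actually enqueued */
}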
diff --git a/lib/librte_eal/linuxapp/eal/meson.build b/lib/librte_eal/linuxapp/eal/meson.build
deleted file mode 100644 (file)
index 7e68b2c..0000000
+++ /dev/null
@@ -1,29 +0,0 @@
-# SPDX-License-Identifier: BSD-3-Clause
-# Copyright(c) 2017 Intel Corporation
-
-eal_inc += include_directories('include')
-install_subdir('include/exec-env', install_dir: get_option('includedir'))
-
-env_objs = []
-env_headers = []
-env_sources = files('eal_alarm.c',
-               'eal_cpuflags.c',
-               'eal_debug.c',
-               'eal_hugepage_info.c',
-               'eal_interrupts.c',
-               'eal_memalloc.c',
-               'eal_lcore.c',
-               'eal_log.c',
-               'eal_thread.c',
-               'eal_timer.c',
-               'eal_vfio.c',
-               'eal_vfio_mp_sync.c',
-               'eal.c',
-               'eal_memory.c',
-               'eal_dev.c',
-)
-
-deps += ['kvargs']
-if has_libnuma == 1
-       dpdk_conf.set10('RTE_EAL_NUMA_AWARE_HUGEPAGES', true)
-endif
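
The has_libnuma check above becomes the RTE_EAL_NUMA_AWARE_HUGEPAGES entry in rte_config.h, which the memory code listed in env_sources uses to guard its libnuma calls. A schematic of that compile-time guard (the helper is hypothetical; numa_available() and numa_max_node() are standard libnuma calls):

#include <rte_config.h>

#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
#include <numa.h>
#endif

/* Hypothetical helper: only call into libnuma when the build detected it. */
static int
numa_node_count(void)
{
#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
        if (numa_available() != -1)
                return numa_max_node() + 1;
#endif
        return 1; /* treat the system as a single node without NUMA support */
}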
index 4ae0efc..cb8d109 100644 (file)
@@ -11,7 +11,7 @@ subdir('common') # defines common_sources, common_objs, etc.
 # The <exec-env>/eal/meson.build file should define env_sources, etc.
 if host_machine.system() == 'linux'
        dpdk_conf.set('RTE_EXEC_ENV_LINUXAPP', 1)
-       subdir('linuxapp/eal')
+       subdir('linux/eal')
 
 elif host_machine.system() == 'freebsd'
        dpdk_conf.set('RTE_EXEC_ENV_BSDAPP', 1)
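
Only the subdirectory path changes in this hunk; the RTE_EXEC_ENV_LINUXAPP define is still set, so code guarded on it keeps building unmodified. Purely as an illustration of that guard (nothing here is from the patch):

#include <rte_config.h>

#ifdef RTE_EXEC_ENV_LINUXAPP
/* Linux-only code path (now built from lib/librte_eal/linux/eal) */
#elif defined(RTE_EXEC_ENV_BSDAPP)
/* FreeBSD-only code path */
#endif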