eal/windows: implement basic memory management
authorDmitry Kozlyuk <dmitry.kozliuk@gmail.com>
Mon, 15 Jun 2020 00:43:54 +0000 (03:43 +0300)
committerThomas Monjalon <thomas@monjalon.net>
Mon, 15 Jun 2020 17:30:54 +0000 (19:30 +0200)
Basic memory management supports core libraries and PMDs operating in
IOVA as PA mode. It uses a kernel-mode driver, virt2phys, to obtain
IOVAs of hugepages allocated from user-mode. Multi-process mode is not
implemented and is forcefully disabled at startup. Assign myself as a
maintainer for Windows file and memory management implementation.

Signed-off-by: Dmitry Kozlyuk <dmitry.kozliuk@gmail.com>
18 files changed:
MAINTAINERS
config/meson.build
doc/guides/windows_gsg/run_apps.rst
lib/librte_eal/common/meson.build
lib/librte_eal/common/rte_malloc.c
lib/librte_eal/rte_eal_exports.def
lib/librte_eal/windows/eal.c
lib/librte_eal/windows/eal_file.c [new file with mode: 0644]
lib/librte_eal/windows/eal_memalloc.c [new file with mode: 0644]
lib/librte_eal/windows/eal_memory.c [new file with mode: 0644]
lib/librte_eal/windows/eal_mp.c [new file with mode: 0644]
lib/librte_eal/windows/eal_windows.h
lib/librte_eal/windows/include/meson.build
lib/librte_eal/windows/include/rte_os.h
lib/librte_eal/windows/include/rte_virt2phys.h [new file with mode: 0644]
lib/librte_eal/windows/include/rte_windows.h
lib/librte_eal/windows/include/unistd.h
lib/librte_eal/windows/meson.build

index e8ece02..d5f42b3 100644 (file)
@@ -337,6 +337,7 @@ F: doc/guides/windows_gsg/
 Windows memory allocation
 M: Dmitry Kozlyuk <dmitry.kozliuk@gmail.com>
 F: lib/librte_eal/windows/eal_hugepages.c
+F: lib/librte_eal/windows/eal_mem*
 
 
 Core Libraries
index c1e80de..d3f05f8 100644 (file)
@@ -261,15 +261,21 @@ if is_freebsd
 endif
 
 if is_windows
-       # Minimum supported API is Windows 7.
-       add_project_arguments('-D_WIN32_WINNT=0x0601', language: 'c')
+       # VirtualAlloc2() is available since Windows 10 / Server 2016.
+       add_project_arguments('-D_WIN32_WINNT=0x0A00', language: 'c')
 
        # Use MinGW-w64 stdio, because DPDK assumes ANSI-compliant formatting.
        if cc.get_id() == 'gcc'
                add_project_arguments('-D__USE_MINGW_ANSI_STDIO', language: 'c')
        endif
 
-       add_project_link_arguments('-ladvapi32', language: 'c')
+       # Contrary to docs, VirtualAlloc2() is exported by mincore.lib
+       # in Windows SDK, while MinGW exports it by advapi32.a.
+       if is_ms_linker
+               add_project_link_arguments('-lmincore', language: 'c')
+       endif
+
+       add_project_link_arguments('-ladvapi32', '-lsetupapi', language: 'c')
 endif
 
 if get_option('b_lto')
index 9c9f42e..78e5a61 100644 (file)
@@ -27,6 +27,54 @@ See `Large-Page Support`_ in MSDN for details.
 .. _Large-Page Support: https://docs.microsoft.com/en-us/windows/win32/memory/large-page-support
 
 
+Load virt2phys Driver
+---------------------
+
+Access to physical addresses is provided by a kernel-mode driver, virt2phys.
+It is mandatory at least for using hardware PMDs, but may also be required
+for mempools.
+
+Refer to documentation in ``dpdk-kmods`` repository for details on system
+setup, driver build and installation. This driver is not signed, so signature
+checking must be disabled to load it.
+
+.. warning::
+
+    Disabling driver signature enforcement weakens OS security.
+    It is discouraged in production environments.
+
+The compiled package consists of ``virt2phys.inf``, ``virt2phys.cat``,
+and ``virt2phys.sys``. It can be installed as follows
+from an elevated Command Prompt:
+
+.. code-block:: console
+
+    pnputil /add-driver Z:\path\to\virt2phys.inf /install
+
+On Windows Server additional steps are required:
+
+1. From Device Manager, Action menu, select "Add legacy hardware".
+2. It will launch the "Add Hardware Wizard". Click "Next".
+3. Select second option "Install the hardware that I manually select
+   from a list (Advanced)".
+4. On the next screen, "Kernel bypass" will be shown as a device class.
+5. Select it, and click "Next".
+6. The previously installed drivers will now be installed for the
+   "Virtual to physical address translator" device.
+
+When loaded successfully, the driver is shown in *Device Manager* as *Virtual
+to physical address translator* device under *Kernel bypass* category.
+Installed driver persists across reboots.
+
+If DPDK is unable to communicate with the driver, a warning is printed
+on initialization (debug-level logs provide more details):
+
+.. code-block:: text
+
+    EAL: Cannot open virt2phys driver interface
+
+
+
 Run the ``helloworld`` Example
 ------------------------------
 
index 4e92081..3108442 100644 (file)
@@ -8,13 +8,24 @@ if is_windows
                'eal_common_bus.c',
                'eal_common_class.c',
                'eal_common_devargs.c',
+               'eal_common_dynmem.c',
                'eal_common_errno.c',
+               'eal_common_fbarray.c',
                'eal_common_launch.c',
                'eal_common_lcore.c',
                'eal_common_log.c',
+               'eal_common_mcfg.c',
+               'eal_common_memalloc.c',
+               'eal_common_memory.c',
+               'eal_common_memzone.c',
                'eal_common_options.c',
+               'eal_common_string_fns.c',
+               'eal_common_tailqs.c',
                'eal_common_thread.c',
                'eal_common_trace_points.c',
+               'malloc_elem.c',
+               'malloc_heap.c',
+               'rte_malloc.c',
        )
        subdir_done()
 endif
index f1b7316..9d39e58 100644 (file)
@@ -20,6 +20,7 @@
 #include <rte_lcore.h>
 #include <rte_common.h>
 #include <rte_spinlock.h>
+
 #include <rte_eal_trace.h>
 
 #include <rte_malloc.h>
index c1bdee1..e2eb24f 100644 (file)
@@ -1,8 +1,128 @@
 EXPORTS
        __rte_panic
+       rte_calloc
+       rte_calloc_socket
+       rte_eal_get_configuration
+       rte_eal_has_hugepages
        rte_eal_init
+       rte_eal_iova_mode
        rte_eal_mp_remote_launch
        rte_eal_mp_wait_lcore
+       rte_eal_process_type
        rte_eal_remote_launch
        rte_log
+       rte_eal_tailq_lookup
+       rte_eal_tailq_register
+       rte_eal_using_phys_addrs
+       rte_free
+       rte_malloc
+       rte_malloc_dump_stats
+       rte_malloc_get_socket_stats
+       rte_malloc_set_limit
+       rte_malloc_socket
+       rte_malloc_validate
+       rte_malloc_virt2iova
+       rte_mcfg_mem_read_lock
+       rte_mcfg_mem_read_unlock
+       rte_mcfg_mem_write_lock
+       rte_mcfg_mem_write_unlock
+       rte_mcfg_mempool_read_lock
+       rte_mcfg_mempool_read_unlock
+       rte_mcfg_mempool_write_lock
+       rte_mcfg_mempool_write_unlock
+       rte_mcfg_tailq_read_lock
+       rte_mcfg_tailq_read_unlock
+       rte_mcfg_tailq_write_lock
+       rte_mcfg_tailq_write_unlock
+       rte_mem_lock_page
+       rte_mem_virt2iova
+       rte_mem_virt2phy
+       rte_memory_get_nchannel
+       rte_memory_get_nrank
+       rte_memzone_dump
+       rte_memzone_free
+       rte_memzone_lookup
+       rte_memzone_reserve
+       rte_memzone_reserve_aligned
+       rte_memzone_reserve_bounded
+       rte_memzone_walk
        rte_vlog
+       rte_realloc
+       rte_zmalloc
+       rte_zmalloc_socket
+
+       rte_mp_action_register
+       rte_mp_action_unregister
+       rte_mp_reply
+       rte_mp_sendmsg
+
+       rte_fbarray_attach
+       rte_fbarray_destroy
+       rte_fbarray_detach
+       rte_fbarray_dump_metadata
+       rte_fbarray_find_contig_free
+       rte_fbarray_find_contig_used
+       rte_fbarray_find_idx
+       rte_fbarray_find_next_free
+       rte_fbarray_find_next_n_free
+       rte_fbarray_find_next_n_used
+       rte_fbarray_find_next_used
+       rte_fbarray_get
+       rte_fbarray_init
+       rte_fbarray_is_used
+       rte_fbarray_set_free
+       rte_fbarray_set_used
+       rte_malloc_dump_heaps
+       rte_mem_alloc_validator_register
+       rte_mem_alloc_validator_unregister
+       rte_mem_check_dma_mask
+       rte_mem_event_callback_register
+       rte_mem_event_callback_unregister
+       rte_mem_iova2virt
+       rte_mem_virt2memseg
+       rte_mem_virt2memseg_list
+       rte_memseg_contig_walk
+       rte_memseg_list_walk
+       rte_memseg_walk
+       rte_mp_request_async
+       rte_mp_request_sync
+
+       rte_fbarray_find_prev_free
+       rte_fbarray_find_prev_n_free
+       rte_fbarray_find_prev_n_used
+       rte_fbarray_find_prev_used
+       rte_fbarray_find_rev_contig_free
+       rte_fbarray_find_rev_contig_used
+       rte_memseg_contig_walk_thread_unsafe
+       rte_memseg_list_walk_thread_unsafe
+       rte_memseg_walk_thread_unsafe
+
+       rte_malloc_heap_create
+       rte_malloc_heap_destroy
+       rte_malloc_heap_get_socket
+       rte_malloc_heap_memory_add
+       rte_malloc_heap_memory_attach
+       rte_malloc_heap_memory_detach
+       rte_malloc_heap_memory_remove
+       rte_malloc_heap_socket_is_external
+       rte_mem_check_dma_mask_thread_unsafe
+       rte_mem_set_dma_mask
+       rte_memseg_get_fd
+       rte_memseg_get_fd_offset
+       rte_memseg_get_fd_offset_thread_unsafe
+       rte_memseg_get_fd_thread_unsafe
+
+       rte_extmem_attach
+       rte_extmem_detach
+       rte_extmem_register
+       rte_extmem_unregister
+
+       rte_fbarray_find_biggest_free
+       rte_fbarray_find_biggest_used
+       rte_fbarray_find_rev_biggest_free
+       rte_fbarray_find_rev_biggest_used
+
+       rte_mem_lock
+       rte_mem_map
+       rte_mem_page_size
+       rte_mem_unmap
index c371c36..427a555 100644 (file)
@@ -94,6 +94,24 @@ eal_proc_type_detect(void)
        return ptype;
 }
 
+/* Return the process type recorded in the global rte_config. */
+enum rte_proc_type_t
+rte_eal_process_type(void)
+{
+       return rte_config.process_type;
+}
+
+/* Return non-zero unless internal_config.no_hugetlbfs is set. */
+int
+rte_eal_has_hugepages(void)
+{
+       return !internal_config.no_hugetlbfs;
+}
+
+/* Return the IOVA mode recorded in the global rte_config. */
+enum rte_iova_mode
+rte_eal_iova_mode(void)
+{
+       return rte_config.iova_mode;
+}
+
 /* display usage */
 static void
 eal_usage(const char *prgname)
@@ -256,7 +274,7 @@ __rte_trace_point_register(rte_trace_point_t *trace, const char *name,
        return -ENOTSUP;
 }
 
-/* Launch threads, called at application init(). */
+ /* Launch threads, called at application init(). */
 int
 rte_eal_init(int argc, char **argv)
 {
@@ -282,6 +300,13 @@ rte_eal_init(int argc, char **argv)
        if (fctret < 0)
                exit(1);
 
+       /* Prevent creation of shared memory files. */
+       if (internal_config.in_memory == 0) {
+               RTE_LOG(WARNING, EAL, "Multi-process support is requested, "
+                       "but not available.\n");
+               internal_config.in_memory = 1;
+       }
+
        if (!internal_config.no_hugetlbfs && (eal_hugepage_info_init() < 0)) {
                rte_eal_init_alert("Cannot get hugepage information");
                rte_errno = EACCES;
@@ -293,6 +318,42 @@ rte_eal_init(int argc, char **argv)
                        internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE;
        }
 
+       if (eal_mem_win32api_init() < 0) {
+               rte_eal_init_alert("Cannot access Win32 memory management");
+               rte_errno = ENOTSUP;
+               return -1;
+       }
+
+       if (eal_mem_virt2iova_init() < 0) {
+               /* Non-fatal error if physical addresses are not required. */
+               RTE_LOG(WARNING, EAL, "Cannot access virt2phys driver, "
+                       "PA will not be available\n");
+       }
+
+       if (rte_eal_memzone_init() < 0) {
+               rte_eal_init_alert("Cannot init memzone");
+               rte_errno = ENODEV;
+               return -1;
+       }
+
+       if (rte_eal_memory_init() < 0) {
+               rte_eal_init_alert("Cannot init memory");
+               rte_errno = ENOMEM;
+               return -1;
+       }
+
+       if (rte_eal_malloc_heap_init() < 0) {
+               rte_eal_init_alert("Cannot init malloc heap");
+               rte_errno = ENODEV;
+               return -1;
+       }
+
+       if (rte_eal_tailqs_init() < 0) {
+               rte_eal_init_alert("Cannot init tail queues for objects");
+               rte_errno = EFAULT;
+               return -1;
+       }
+
        eal_thread_init_master(rte_config.master_lcore);
 
        RTE_LCORE_FOREACH_SLAVE(i) {
diff --git a/lib/librte_eal/windows/eal_file.c b/lib/librte_eal/windows/eal_file.c
new file mode 100644 (file)
index 0000000..dfbe8d3
--- /dev/null
@@ -0,0 +1,125 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Dmitry Kozlyuk
+ */
+
+#include <fcntl.h>
+#include <io.h>
+#include <share.h>
+#include <sys/stat.h>
+
+#include "eal_private.h"
+#include "eal_windows.h"
+
+/**
+ * Open a file through the CRT low-level I/O layer.
+ *
+ * @param path
+ *  Path of the file to open.
+ * @param flags
+ *  EAL_OPEN_* flags. The access mode is taken from the
+ *  EAL_OPEN_READONLY / EAL_OPEN_READWRITE bits, any other combination
+ *  of those bits fails with ENOTSUP. EAL_OPEN_CREATE adds _O_CREAT.
+ * @return
+ *  File descriptor on success, (-1) on failure with rte_errno set.
+ */
+int
+eal_file_open(const char *path, int flags)
+{
+       static const int MODE_MASK = EAL_OPEN_READONLY | EAL_OPEN_READWRITE;
+
+       int fd, ret, sys_flags;
+
+       switch (flags & MODE_MASK) {
+       case EAL_OPEN_READONLY:
+               sys_flags = _O_RDONLY;
+               break;
+       case EAL_OPEN_READWRITE:
+               sys_flags = _O_RDWR;
+               break;
+       default:
+               rte_errno = ENOTSUP;
+               return -1;
+       }
+
+       if (flags & EAL_OPEN_CREATE)
+               sys_flags |= _O_CREAT;
+
+       /* _sopen_s() returns zero on success and an errno code (a positive
+        * value) on failure, so the result must be compared against zero
+        * and can be reported to the caller directly.
+        */
+       ret = _sopen_s(&fd, path, sys_flags, _SH_DENYNO, _S_IWRITE);
+       if (ret != 0) {
+               rte_errno = ret;
+               return -1;
+       }
+
+       return fd;
+}
+
+/**
+ * Set the size of an open file.
+ *
+ * @param fd
+ *  CRT file descriptor of the file to resize.
+ * @param size
+ *  New file size in bytes, must not be negative.
+ * @return
+ *  0 on success, (-1) on failure with rte_errno set.
+ */
+int
+eal_file_truncate(int fd, ssize_t size)
+{
+       HANDLE handle;
+       LARGE_INTEGER end;
+
+       if (size < 0) {
+               rte_errno = EINVAL;
+               return -1;
+       }
+
+       handle = (HANDLE)_get_osfhandle(fd);
+       if (handle == INVALID_HANDLE_VALUE) {
+               rte_errno = EBADF;
+               return -1;
+       }
+
+       /* The "Ex" variant reports failure through its return value,
+        * which avoids the ambiguity of INVALID_SET_FILE_POINTER being
+        * a valid low-order part of a 64-bit offset.
+        */
+       end.QuadPart = size;
+       if (!SetFilePointerEx(handle, end, NULL, FILE_BEGIN)) {
+               RTE_LOG_WIN32_ERR("SetFilePointerEx()");
+               rte_errno = EINVAL;
+               return -1;
+       }
+
+       /* Moving the file pointer does not resize the file by itself:
+        * the end-of-file marker has to be placed at the new position.
+        */
+       if (!SetEndOfFile(handle)) {
+               RTE_LOG_WIN32_ERR("SetEndOfFile()");
+               rte_errno = EINVAL;
+               return -1;
+       }
+
+       return 0;
+}
+
+/* Take a shared or exclusive lock on a file handle.
+ * With EAL_FLOCK_RETURN the call does not wait for the lock.
+ * Returns 0 on success, (-1) on failure with rte_errno set.
+ */
+static int
+lock_file(HANDLE handle, enum eal_flock_op op, enum eal_flock_mode mode)
+{
+       OVERLAPPED ov;
+       DWORD lock_flags = 0;
+
+       memset(&ov, 0, sizeof(ov));
+
+       if (mode == EAL_FLOCK_RETURN)
+               lock_flags |= LOCKFILE_FAIL_IMMEDIATELY;
+       if (op == EAL_FLOCK_EXCLUSIVE)
+               lock_flags |= LOCKFILE_EXCLUSIVE_LOCK;
+
+       if (LockFileEx(handle, lock_flags, 0, 0, 0, &ov))
+               return 0;
+
+       /* A non-waiting request that could not be satisfied is reported
+        * as EWOULDBLOCK, everything else is a generic failure.
+        */
+       if ((mode == EAL_FLOCK_RETURN) &&
+               (GetLastError() == ERROR_IO_PENDING)) {
+               rte_errno = EWOULDBLOCK;
+               return -1;
+       }
+
+       RTE_LOG_WIN32_ERR("LockFileEx()");
+       rte_errno = EINVAL;
+       return -1;
+}
+
+/* Release the lock taken by lock_file().
+ * Returns 0 on success, (-1) on failure with rte_errno set.
+ */
+static int
+unlock_file(HANDLE handle)
+{
+       OVERLAPPED overlapped;
+
+       /* UnlockFileEx() requires a valid OVERLAPPED structure: it carries
+        * the starting offset of the range to unlock. A zeroed structure
+        * matches the zero offset used when the lock was taken.
+        */
+       memset(&overlapped, 0, sizeof(overlapped));
+       if (!UnlockFileEx(handle, 0, 0, 0, &overlapped)) {
+               RTE_LOG_WIN32_ERR("UnlockFileEx()");
+               rte_errno = EINVAL;
+               return -1;
+       }
+       return 0;
+}
+
+/* Lock or unlock a file given by its CRT descriptor.
+ * Returns 0 on success, (-1) on failure with rte_errno set.
+ */
+int
+eal_file_lock(int fd, enum eal_flock_op op, enum eal_flock_mode mode)
+{
+       HANDLE handle;
+
+       handle = (HANDLE)_get_osfhandle(fd);
+       if (handle == INVALID_HANDLE_VALUE) {
+               rte_errno = EBADF;
+               return -1;
+       }
+
+       if (op == EAL_FLOCK_UNLOCK)
+               return unlock_file(handle);
+
+       if ((op == EAL_FLOCK_EXCLUSIVE) || (op == EAL_FLOCK_SHARED))
+               return lock_file(handle, op, mode);
+
+       /* Unknown operation. */
+       rte_errno = EINVAL;
+       return -1;
+}
diff --git a/lib/librte_eal/windows/eal_memalloc.c b/lib/librte_eal/windows/eal_memalloc.c
new file mode 100644 (file)
index 0000000..a7452b6
--- /dev/null
@@ -0,0 +1,441 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2020 Dmitry Kozlyuk
+ */
+
+#include <rte_errno.h>
+#include <rte_os.h>
+
+#include "eal_internal_cfg.h"
+#include "eal_memalloc.h"
+#include "eal_memcfg.h"
+#include "eal_private.h"
+#include "eal_windows.h"
+
+/* Stub: always logs "not implemented" and fails with (-1). */
+int
+eal_memalloc_get_seg_fd(int list_idx, int seg_idx)
+{
+       /* There are no files behind hugepages in Windows. */
+       EAL_LOG_NOT_IMPLEMENTED();
+       RTE_SET_USED(seg_idx);
+       RTE_SET_USED(list_idx);
+       return -1;
+}
+
+/* Stub: always logs "not implemented" and fails with (-1). */
+int
+eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset)
+{
+       /* There are no files behind hugepages in Windows. */
+       EAL_LOG_NOT_IMPLEMENTED();
+       RTE_SET_USED(offset);
+       RTE_SET_USED(seg_idx);
+       RTE_SET_USED(list_idx);
+       return -1;
+}
+
+/**
+ * Back one memory segment with a hugepage.
+ *
+ * @param ms
+ *  Segment descriptor to fill. A non-zero ms->len marks the segment
+ *  as already allocated.
+ * @param requested_addr
+ *  Reserved address to commit memory at, or NULL to request a new chunk
+ *  of memory from the OS.
+ * @param socket_id
+ *  Socket the memory must belong to.
+ * @param hi
+ *  Hugepage information supplying the page size.
+ * @return
+ *  0 on success, (-1) on failure.
+ */
+static int
+alloc_seg(struct rte_memseg *ms, void *requested_addr, int socket_id,
+       struct hugepage_info *hi)
+{
+       HANDLE current_process;
+       unsigned int numa_node;
+       size_t alloc_sz;
+       void *addr;
+       rte_iova_t iova = RTE_BAD_IOVA;
+       PSAPI_WORKING_SET_EX_INFORMATION info;
+       PSAPI_WORKING_SET_EX_BLOCK *page;
+
+       if (ms->len > 0) {
+               /* If a segment is already allocated as needed, return it. */
+               if ((ms->addr == requested_addr) &&
+                       (ms->socket_id == socket_id) &&
+                       (ms->hugepage_sz == hi->hugepage_sz)) {
+                       return 0;
+               }
+
+               /* Bugcheck, should not happen. */
+               RTE_LOG(DEBUG, EAL, "Attempted to reallocate segment %p "
+                       "(size %zu) on socket %d\n", ms->addr,
+                       ms->len, ms->socket_id);
+               return -1;
+       }
+
+       current_process = GetCurrentProcess();
+       numa_node = eal_socket_numa_node(socket_id);
+       alloc_sz = hi->hugepage_sz;
+
+       if (requested_addr == NULL) {
+               /* Request a new chunk of memory from OS. */
+               addr = eal_mem_alloc_socket(alloc_sz, socket_id);
+               if (addr == NULL) {
+                       RTE_LOG(DEBUG, EAL, "Cannot allocate %zu bytes "
+                               "on socket %d\n", alloc_sz, socket_id);
+                       return -1;
+               }
+       } else {
+               /* Requested address is already reserved, commit memory. */
+               addr = eal_mem_commit(requested_addr, alloc_sz, socket_id);
+
+               /* During commitment, memory is temporarily freed and might
+                * be allocated by a different non-EAL thread. This is a fatal
+                * error, because it breaks MSL assumptions.
+                */
+               if ((addr != NULL) && (addr != requested_addr)) {
+                       RTE_LOG(CRIT, EAL, "Address %p occupied by an alien "
+                               " allocation - MSL is not VA-contiguous!\n",
+                               requested_addr);
+                       return -1;
+               }
+
+               if (addr == NULL) {
+                       RTE_LOG(DEBUG, EAL, "Cannot commit reserved memory %p "
+                               "(size %zu) on socket %d\n",
+                               requested_addr, alloc_sz, socket_id);
+                       return -1;
+               }
+       }
+
+       /* Force OS to allocate a physical page and select a NUMA node.
+        * Hugepages are not pageable in Windows, so there's no race
+        * for physical address.
+        */
+       *(volatile int *)addr = *(volatile int *)addr;
+
+       /* Only try to obtain IOVA if it's available, so that applications
+        * that do not need IOVA can use this allocator.
+        */
+       if (rte_eal_using_phys_addrs()) {
+               iova = rte_mem_virt2iova(addr);
+               if (iova == RTE_BAD_IOVA) {
+                       RTE_LOG(DEBUG, EAL,
+                               "Cannot get IOVA of allocated segment\n");
+                       goto error;
+               }
+       }
+
+       /* Only "Ex" function can handle hugepages. */
+       info.VirtualAddress = addr;
+       if (!QueryWorkingSetEx(current_process, &info, sizeof(info))) {
+               RTE_LOG_WIN32_ERR("QueryWorkingSetEx(%p)", addr);
+               goto error;
+       }
+
+       /* Verify that the OS really provided a hugepage on the right node. */
+       page = &info.VirtualAttributes;
+       if (!page->Valid || !page->LargePage) {
+               RTE_LOG(DEBUG, EAL, "Got regular page instead of a hugepage\n");
+               goto error;
+       }
+       if (page->Node != numa_node) {
+               RTE_LOG(DEBUG, EAL,
+                       "NUMA node hint %u (socket %d) not respected, got %u\n",
+                       numa_node, socket_id, page->Node);
+               goto error;
+       }
+
+       ms->addr = addr;
+       ms->hugepage_sz = hi->hugepage_sz;
+       ms->len = alloc_sz;
+       ms->nchannel = rte_memory_get_nchannel();
+       ms->nrank = rte_memory_get_nrank();
+       ms->iova = iova;
+       ms->socket_id = socket_id;
+
+       return 0;
+
+error:
+       /* Only jump here when `addr` and `alloc_sz` are valid. */
+       if (eal_mem_decommit(addr, alloc_sz) && (rte_errno == EADDRNOTAVAIL)) {
+               /* During decommitment, memory is temporarily returned
+                * to the system and the address may become unavailable.
+                */
+               RTE_LOG(CRIT, EAL, "Address %p occupied by an alien "
+                       " allocation - MSL is not VA-contiguous!\n", addr);
+       }
+       return -1;
+}
+
+/* Decommit the memory of a segment and reset its descriptor.
+ * Returns 0 on success, (-1) if the memory could not be decommitted.
+ */
+static int
+free_seg(struct rte_memseg *ms)
+{
+       if (eal_mem_decommit(ms->addr, ms->len) == 0) {
+               /* alloc_seg() inspects the descriptor, so wipe it. */
+               memset(ms, 0, sizeof(*ms));
+               return 0;
+       }
+
+       /* See alloc_seg() for explanation. */
+       if (rte_errno == EADDRNOTAVAIL) {
+               RTE_LOG(CRIT, EAL, "Address %p occupied by an alien "
+                       " allocation - MSL is not VA-contiguous!\n",
+                       ms->addr);
+       }
+       return -1;
+}
+
+/* Arguments and results of alloc_seg_walk(). */
+struct alloc_walk_param {
+       /* Hugepage information handed over to alloc_seg(). */
+       struct hugepage_info *hi;
+       /* Optional array receiving pointers to allocated segments,
+        * may be NULL.
+        */
+       struct rte_memseg **ms;
+       /* Only memseg lists with this page size are considered. */
+       size_t page_sz;
+       /* Output: number of segments allocated by the walk. */
+       unsigned int segs_allocated;
+       /* Number of segments requested. */
+       unsigned int n_segs;
+       /* Only memseg lists on this socket are considered. */
+       int socket;
+       /* If set, either all n_segs segments are allocated or none;
+        * otherwise allocation is best-effort.
+        */
+       bool exact;
+};
+
+/**
+ * Memseg list walk callback: try to allocate segments in one list.
+ *
+ * @param msl
+ *  Memseg list being visited.
+ * @param arg
+ *  Pointer to struct alloc_walk_param holding the request and the result.
+ * @return
+ *  0 to continue with the next list (list does not match, has no suitable
+ *  free space, or nothing was allocated), 1 if at least one segment was
+ *  allocated, (-1) if an exact request could not be satisfied.
+ */
+static int
+alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
+{
+       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+       struct alloc_walk_param *wa = arg;
+       struct rte_memseg_list *cur_msl;
+       size_t page_sz;
+       int cur_idx, start_idx, j;
+       unsigned int msl_idx, need, i;
+
+       if (msl->page_sz != wa->page_sz)
+               return 0;
+       if (msl->socket_id != wa->socket)
+               return 0;
+
+       page_sz = (size_t)msl->page_sz;
+
+       /* msl is const, obtain a writable pointer to the same list */
+       msl_idx = msl - mcfg->memsegs;
+       cur_msl = &mcfg->memsegs[msl_idx];
+
+       need = wa->n_segs;
+
+       /* try finding space in memseg list */
+       if (wa->exact) {
+               /* if we require exact number of pages in a list, find them */
+               cur_idx = rte_fbarray_find_next_n_free(
+                       &cur_msl->memseg_arr, 0, need);
+               if (cur_idx < 0)
+                       return 0;
+               start_idx = cur_idx;
+       } else {
+               int cur_len;
+
+               /* we don't require exact number of pages, so we're going to go
+                * for best-effort allocation. that means finding the biggest
+                * unused block, and going with that.
+                */
+               cur_idx = rte_fbarray_find_biggest_free(
+                       &cur_msl->memseg_arr, 0);
+               if (cur_idx < 0)
+                       return 0;
+               start_idx = cur_idx;
+               /* adjust the size to possibly be smaller than original
+                * request, but do not allow it to be bigger.
+                */
+               cur_len = rte_fbarray_find_contig_free(
+                       &cur_msl->memseg_arr, cur_idx);
+               need = RTE_MIN(need, (unsigned int)cur_len);
+       }
+
+       for (i = 0; i < need; i++, cur_idx++) {
+               struct rte_memseg *cur;
+               void *map_addr;
+
+               cur = rte_fbarray_get(&cur_msl->memseg_arr, cur_idx);
+               map_addr = RTE_PTR_ADD(cur_msl->base_va, cur_idx * page_sz);
+
+               if (alloc_seg(cur, map_addr, wa->socket, wa->hi)) {
+                       /* both counters are unsigned, print them as such */
+                       RTE_LOG(DEBUG, EAL, "attempted to allocate %u segments, "
+                               "but only %u were allocated\n", need, i);
+
+                       /* if exact number wasn't requested, stop */
+                       if (!wa->exact)
+                               goto out;
+
+                       /* clean up */
+                       for (j = start_idx; j < cur_idx; j++) {
+                               struct rte_memseg *tmp;
+                               struct rte_fbarray *arr = &cur_msl->memseg_arr;
+
+                               tmp = rte_fbarray_get(arr, j);
+                               rte_fbarray_set_free(arr, j);
+
+                               if (free_seg(tmp))
+                                       RTE_LOG(DEBUG, EAL, "Cannot free page\n");
+                       }
+                       /* clear the list */
+                       if (wa->ms)
+                               memset(wa->ms, 0, sizeof(*wa->ms) * wa->n_segs);
+
+                       return -1;
+               }
+               if (wa->ms)
+                       wa->ms[i] = cur;
+
+               rte_fbarray_set_used(&cur_msl->memseg_arr, cur_idx);
+       }
+
+out:
+       wa->segs_allocated = i;
+       if (i > 0)
+               cur_msl->version++;
+
+       /* if we didn't allocate any segments, move on to the next list */
+       return i > 0;
+}
+
+/* Arguments of free_seg_walk(). */
+struct free_walk_param {
+       /* Hugepage information matching the segment's page size,
+        * filled in by the caller.
+        */
+       struct hugepage_info *hi;
+       /* Segment to free. */
+       struct rte_memseg *ms;
+};
+/* Memseg list walk callback: free the requested segment if it lies
+ * within the visited list.
+ * Returns 0 if the segment is outside this list, 1 if it was freed,
+ * (-1) if freeing failed.
+ */
+static int
+free_seg_walk(const struct rte_memseg_list *msl, void *arg)
+{
+       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+       struct free_walk_param *param = arg;
+       struct rte_memseg_list *owner;
+       uintptr_t list_begin, list_end, seg_addr;
+       int list_pos, seg_pos;
+
+       list_begin = (uintptr_t) msl->base_va;
+       list_end = list_begin + msl->len;
+       seg_addr = (uintptr_t)param->ms->addr;
+
+       /* Segment does not belong to this list, keep walking. */
+       if (!(seg_addr >= list_begin && seg_addr < list_end))
+               return 0;
+
+       list_pos = msl - mcfg->memsegs;
+       seg_pos = RTE_PTR_DIFF(param->ms->addr, list_begin) / msl->page_sz;
+
+       /* The callback receives a const list, look up the writable one. */
+       owner = &mcfg->memsegs[list_pos];
+       owner->version++;
+
+       rte_fbarray_set_free(&owner->memseg_arr, seg_pos);
+
+       if (free_seg(param->ms) < 0)
+               return -1;
+       return 1;
+}
+
+/* Allocate segments of the given page size on the given socket.
+ * Pointers to allocated segments are stored in ms if it is not NULL.
+ * Returns the number of allocated segments on success, a negative value
+ * on failure (-ENOTSUP in legacy memory mode).
+ */
+int
+eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs,
+               size_t page_sz, int socket, bool exact)
+{
+       struct hugepage_info *match = NULL;
+       struct alloc_walk_param param;
+       unsigned int idx;
+       int walk_ret;
+
+       if (internal_config.legacy_mem) {
+               RTE_LOG(ERR, EAL, "dynamic allocation not supported in legacy mode\n");
+               return -ENOTSUP;
+       }
+
+       /* Look up the hugepage_info entry for the requested page size. */
+       for (idx = 0; idx < internal_config.num_hugepage_sizes; idx++) {
+               if (internal_config.hugepage_info[idx].hugepage_sz != page_sz)
+                       continue;
+               match = &internal_config.hugepage_info[idx];
+               break;
+       }
+       if (match == NULL) {
+               RTE_LOG(ERR, EAL, "cannot find relevant hugepage_info entry\n");
+               return -1;
+       }
+
+       memset(&param, 0, sizeof(param));
+       param.hi = match;
+       param.ms = ms;
+       param.page_sz = page_sz;
+       param.segs_allocated = 0;
+       param.n_segs = n_segs;
+       param.socket = socket;
+       param.exact = exact;
+
+       /* memalloc is locked, so it's safe to use thread-unsafe version */
+       walk_ret = rte_memseg_list_walk_thread_unsafe(alloc_seg_walk, &param);
+       if (walk_ret > 0)
+               return (int)param.segs_allocated;
+       if (walk_ret < 0)
+               return walk_ret;
+
+       /* No list accepted the request. */
+       RTE_LOG(ERR, EAL, "cannot find suitable memseg_list\n");
+       return -1;
+}
+
+/* Allocate a single segment; returns NULL if nothing was allocated. */
+struct rte_memseg *
+eal_memalloc_alloc_seg(size_t page_sz, int socket)
+{
+       struct rte_memseg *seg = NULL;
+
+       /* On failure the pointer is left (or reset to) NULL. */
+       (void)eal_memalloc_alloc_seg_bulk(&seg, 1, page_sz, socket, true);
+       return seg;
+}
+
+/* Free the given segments one by one.
+ * Every segment is attempted even if an earlier one fails.
+ * Returns 0 if all segments were freed, (-1) otherwise.
+ */
+int
+eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs)
+{
+       int status = 0;
+       int n;
+
+       /* dynamic free not supported in legacy mode */
+       if (internal_config.legacy_mem)
+               return -1;
+
+       for (n = 0; n < n_segs; n++) {
+               struct rte_memseg *seg = ms[n];
+               struct hugepage_info *match = NULL;
+               struct free_walk_param param;
+               size_t k;
+               int walk_ret;
+
+               /* pages flagged as unfreeable are reported and skipped */
+               if (seg->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) {
+                       RTE_LOG(DEBUG, EAL, "Page is not allowed to be freed\n");
+                       status = -1;
+                       continue;
+               }
+
+               /* find the hugepage_info entry with the segment's page size */
+               for (k = 0; k < RTE_DIM(internal_config.hugepage_info); k++) {
+                       if (internal_config.hugepage_info[k].hugepage_sz ==
+                                       seg->hugepage_sz) {
+                               match = &internal_config.hugepage_info[k];
+                               break;
+                       }
+               }
+               if (match == NULL) {
+                       RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
+                       status = -1;
+                       continue;
+               }
+
+               memset(&param, 0, sizeof(param));
+               param.hi = match;
+               param.ms = seg;
+
+               /* memalloc is locked, so it's safe to use thread-unsafe version
+                */
+               walk_ret = rte_memseg_list_walk_thread_unsafe(free_seg_walk,
+                               &param);
+               if (walk_ret == 1)
+                       continue;
+               if (walk_ret == 0)
+                       RTE_LOG(ERR, EAL, "Couldn't find memseg list\n");
+               status = -1;
+       }
+       return status;
+}
+
+/* Free a single segment by delegating to eal_memalloc_free_seg_bulk(). */
+int
+eal_memalloc_free_seg(struct rte_memseg *ms)
+{
+       return eal_memalloc_free_seg_bulk(&ms, 1);
+}
+
+/* Stub: always logs "not implemented" and returns (-1). */
+int
+eal_memalloc_sync_with_primary(void)
+{
+       /* No multi-process support. */
+       EAL_LOG_NOT_IMPLEMENTED();
+       return -1;
+}
+
+/* Allocator initialization hook; does nothing and always returns 0. */
+int
+eal_memalloc_init(void)
+{
+       /* No action required. */
+       return 0;
+}
diff --git a/lib/librte_eal/windows/eal_memory.c b/lib/librte_eal/windows/eal_memory.c
new file mode 100644 (file)
index 0000000..73be1cf
--- /dev/null
@@ -0,0 +1,710 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2020 Dmitry Kozlyuk
+ */
+
+#include <inttypes.h>
+#include <io.h>
+
+#include <rte_eal_paging.h>
+#include <rte_errno.h>
+
+#include "eal_internal_cfg.h"
+#include "eal_memalloc.h"
+#include "eal_memcfg.h"
+#include "eal_options.h"
+#include "eal_private.h"
+#include "eal_windows.h"
+
+#include <rte_virt2phys.h>
+
+/* MinGW-w64 headers lack VirtualAlloc2() in some distributions.
+ * Provide a copy of definitions and code to load it dynamically.
+ * Note: definitions are copied verbatim from Microsoft documentation
+ * and don't follow DPDK code style.
+ *
+ * MEM_PRESERVE_PLACEHOLDER being defined means VirtualAlloc2() is present too.
+ */
+#ifndef MEM_PRESERVE_PLACEHOLDER
+
+/* https://docs.microsoft.com/en-us/windows/win32/api/winnt/ne-winnt-mem_extended_parameter_type */
+typedef enum MEM_EXTENDED_PARAMETER_TYPE {
+       MemExtendedParameterInvalidType,
+       MemExtendedParameterAddressRequirements,
+       MemExtendedParameterNumaNode,
+       MemExtendedParameterPartitionHandle,
+       MemExtendedParameterUserPhysicalHandle,
+       MemExtendedParameterAttributeFlags,
+       MemExtendedParameterMax
+} *PMEM_EXTENDED_PARAMETER_TYPE;
+
+#define MEM_EXTENDED_PARAMETER_TYPE_BITS 4
+
+/* https://docs.microsoft.com/en-us/windows/win32/api/winnt/ns-winnt-mem_extended_parameter */
+typedef struct MEM_EXTENDED_PARAMETER {
+       struct {
+               DWORD64 Type : MEM_EXTENDED_PARAMETER_TYPE_BITS;
+               DWORD64 Reserved : 64 - MEM_EXTENDED_PARAMETER_TYPE_BITS;
+       } DUMMYSTRUCTNAME;
+       union {
+               DWORD64 ULong64;
+               PVOID   Pointer;
+               SIZE_T  Size;
+               HANDLE  Handle;
+               DWORD   ULong;
+       } DUMMYUNIONNAME;
+} MEM_EXTENDED_PARAMETER, *PMEM_EXTENDED_PARAMETER;
+
+/* https://docs.microsoft.com/en-us/windows/win32/api/memoryapi/nf-memoryapi-virtualalloc2 */
+typedef PVOID (*VirtualAlloc2_type)(
+       HANDLE                 Process,
+       PVOID                  BaseAddress,
+       SIZE_T                 Size,
+       ULONG                  AllocationType,
+       ULONG                  PageProtection,
+       MEM_EXTENDED_PARAMETER *ExtendedParameters,
+       ULONG                  ParameterCount
+);
+
+/* VirtualAlloc2() flags. */
+#define MEM_COALESCE_PLACEHOLDERS 0x00000001
+#define MEM_PRESERVE_PLACEHOLDER  0x00000002
+#define MEM_REPLACE_PLACEHOLDER   0x00004000
+#define MEM_RESERVE_PLACEHOLDER   0x00040000
+
+/* Named exactly as the function, so that user code does not depend
+ * on it being found at compile time or dynamically.
+ */
+static VirtualAlloc2_type VirtualAlloc2;
+
+/* Resolve VirtualAlloc2() at run time and store it in the file-scope
+ * pointer of the same name.
+ *
+ * Returns 0 if the pointer is already set or the lookup succeeds,
+ * (-1) if the library cannot be loaded or the symbol is not found.
+ */
+int
+eal_mem_win32api_init(void)
+{
+       /* Contrary to the docs, VirtualAlloc2() is not in kernel32.dll,
+        * see https://github.com/MicrosoftDocs/feedback/issues/1129.
+        */
+       static const char library_name[] = "kernelbase.dll";
+       static const char function[] = "VirtualAlloc2";
+
+       HMODULE library = NULL;
+       int ret = 0;
+
+       /* Already done. */
+       if (VirtualAlloc2 != NULL)
+               return 0;
+
+       library = LoadLibraryA(library_name);
+       if (library == NULL) {
+               RTE_LOG_WIN32_ERR("LoadLibraryA(\"%s\")", library_name);
+               return -1;
+       }
+
+       VirtualAlloc2 = (VirtualAlloc2_type)(
+               (void *)GetProcAddress(library, function));
+       if (VirtualAlloc2 == NULL) {
+               /* No trailing newline here, to match every other
+                * RTE_LOG_WIN32_ERR() call in this file.
+                */
+               RTE_LOG_WIN32_ERR("GetProcAddress(\"%s\", \"%s\")",
+                       library_name, function);
+
+               /* Contrary to the docs, Server 2016 is not supported. */
+               RTE_LOG(ERR, EAL, "Windows 10 or Windows Server 2019 "
+                       "is required for memory management\n");
+               ret = -1;
+       }
+
+       /* Drop the reference taken by LoadLibraryA() above.
+        * NOTE(review): assumes kernelbase.dll remains loaded in the process
+        * so that the resolved pointer stays valid - confirm.
+        */
+       FreeLibrary(library);
+
+       return ret;
+}
+
+#else
+
+/* VirtualAlloc2() is provided by the compiler in this configuration,
+ * so there is nothing to look up: the stub always reports success.
+ */
+int
+eal_mem_win32api_init(void)
+{
+       const int success = 0;
+
+       return success;
+}
+
+#endif /* defined(MEM_PRESERVE_PLACEHOLDER) */
+
+static HANDLE virt2phys_device = INVALID_HANDLE_VALUE;
+
+/* Find the virt2phys device interface and open a handle to it.
+ *
+ * On success the handle is kept in virt2phys_device and 0 is returned.
+ * On any failure (-1) is returned. The device list and the detail buffer
+ * are released on every path.
+ */
+int
+eal_mem_virt2iova_init(void)
+{
+       HDEVINFO devs = INVALID_HANDLE_VALUE;
+       SP_DEVICE_INTERFACE_DATA iface;
+       SP_DEVICE_INTERFACE_DETAIL_DATA *info = NULL;
+       DWORD info_size;
+       int status = -1;
+
+       devs = SetupDiGetClassDevs(
+               &GUID_DEVINTERFACE_VIRT2PHYS, NULL, NULL,
+               DIGCF_DEVICEINTERFACE | DIGCF_PRESENT);
+       if (devs == INVALID_HANDLE_VALUE) {
+               RTE_LOG_WIN32_ERR("SetupDiGetClassDevs()");
+               goto cleanup;
+       }
+
+       /* Only the interface at index 0 is looked at. */
+       iface.cbSize = sizeof(iface);
+       if (!SetupDiEnumDeviceInterfaces(
+               devs, NULL, &GUID_DEVINTERFACE_VIRT2PHYS, 0, &iface)) {
+               RTE_LOG_WIN32_ERR("SetupDiEnumDeviceInterfaces()");
+               goto cleanup;
+       }
+
+       /* Size probe without a buffer: "insufficient buffer" is the one
+        * failure that is tolerated, the size then comes back in info_size.
+        */
+       if (!SetupDiGetDeviceInterfaceDetail(
+               devs, &iface, NULL, 0, &info_size, NULL) &&
+               (GetLastError() != ERROR_INSUFFICIENT_BUFFER)) {
+               RTE_LOG_WIN32_ERR(
+                       "SetupDiGetDeviceInterfaceDetail(probe)");
+               goto cleanup;
+       }
+
+       info = malloc(info_size);
+       if (info == NULL) {
+               RTE_LOG(ERR, EAL, "Cannot allocate virt2phys "
+                       "device interface detail data\n");
+               goto cleanup;
+       }
+
+       /* Second call fills the buffer with the device path. */
+       info->cbSize = sizeof(*info);
+       if (!SetupDiGetDeviceInterfaceDetail(
+               devs, &iface, info, info_size, NULL, NULL)) {
+               RTE_LOG_WIN32_ERR("SetupDiGetDeviceInterfaceDetail(read)");
+               goto cleanup;
+       }
+
+       RTE_LOG(DEBUG, EAL, "Found virt2phys device: %s\n", info->DevicePath);
+
+       virt2phys_device = CreateFile(
+               info->DevicePath, 0, 0, NULL, OPEN_EXISTING, 0, NULL);
+       if (virt2phys_device == INVALID_HANDLE_VALUE) {
+               RTE_LOG_WIN32_ERR("CreateFile()");
+               goto cleanup;
+       }
+
+       status = 0;
+
+cleanup:
+       /* free(NULL) is a no-op, so no guard is needed. */
+       free(info);
+       if (devs != INVALID_HANDLE_VALUE)
+               SetupDiDestroyDeviceInfoList(devs);
+
+       return status;
+}
+
+/* Ask the virt2phys driver to translate a virtual address.
+ *
+ * Returns RTE_BAD_PHYS_ADDR if the device has not been opened
+ * or the control request fails.
+ */
+phys_addr_t
+rte_mem_virt2phy(const void *virt)
+{
+       LARGE_INTEGER result;
+       DWORD out_len;
+       BOOL ok;
+
+       if (virt2phys_device == INVALID_HANDLE_VALUE)
+               return RTE_BAD_PHYS_ADDR;
+
+       /* The request input is the pointer value itself. */
+       ok = DeviceIoControl(
+                       virt2phys_device, IOCTL_VIRT2PHYS_TRANSLATE,
+                       &virt, sizeof(virt), &result, sizeof(result),
+                       &out_len, NULL);
+       if (!ok) {
+               RTE_LOG_WIN32_ERR("DeviceIoControl(IOCTL_VIRT2PHYS_TRANSLATE)");
+               return RTE_BAD_PHYS_ADDR;
+       }
+
+       return result.QuadPart;
+}
+
+/* Windows currently only supports IOVA as PA, so the IOVA is simply
+ * the physical address obtained from rte_mem_virt2phy().
+ */
+rte_iova_t
+rte_mem_virt2iova(const void *virt)
+{
+       phys_addr_t pa;
+
+       if (virt2phys_device == INVALID_HANDLE_VALUE)
+               return RTE_BAD_IOVA;
+
+       pa = rte_mem_virt2phy(virt);
+       return (pa == RTE_BAD_PHYS_ADDR) ? RTE_BAD_IOVA : (rte_iova_t)pa;
+}
+
+/* Physical addresses are used under Windows whenever they can be obtained,
+ * i.e. whenever the virt2phys device handle is open.
+ */
+int
+rte_eal_using_phys_addrs(void)
+{
+       if (virt2phys_device == INVALID_HANDLE_VALUE)
+               return 0;
+
+       return 1;
+}
+
+/* Translate a Win32 allocation error code into an rte_errno value,
+ * approximating what POSIX mmap(3) would report.
+ */
+static void
+set_errno_from_win32_alloc_error(DWORD code)
+{
+       if (code == ERROR_SUCCESS) {
+               rte_errno = 0;
+               return;
+       }
+
+       /* ERROR_INVALID_ADDRESS: a valid requested address is not available.
+        * ERROR_COMMITMENT_LIMIT: may occur when committing regular memory.
+        * ERROR_NO_SYSTEM_RESOURCES: the system runs out of hugepages.
+        */
+       if ((code == ERROR_INVALID_ADDRESS) ||
+                       (code == ERROR_COMMITMENT_LIMIT) ||
+                       (code == ERROR_NO_SYSTEM_RESOURCES)) {
+               rte_errno = ENOMEM;
+               return;
+       }
+
+       /* ERROR_INVALID_PARAMETER and every other code. */
+       rte_errno = EINVAL;
+}
+
+/* Reserve address space as a placeholder without committing memory.
+ *
+ * Hugepage reservation is rejected with ENOTSUP. When
+ * EAL_RESERVE_FORCE_ADDRESS is set and the region lands elsewhere,
+ * it is released again and the call fails with ENOMEM.
+ */
+void *
+eal_mem_reserve(void *requested_addr, size_t size, int flags)
+{
+       HANDLE self;
+       void *region;
+
+       /* Windows requires hugepages to be committed. */
+       if ((flags & EAL_RESERVE_HUGEPAGES) != 0) {
+               rte_errno = ENOTSUP;
+               return NULL;
+       }
+
+       self = GetCurrentProcess();
+
+       region = VirtualAlloc2(self, requested_addr, size,
+               MEM_RESERVE | MEM_RESERVE_PLACEHOLDER, PAGE_NOACCESS,
+               NULL, 0);
+       if (region == NULL) {
+               /* Save the code before logging. */
+               DWORD code = GetLastError();
+
+               RTE_LOG_WIN32_ERR("VirtualAlloc2()");
+               set_errno_from_win32_alloc_error(code);
+               return NULL;
+       }
+
+       /* Done unless a forced address was requested and not honoured. */
+       if (((flags & EAL_RESERVE_FORCE_ADDRESS) == 0) ||
+                       (region == requested_addr))
+               return region;
+
+       if (!VirtualFreeEx(self, region, 0, MEM_RELEASE))
+               RTE_LOG_WIN32_ERR("VirtualFreeEx()");
+       rte_errno = ENOMEM;
+       return NULL;
+}
+
+/* Reserve and commit large-page memory on the NUMA node that corresponds
+ * to socket_id. Returns NULL and sets rte_errno to ENOMEM on failure.
+ */
+void *
+eal_mem_alloc_socket(size_t size, int socket_id)
+{
+       const DWORD alloc_type = MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES;
+       void *mem;
+
+       mem = VirtualAllocExNuma(GetCurrentProcess(), NULL, size, alloc_type,
+               PAGE_READWRITE, eal_socket_numa_node(socket_id));
+       if (mem != NULL)
+               return mem;
+
+       rte_errno = ENOMEM;
+       return NULL;
+}
+
+/* Commit large-page memory, either at requested_addr inside a reserved
+ * region or, if requested_addr is NULL, wherever the system chooses.
+ *
+ * Returns the committed address on success.
+ * On failure returns NULL and sets rte_errno on every path.
+ */
+void *
+eal_mem_commit(void *requested_addr, size_t size, int socket_id)
+{
+       HANDLE process;
+       MEM_EXTENDED_PARAMETER param;
+       DWORD param_count = 0;
+       DWORD flags;
+       DWORD err;
+       void *addr;
+
+       process = GetCurrentProcess();
+
+       if (requested_addr != NULL) {
+               MEMORY_BASIC_INFORMATION info;
+
+               if (VirtualQueryEx(process, requested_addr, &info,
+                               sizeof(info)) != sizeof(info)) {
+                       /* Logging may overwrite GetLastError() result. */
+                       err = GetLastError();
+                       RTE_LOG_WIN32_ERR("VirtualQueryEx(%p)",
+                               requested_addr);
+                       set_errno_from_win32_alloc_error(err);
+                       return NULL;
+               }
+
+               /* Split reserved region if only a part is committed. */
+               flags = MEM_RELEASE | MEM_PRESERVE_PLACEHOLDER;
+               if ((info.RegionSize > size) && !VirtualFreeEx(
+                               process, requested_addr, size, flags)) {
+                       err = GetLastError();
+                       RTE_LOG_WIN32_ERR(
+                               "VirtualFreeEx(%p, %zu, preserve placeholder)",
+                               requested_addr, size);
+                       set_errno_from_win32_alloc_error(err);
+                       return NULL;
+               }
+
+               /* Temporarily release the region to be committed.
+                *
+                * There is an inherent race for this memory range
+                * if another thread allocates memory via OS API.
+                * However, VirtualAlloc2(MEM_REPLACE_PLACEHOLDER)
+                * doesn't work with MEM_LARGE_PAGES on Windows Server.
+                */
+               if (!VirtualFreeEx(process, requested_addr, 0, MEM_RELEASE)) {
+                       err = GetLastError();
+                       RTE_LOG_WIN32_ERR("VirtualFreeEx(%p, 0, release)",
+                               requested_addr);
+                       set_errno_from_win32_alloc_error(err);
+                       return NULL;
+               }
+       }
+
+       /* The extended parameter is only filled in and counted
+        * when a specific socket is requested.
+        */
+       if (socket_id != SOCKET_ID_ANY) {
+               param_count = 1;
+               memset(&param, 0, sizeof(param));
+               param.Type = MemExtendedParameterNumaNode;
+               param.ULong = eal_socket_numa_node(socket_id);
+       }
+
+       flags = MEM_RESERVE | MEM_COMMIT | MEM_LARGE_PAGES;
+       addr = VirtualAlloc2(process, requested_addr, size,
+               flags, PAGE_READWRITE, &param, param_count);
+       if (addr == NULL) {
+               /* Logging may overwrite GetLastError() result. */
+               err = GetLastError();
+               RTE_LOG_WIN32_ERR("VirtualAlloc2(%p, %zu, commit large pages)",
+                       requested_addr, size);
+               set_errno_from_win32_alloc_error(err);
+               return NULL;
+       }
+
+       if ((requested_addr != NULL) && (addr != requested_addr)) {
+               /* We lost the race for the requested_addr. */
+               if (!VirtualFreeEx(process, addr, 0, MEM_RELEASE))
+                       RTE_LOG_WIN32_ERR("VirtualFreeEx(%p, release)", addr);
+
+               rte_errno = EADDRNOTAVAIL;
+               return NULL;
+       }
+
+       return addr;
+}
+
+/* Put a committed hugepage block back into reserved state.
+ *
+ * The block is released and then re-reserved as a placeholder
+ * of the same size at the same address.
+ *
+ * Returns 0 on success, (-1) on failure. If the placeholder cannot be
+ * created, rte_errno is set to EADDRNOTAVAIL.
+ */
+int
+eal_mem_decommit(void *addr, size_t size)
+{
+       HANDLE process;
+       void *stub;
+       DWORD flags;
+
+       process = GetCurrentProcess();
+
+       /* Hugepages cannot be decommitted on Windows,
+        * so free them and replace the block with a placeholder.
+        * There is a race for VA in this block until VirtualAlloc2 call.
+        */
+       if (!VirtualFreeEx(process, addr, 0, MEM_RELEASE)) {
+               RTE_LOG_WIN32_ERR("VirtualFreeEx(%p, 0, release)", addr);
+               return -1;
+       }
+
+       flags = MEM_RESERVE | MEM_RESERVE_PLACEHOLDER;
+       stub = VirtualAlloc2(
+               process, addr, size, flags, PAGE_NOACCESS, NULL, 0);
+       if (stub == NULL) {
+               /* We lost the race for the VA. Nothing was allocated,
+                * so there is nothing to release: report the failure
+                * of the allocation itself.
+                */
+               RTE_LOG_WIN32_ERR("VirtualAlloc2(%p, %zu, placeholder)",
+                       addr, size);
+               rte_errno = EADDRNOTAVAIL;
+               return -1;
+       }
+
+       /* No need to join reserved regions adjacent to the freed one:
+        * eal_mem_commit() will just pick up the page-size placeholder
+        * created here.
+        */
+       return 0;
+}
+
+/**
+ * Free a reserved memory region in full or in part.
+ *
+ * If *addr* and *size* describe the whole region reported by the system,
+ * it is released at once. Otherwise the requested part is first split off
+ * as its own placeholder and then released.
+ *
+ * @param addr
+ *  Starting address of the area to free.
+ * @param size
+ *  Number of bytes to free. Must be a multiple of page size.
+ * @param reserved
+ *  Fail if the region is not in reserved state.
+ * @return
+ *  * 0 on successful deallocation;
+ *  * 1 if region must be in reserved state but it is not;
+ *  * (-1) on system API failures.
+ */
+static int
+mem_free(void *addr, size_t size, bool reserved)
+{
+       MEMORY_BASIC_INFORMATION info;
+       HANDLE process;
+
+       process = GetCurrentProcess();
+
+       if (VirtualQueryEx(
+                       process, addr, &info, sizeof(info)) != sizeof(info)) {
+               RTE_LOG_WIN32_ERR("VirtualQueryEx(%p)", addr);
+               return -1;
+       }
+
+       if (reserved && (info.State != MEM_RESERVE))
+               return 1;
+
+       /* Free complete region. */
+       if ((addr == info.AllocationBase) && (size == info.RegionSize)) {
+               if (!VirtualFreeEx(process, addr, 0, MEM_RELEASE)) {
+                       RTE_LOG_WIN32_ERR("VirtualFreeEx(%p, 0, release)",
+                               addr);
+                       /* A failed release is an API failure
+                        * and must not be reported as success.
+                        */
+                       return -1;
+               }
+               return 0;
+       }
+
+       /* Split the part to be freed and the remaining reservation. */
+       if (!VirtualFreeEx(process, addr, size,
+                       MEM_RELEASE | MEM_PRESERVE_PLACEHOLDER)) {
+               RTE_LOG_WIN32_ERR(
+                       "VirtualFreeEx(%p, %zu, preserve placeholder)",
+                       addr, size);
+               return -1;
+       }
+
+       /* Actually free reservation part. */
+       if (!VirtualFreeEx(process, addr, 0, MEM_RELEASE)) {
+               RTE_LOG_WIN32_ERR("VirtualFreeEx(%p, 0, release)", addr);
+               return -1;
+       }
+
+       return 0;
+}
+
+/* Free a region whatever state it is in; the status returned
+ * by mem_free() is discarded.
+ */
+void
+eal_mem_free(void *virt, size_t size)
+{
+       const bool must_be_reserved = false;
+
+       (void)mem_free(virt, size, must_be_reserved);
+}
+
+/* Controlling dump inclusion is not supported: all arguments are ignored,
+ * rte_errno is set to ENOTSUP and (-1) is returned.
+ */
+int
+eal_mem_set_dump(void *virt, size_t size, bool dump)
+{
+       /* Windows does not dump reserved memory by default.
+        *
+        * There is <werapi.h> to include or exclude regions from the dump,
+        * but this is not currently required by EAL.
+        */
+       RTE_SET_USED(dump);
+       RTE_SET_USED(size);
+       RTE_SET_USED(virt);
+
+       rte_errno = ENOTSUP;
+       return -1;
+}
+
+/* Map a file, or anonymous memory if RTE_MAP_ANONYMOUS is set,
+ * into the address space.
+ *
+ * If requested_addr is not NULL, it must lie in a reserved region:
+ * that part is freed first and the view is then mapped at the address.
+ * With RTE_MAP_FORCE_ADDRESS, a view that ends up elsewhere is unmapped
+ * and the call fails.
+ *
+ * Returns the mapped address or NULL on failure. The file mapping handle
+ * is closed on every path once it has been created.
+ */
+void *
+rte_mem_map(void *requested_addr, size_t size, int prot, int flags,
+       int fd, size_t offset)
+{
+       HANDLE file_handle = INVALID_HANDLE_VALUE;
+       HANDLE mapping_handle = NULL;
+       DWORD sys_prot = 0;
+       DWORD sys_access = 0;
+       DWORD size_high = (DWORD)(size >> 32);
+       DWORD size_low = (DWORD)size;
+       DWORD offset_high = (DWORD)(offset >> 32);
+       DWORD offset_low = (DWORD)offset;
+       LPVOID virt = NULL;
+
+       /* When both read and write are requested,
+        * the write variant assigned last wins.
+        */
+       if (prot & RTE_PROT_EXECUTE) {
+               if (prot & RTE_PROT_READ) {
+                       sys_prot = PAGE_EXECUTE_READ;
+                       sys_access = FILE_MAP_READ | FILE_MAP_EXECUTE;
+               }
+               if (prot & RTE_PROT_WRITE) {
+                       sys_prot = PAGE_EXECUTE_READWRITE;
+                       sys_access = FILE_MAP_WRITE | FILE_MAP_EXECUTE;
+               }
+       } else {
+               if (prot & RTE_PROT_READ) {
+                       sys_prot = PAGE_READONLY;
+                       sys_access = FILE_MAP_READ;
+               }
+               if (prot & RTE_PROT_WRITE) {
+                       sys_prot = PAGE_READWRITE;
+                       sys_access = FILE_MAP_WRITE;
+               }
+       }
+
+       if (flags & RTE_MAP_PRIVATE)
+               sys_access |= FILE_MAP_COPY;
+
+       /* For anonymous mappings file_handle stays INVALID_HANDLE_VALUE. */
+       if ((flags & RTE_MAP_ANONYMOUS) == 0)
+               file_handle = (HANDLE)_get_osfhandle(fd);
+
+       /* CreateFileMapping() reports failure with NULL,
+        * not with INVALID_HANDLE_VALUE.
+        */
+       mapping_handle = CreateFileMapping(
+               file_handle, NULL, sys_prot, size_high, size_low, NULL);
+       if (mapping_handle == NULL) {
+               RTE_LOG_WIN32_ERR("CreateFileMapping()");
+               return NULL;
+       }
+
+       /* There is a race for the requested_addr between mem_free()
+        * and MapViewOfFileEx(). MapViewOfFile3() can replace a reserved
+        * region with a mapping in a single operation, but it does not
+        * support private mappings.
+        */
+       if (requested_addr != NULL) {
+               int ret = mem_free(requested_addr, size, true);
+               if (ret) {
+                       if (ret > 0) {
+                               RTE_LOG(ERR, EAL, "Cannot map memory "
+                                       "to a region not reserved\n");
+                               rte_errno = EADDRNOTAVAIL;
+                       }
+                       goto close_mapping;
+               }
+       }
+
+       virt = MapViewOfFileEx(mapping_handle, sys_access,
+               offset_high, offset_low, size, requested_addr);
+       if (!virt) {
+               RTE_LOG_WIN32_ERR("MapViewOfFileEx()");
+               goto close_mapping;
+       }
+
+       if ((flags & RTE_MAP_FORCE_ADDRESS) && (virt != requested_addr)) {
+               if (!UnmapViewOfFile(virt))
+                       RTE_LOG_WIN32_ERR("UnmapViewOfFile()");
+               virt = NULL;
+       }
+
+close_mapping:
+       /* Closed on success and on failure alike,
+        * so that error paths do not leak the handle.
+        */
+       if (!CloseHandle(mapping_handle))
+               RTE_LOG_WIN32_ERR("CloseHandle()");
+
+       return virt;
+}
+
+/* Unmap a view; size is ignored. On failure rte_errno is set
+ * to EINVAL and (-1) is returned.
+ */
+int
+rte_mem_unmap(void *virt, size_t size)
+{
+       RTE_SET_USED(size);
+
+       if (UnmapViewOfFile(virt))
+               return 0;
+
+       RTE_LOG_WIN32_ERR("UnmapViewOfFile()");
+       rte_errno = EINVAL;
+       return -1;
+}
+
+/* Base address hint for memory allocation; always 0 here. */
+uint64_t
+eal_get_baseaddr(void)
+{
+       /* Windows strategy for memory allocation is undocumented.
+        * Returning 0 here effectively disables address guessing
+        * unless user provides an address hint.
+        */
+       const uint64_t no_hint = 0;
+
+       return no_hint;
+}
+
+/* Return the system page size. The system is queried while the cached
+ * value is still zero; later calls reuse the cached value.
+ */
+size_t
+rte_mem_page_size(void)
+{
+       static SYSTEM_INFO cached;
+
+       if (cached.dwPageSize != 0)
+               return cached.dwPageSize;
+
+       GetSystemInfo(&cached);
+       return cached.dwPageSize;
+}
+
+/* Lock a memory range with VirtualLock().
+ * Returns 0 on success, (-1) on failure.
+ */
+int
+rte_mem_lock(const void *virt, size_t size)
+{
+       /* VirtualLock() takes `void*`: drop the const qualifier through
+        * an integer round-trip to work around a compiler warning.
+        */
+       void *target = (void *)((uintptr_t)virt);
+
+       if (VirtualLock(target, size))
+               return 0;
+
+       RTE_LOG_WIN32_ERR("VirtualLock(%p %#zx)", virt, size);
+       return -1;
+}
+
+/* Initialize memory segment lists. Only the primary process is handled;
+ * any other process type is reported as not implemented.
+ */
+int
+rte_eal_memseg_init(void)
+{
+       if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+               return eal_dynmem_memseg_lists_init();
+
+       EAL_LOG_NOT_IMPLEMENTED();
+       return -1;
+}
+
+/* Set up memory for the no-hugepages mode.
+ *
+ * The configured amount of memory is allocated in one piece and attached
+ * to the first memory segment list as 4K pages. Legacy memory mode
+ * is switched on as a side effect.
+ *
+ * Returns 0 on success, (-1) on failure.
+ */
+static int
+eal_nohuge_init(void)
+{
+       struct rte_mem_config *mcfg;
+       struct rte_memseg_list *msl;
+       int n_segs;
+       uint64_t mem_sz, page_sz;
+       void *addr;
+
+       mcfg = rte_eal_get_configuration()->mem_config;
+
+       /* nohuge mode is legacy mode */
+       internal_config.legacy_mem = 1;
+
+       msl = &mcfg->memsegs[0];
+
+       mem_sz = internal_config.memory;
+       page_sz = RTE_PGSIZE_4K;
+       n_segs = mem_sz / page_sz;
+
+       if (eal_memseg_list_init_named(
+                       msl, "nohugemem", page_sz, n_segs, 0, true)) {
+               return -1;
+       }
+
+       addr = VirtualAlloc(
+               NULL, mem_sz, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
+       if (addr == NULL) {
+               /* mem_sz is uint64_t, so it needs the PRIx64 specifier. */
+               RTE_LOG_WIN32_ERR("VirtualAlloc(size=%#" PRIx64 ")", mem_sz);
+               RTE_LOG(ERR, EAL, "Cannot allocate memory\n");
+               return -1;
+       }
+
+       msl->base_va = addr;
+       msl->len = mem_sz;
+
+       eal_memseg_list_populate(msl, addr, n_segs);
+
+       /* The DMA mask is only checked if one has been configured. */
+       if (mcfg->dma_maskbits &&
+               rte_mem_check_dma_mask_thread_unsafe(mcfg->dma_maskbits)) {
+               RTE_LOG(ERR, EAL,
+                       "%s(): couldn't allocate memory due to IOVA "
+                       "exceeding limits of current DMA mask.\n", __func__);
+               return -1;
+       }
+
+       return 0;
+}
+
+/* Pick the memory setup routine: the no-hugepages one if hugepages
+ * are disabled in the configuration, the dynamic hugepage one otherwise.
+ */
+int
+rte_eal_hugepage_init(void)
+{
+       if (internal_config.no_hugetlbfs)
+               return eal_nohuge_init();
+
+       return eal_dynmem_hugepage_init();
+}
+
+/* Attaching to hugepages is not implemented: the call is logged
+ * and always fails with (-1).
+ */
+int
+rte_eal_hugepage_attach(void)
+{
+       const int failure = -1;
+
+       EAL_LOG_NOT_IMPLEMENTED();
+       return failure;
+}
diff --git a/lib/librte_eal/windows/eal_mp.c b/lib/librte_eal/windows/eal_mp.c
new file mode 100644 (file)
index 0000000..16a5e8b
--- /dev/null
@@ -0,0 +1,103 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2020 Dmitry Kozlyuk
+ */
+
+/**
+ * @file Multiprocess support stubs
+ *
+ * Stubs must log an error until implemented. If success is required
+ * for non-multiprocess operation, stub must log a warning and a comment
+ * must document what requires success emulation.
+ */
+
+#include <rte_eal.h>
+#include <rte_errno.h>
+
+#include "eal_private.h"
+#include "eal_windows.h"
+#include "malloc_mp.h"
+
+/* Stub: only logs that the function is not implemented. */
+void
+rte_mp_channel_cleanup(void)
+{
+       EAL_LOG_NOT_IMPLEMENTED();
+}
+
+/* Stub: both arguments are ignored, the call is logged
+ * as not implemented and (-1) is returned.
+ */
+int
+rte_mp_action_register(const char *name, rte_mp_t action)
+{
+       RTE_SET_USED(action);
+       RTE_SET_USED(name);
+
+       EAL_LOG_NOT_IMPLEMENTED();
+       return -1;
+}
+
+/* Stub: the argument is ignored and the call is logged
+ * as not implemented.
+ */
+void
+rte_mp_action_unregister(const char *name)
+{
+       RTE_SET_USED(name);
+
+       EAL_LOG_NOT_IMPLEMENTED();
+}
+
+/* Stub: the message is ignored, the call is logged
+ * as not implemented and (-1) is returned.
+ */
+int
+rte_mp_sendmsg(struct rte_mp_msg *msg)
+{
+       RTE_SET_USED(msg);
+
+       EAL_LOG_NOT_IMPLEMENTED();
+       return -1;
+}
+
+/* Stub: all arguments are ignored, the call is logged
+ * as not implemented and (-1) is returned.
+ */
+int
+rte_mp_request_sync(struct rte_mp_msg *req, struct rte_mp_reply *reply,
+       const struct timespec *ts)
+{
+       RTE_SET_USED(ts);
+       RTE_SET_USED(reply);
+       RTE_SET_USED(req);
+
+       EAL_LOG_NOT_IMPLEMENTED();
+       return -1;
+}
+
+/* Stub: all arguments are ignored, the call is logged
+ * as not implemented and (-1) is returned.
+ */
+int
+rte_mp_request_async(struct rte_mp_msg *req, const struct timespec *ts,
+               rte_mp_async_reply_t clb)
+{
+       RTE_SET_USED(clb);
+       RTE_SET_USED(ts);
+       RTE_SET_USED(req);
+
+       EAL_LOG_NOT_IMPLEMENTED();
+       return -1;
+}
+
+/* Stub: both arguments are ignored, the call is logged
+ * as not implemented and (-1) is returned.
+ */
+int
+rte_mp_reply(struct rte_mp_msg *msg, const char *peer)
+{
+       RTE_SET_USED(peer);
+       RTE_SET_USED(msg);
+
+       EAL_LOG_NOT_IMPLEMENTED();
+       return -1;
+}
+
+/* Stub that reports success: the non-stub function succeeds
+ * if multi-process is not supported, so this one returns 0 as well.
+ */
+int
+register_mp_requests(void)
+{
+       const int success = 0;
+
+       EAL_LOG_STUB();
+       return success;
+}
+
+/* Stub: the request is ignored, the call is logged
+ * as not implemented and (-1) is returned.
+ */
+int
+request_to_primary(struct malloc_mp_req *req)
+{
+       RTE_SET_USED(req);
+
+       EAL_LOG_NOT_IMPLEMENTED();
+       return -1;
+}
+
+/* Stub that reports success: the common memory allocator depends
+ * on this function succeeding, so it logs itself as a stub and returns 0.
+ */
+int
+request_sync(void)
+{
+       const int success = 0;
+
+       EAL_LOG_STUB();
+       return success;
+}
index 96f97e5..d48ee0a 100644 (file)
 #include <rte_errno.h>
 #include <rte_windows.h>
 
+/**
+ * Log current function as not implemented and set rte_errno.
+ *
+ * The message is emitted at DEBUG level and names the calling function;
+ * rte_errno is set to ENOTSUP. The do/while (0) wrapper makes the macro
+ * usable as a single statement.
+ */
+#define EAL_LOG_NOT_IMPLEMENTED() \
+       do { \
+               RTE_LOG(DEBUG, EAL, "%s() is not implemented\n", __func__); \
+               rte_errno = ENOTSUP; \
+       } while (0)
+
+/**
+ * Log current function as a stub.
+ *
+ * The message is emitted at DEBUG level and names the calling function.
+ * Unlike EAL_LOG_NOT_IMPLEMENTED(), rte_errno is not modified.
+ */
+#define EAL_LOG_STUB() \
+       RTE_LOG(DEBUG, EAL, "Windows: %s() is a stub\n", __func__)
+
 /**
  * Create a map of processors and cores on the system.
  *
@@ -40,4 +55,63 @@ int eal_thread_create(pthread_t *thread);
  */
 unsigned int eal_socket_numa_node(unsigned int socket_id);
 
+/**
+ * Open virt2phys driver interface device.
+ *
+ * @return 0 on success, (-1) on failure.
+ */
+int eal_mem_virt2iova_init(void);
+
+/**
+ * Locate Win32 memory management routines in system libraries.
+ *
+ * @return 0 on success, (-1) on failure.
+ */
+int eal_mem_win32api_init(void);
+
+/**
+ * Allocate new memory in hugepages on the specified NUMA node.
+ *
+ * @param size
+ *  Number of bytes to allocate. Must be a multiple of huge page size.
+ * @param socket_id
+ *  Socket ID.
+ * @return
+ *  Address of the memory allocated on success or NULL on failure.
+ */
+void *eal_mem_alloc_socket(size_t size, int socket_id);
+
+/**
+ * Commit memory previously reserved with eal_mem_reserve()
+ * or decommitted from hugepages by eal_mem_decommit().
+ *
+ * @param requested_addr
+ *  Address within a reserved region. Must not be NULL.
+ * @param size
+ *  Number of bytes to commit. Must be a multiple of page size.
+ * @param socket_id
+ *  Socket ID to allocate on. Can be SOCKET_ID_ANY.
+ * @return
+ *  On success, address of the committed memory, that is, requested_addr.
+ *  On failure, NULL and rte_errno is set.
+ */
+void *eal_mem_commit(void *requested_addr, size_t size, int socket_id);
+
+/**
+ * Put allocated or committed memory back into reserved state.
+ *
+ * @param addr
+ *  Address of the region to decommit.
+ * @param size
+ *  Number of bytes to decommit, must be the size of a page
+ *  (hugepage or regular one).
+ *
+ * The *addr* and *size* must match location and size
+ * of a previously allocated or committed region.
+ *
+ * @return
+ *  0 on success, (-1) on failure.
+ */
+int eal_mem_decommit(void *addr, size_t size);
+
 #endif /* _EAL_WINDOWS_H_ */
index 5fb1962..b3534b0 100644 (file)
@@ -5,5 +5,6 @@ includes += include_directories('.')
 
 headers += files(
         'rte_os.h',
+        'rte_virt2phys.h',
         'rte_windows.h',
 )
index 510e39e..cb10d64 100644 (file)
@@ -14,6 +14,7 @@
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -36,6 +37,9 @@ extern "C" {
 
 #define strncasecmp(s1, s2, count)        _strnicmp(s1, s2, count)
 
+#define close _close
+#define unlink _unlink
+
 /* cpu_set macros implementation */
 #define RTE_CPU_AND(dst, src1, src2) CPU_AND(dst, src1, src2)
 #define RTE_CPU_OR(dst, src1, src2) CPU_OR(dst, src1, src2)
@@ -46,6 +50,7 @@ extern "C" {
 typedef long long ssize_t;
 
 #ifndef RTE_TOOLCHAIN_GCC
+
 static inline int
 asprintf(char **buffer, const char *format, ...)
 {
@@ -72,6 +77,18 @@ asprintf(char **buffer, const char *format, ...)
        }
        return ret;
 }
+
+/*
+ * Return a message for an error code, built on strerror_s().
+ *
+ * The text is written into a static 128-byte buffer local to this function,
+ * so every call overwrites the result of the previous one.
+ * The return value of strerror_s() is not checked.
+ */
+static inline const char *
+eal_strerror(int code)
+{
+       static char buffer[128];
+
+       strerror_s(buffer, sizeof(buffer), code);
+       return buffer;
+}
+
+#define strerror eal_strerror
+
 #endif /* RTE_TOOLCHAIN_GCC */
 
 #ifdef __cplusplus
diff --git a/lib/librte_eal/windows/include/rte_virt2phys.h b/lib/librte_eal/windows/include/rte_virt2phys.h
new file mode 100644 (file)
index 0000000..4bb2b4a
--- /dev/null
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (c) 2020 Dmitry Kozlyuk
+ */
+
+/**
+ * @file virt2phys driver interface
+ */
+
+/**
+ * Driver device interface GUID {539c2135-793a-4926-afec-d3a1b61bbc8a}.
+ */
+DEFINE_GUID(GUID_DEVINTERFACE_VIRT2PHYS,
+       0x539c2135, 0x793a, 0x4926,
+       0xaf, 0xec, 0xd3, 0xa1, 0xb6, 0x1b, 0xbc, 0x8a);
+
+/**
+ * Driver device type for IO control codes.
+ */
+#define VIRT2PHYS_DEVTYPE 0x8000
+
+/**
+ * Translate a valid non-paged virtual address to a physical address.
+ *
+ * Note: A physical address zero (0) is reported if input address
+ * is paged out or not mapped. However, if input is a valid mapping
+ * of I/O port 0x0000, output is also zero. There is no way
+ * to distinguish between these cases by return value only.
+ *
+ * Input: a non-paged virtual address (PVOID).
+ *
+ * Output: the corresponding physical address (LARGE_INTEGER).
+ */
+#define IOCTL_VIRT2PHYS_TRANSLATE CTL_CODE( \
+       VIRT2PHYS_DEVTYPE, 0x800, METHOD_BUFFERED, FILE_ANY_ACCESS)
index ed6e4c1..899ed7d 100644 (file)
@@ -23,6 +23,8 @@
 
 #include <basetsd.h>
 #include <psapi.h>
+#include <setupapi.h>
+#include <winioctl.h>
 
 /* Have GUIDs defined. */
 #ifndef INITGUID
index 757b7f3..6b33005 100644 (file)
@@ -9,4 +9,7 @@
  * as Microsoft libc does not contain unistd.h. This may be removed
  * in future releases.
  */
+
+#include <io.h>
+
 #endif /* _UNISTD_H_ */
index 52978e9..ded5a2b 100644 (file)
@@ -6,10 +6,16 @@ subdir('include')
 sources += files(
        'eal.c',
        'eal_debug.c',
+       'eal_file.c',
        'eal_hugepages.c',
        'eal_lcore.c',
        'eal_log.c',
+       'eal_memalloc.c',
+       'eal_memory.c',
+       'eal_mp.c',
        'eal_thread.c',
        'fnmatch.c',
        'getopt.c',
 )
+
+dpdk_conf.set10('RTE_EAL_NUMA_AWARE_HUGEPAGES', true)