lib/librte_eal/linuxapp/eal/eal_memalloc.c

   1 /* SPDX-License-Identifier: BSD-3-Clause
   2  * Copyright(c) 2017-2018 Intel Corporation
   3  */
   4
   5 #define _FILE_OFFSET_BITS 64
   6 #include <errno.h>
   7 #include <stdarg.h>
   8 #include <stdbool.h>
   9 #include <stdlib.h>
  10 #include <stdio.h>
  11 #include <stdint.h>
  12 #include <inttypes.h>
  13 #include <string.h>
  14 #include <sys/mman.h>
  15 #include <sys/types.h>
  16 #include <sys/stat.h>
  17 #include <sys/queue.h>
  18 #include <sys/file.h>
  19 #include <unistd.h>
  20 #include <limits.h>
  21 #include <fcntl.h>
  22 #include <sys/ioctl.h>
  23 #include <sys/time.h>
  24 #include <signal.h>
  25 #include <setjmp.h>
  26 #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
  27 #include <numa.h>
  28 #include <numaif.h>
  29 #endif
  30 #include <linux/falloc.h>
  31 #include <linux/mman.h> /* for hugetlb-related mmap flags */
  32
  33 #include <rte_common.h>
  34 #include <rte_log.h>
  35 #include <rte_eal_memconfig.h>
  36 #include <rte_eal.h>
  37 #include <rte_errno.h>
  38 #include <rte_memory.h>
  39 #include <rte_spinlock.h>
  40
  41 #include "eal_filesystem.h"
  42 #include "eal_internal_cfg.h"
  43 #include "eal_memalloc.h"
  44 #include "eal_private.h"
  45
  /*
   * Anonymous hugepage mappings encode the page size into the mmap() flags by
   * shifting it with MAP_HUGE_SHIFT. When the system headers provide that
   * macro, use it and advertise support; otherwise fall back to a shift of 26
   * and report anonymous hugepages as unsupported.
   */
  #ifdef MAP_HUGE_SHIFT
  #define RTE_MAP_HUGE_SHIFT MAP_HUGE_SHIFT
  const int anonymous_hugepages_supported = 1;
  #else
  #define RTE_MAP_HUGE_SHIFT 26
  const int anonymous_hugepages_supported = 0;
  #endif
  54
  55 /*
  56  * not all kernel versions support fallocate on hugetlbfs, so fall back to
  57  * ftruncate and disallow deallocation if fallocate is not supported.
      *
      * this is a tri-state flag maintained by resize_hugefile():
      *   -1 - support has not been determined yet,
      *    0 - fallocate() failed with ENOTSUP, ftruncate() is used instead,
      *    1 - a fallocate() call has succeeded.
  58  */
  59 static int fallocate_supported = -1; /* unknown */
  60
  61 /*
  62  * we have two modes - single file segments, and file-per-page mode.
  63  *
  64  * for single-file segments, we need some kind of mechanism to keep track of
  65  * which hugepages can be freed back to the system, and which cannot. we cannot
  66  * use flock() because they don't allow locking parts of a file, and we cannot
  67  * use fcntl() due to issues with their semantics, so we will have to rely on a
  68  * bunch of lockfiles for each page. so, we will use 'fds' array to keep track
  69  * of per-page lockfiles. we will store the actual segment list fd in the
  70  * 'memseg_list_fd' field.
  71  *
  72  * for file-per-page mode, each page will have its own fd, so 'memseg_list_fd'
  73  * will be invalid (set to -1), and we'll use 'fds' to keep track of page fd's.
  74  *
  75  * we cannot know how many pages a system will have in advance, but we do know
  76  * that they come in lists, and we know lengths of these lists. so, simply store
  77  * a malloc'd array of fd's indexed by list and segment index.
  78  *
  79  * they will be initialized at startup, and filled as we allocate/deallocate
  80  * segments.
  81  */
  82 static struct {
  83         int *fds; /**< dynamically allocated array of segment lock fd's */
                     /* (page file fd's in file-per-page mode); a slot is
                      * reset to -1 whenever its fd is closed.
                      */
  84         int memseg_list_fd; /**< memseg list fd */
                     /* only used in single-file segments mode; -1 while the
                      * list file is not open.
                      */
  85         int len; /**< total length of the array */
                     /* i.e. number of slots in 'fds'; used for bounds checks */
  86         int count; /**< entries used in an array */
                     /* incremented by get_segment_lock_fd() when it stores a
                      * new lockfile fd, decremented by unlock_segment().
                      */
  87 } fd_list[RTE_MAX_MEMSEG_LISTS];
  88
  89 /** local copy of a memory map, used to synchronize memory hotplug in MP */
  90 static struct rte_memseg_list local_memsegs[RTE_MAX_MEMSEG_LISTS];
  91
  92 static sigjmp_buf huge_jmpenv;
  93
  94 static void __rte_unused huge_sigbus_handler(int signo __rte_unused)
  95 {
  96         siglongjmp(huge_jmpenv, 1);
  97 }
  98
  99 /* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
 100  * non-static local variable in the stack frame calling sigsetjmp might be
 101  * clobbered by a call to longjmp.
 102  */
 103 static int __rte_unused huge_wrap_sigsetjmp(void)
 104 {
 105         return sigsetjmp(huge_jmpenv, 1);
 106 }
 107
 108 static struct sigaction huge_action_old; /* SIGBUS action replaced by huge_register_sigbus() */
 109 static int huge_need_recover; /* non-zero while huge_action_old still has to be restored */
 110
 111 static void __rte_unused
 112 huge_register_sigbus(void)
 113 {
 114         sigset_t mask;
 115         struct sigaction action;
 116
 117         sigemptyset(&mask);
 118         sigaddset(&mask, SIGBUS);
 119         action.sa_flags = 0;
 120         action.sa_mask = mask;
 121         action.sa_handler = huge_sigbus_handler;
 122
 123         huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
 124 }
 125
 126 static void __rte_unused
 127 huge_recover_sigbus(void)
 128 {
 129         if (huge_need_recover) {
 130                 sigaction(SIGBUS, &huge_action_old, NULL);
 131                 huge_need_recover = 0;
 132         }
 133 }
 134
 135 #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
 136 static bool
 137 check_numa(void)
 138 {
 139         bool ret = true;
 140         /* Check if kernel supports NUMA. */
 141         if (numa_available() != 0) {
 142                 RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n");
 143                 ret = false;
 144         }
 145         return ret;
 146 }
 147
 148 static void
 149 prepare_numa(int *oldpolicy, struct bitmask *oldmask, int socket_id)
 150 {
 151         RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
 152         if (get_mempolicy(oldpolicy, oldmask->maskp,
 153                           oldmask->size + 1, 0, 0) < 0) {
 154                 RTE_LOG(ERR, EAL,
 155                         "Failed to get current mempolicy: %s. "
 156                         "Assuming MPOL_DEFAULT.\n", strerror(errno));
 157                 oldpolicy = MPOL_DEFAULT;
 158         }
 159         RTE_LOG(DEBUG, EAL,
 160                 "Setting policy MPOL_PREFERRED for socket %d\n",
 161                 socket_id);
 162         numa_set_preferred(socket_id);
 163 }
 164
 165 static void
 166 restore_numa(int *oldpolicy, struct bitmask *oldmask)
 167 {
 168         RTE_LOG(DEBUG, EAL,
 169                 "Restoring previous memory policy: %d\n", *oldpolicy);
 170         if (*oldpolicy == MPOL_DEFAULT) {
 171                 numa_set_localalloc();
 172         } else if (set_mempolicy(*oldpolicy, oldmask->maskp,
 173                                  oldmask->size + 1) < 0) {
 174                 RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n",
 175                         strerror(errno));
 176                 numa_set_localalloc();
 177         }
 178         numa_free_cpumask(oldmask);
 179 }
 180 #endif
 181
 /*
  * report the size of the file behind 'fd' as seen by fstat();
  * a failing fstat() is reported as a size of 0.
  */
 static off_t
 get_file_size(int fd)
 {
         struct stat info;

         return fstat(fd, &info) < 0 ? 0 : info.st_size;
 }
 193
 194 /* returns 1 on successful lock, 0 on unsuccessful lock, -1 on error */
 195 static int lock(int fd, int type)
 196 {
 197         int ret;
 198
 199         /* flock may be interrupted */
 200         do {
 201                 ret = flock(fd, type | LOCK_NB);
 202         } while (ret && errno == EINTR);
 203
 204         if (ret && errno == EWOULDBLOCK) {
 205                 /* couldn't lock */
 206                 return 0;
 207         } else if (ret) {
 208                 RTE_LOG(ERR, EAL, "%s(): error calling flock(): %s\n",
 209                         __func__, strerror(errno));
 210                 return -1;
 211         }
 212         /* lock was successful */
 213         return 1;
 214 }
 215
 216 static int get_segment_lock_fd(int list_idx, int seg_idx)
 217 {
 218         char path[PATH_MAX] = {0};
 219         int fd;
 220
 221         if (list_idx < 0 || list_idx >= (int)RTE_DIM(fd_list))
 222                 return -1;
 223         if (seg_idx < 0 || seg_idx >= fd_list[list_idx].len)
 224                 return -1;
 225
 226         fd = fd_list[list_idx].fds[seg_idx];
 227         /* does this lock already exist? */
 228         if (fd >= 0)
 229                 return fd;
 230
 231         eal_get_hugefile_lock_path(path, sizeof(path),
 232                         list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
 233
 234         fd = open(path, O_CREAT | O_RDWR, 0660);
 235         if (fd < 0) {
 236                 RTE_LOG(ERR, EAL, "%s(): error creating lockfile '%s': %s\n",
 237                         __func__, path, strerror(errno));
 238                 return -1;
 239         }
 240         /* take out a read lock */
 241         if (lock(fd, LOCK_SH) != 1) {
 242                 RTE_LOG(ERR, EAL, "%s(): failed to take out a readlock on '%s': %s\n",
 243                         __func__, path, strerror(errno));
 244                 close(fd);
 245                 return -1;
 246         }
 247         /* store it for future reference */
 248         fd_list[list_idx].fds[seg_idx] = fd;
 249         fd_list[list_idx].count++;
 250         return fd;
 251 }
 252
 253 static int unlock_segment(int list_idx, int seg_idx)
 254 {
 255         int fd, ret;
 256
 257         if (list_idx < 0 || list_idx >= (int)RTE_DIM(fd_list))
 258                 return -1;
 259         if (seg_idx < 0 || seg_idx >= fd_list[list_idx].len)
 260                 return -1;
 261
 262         fd = fd_list[list_idx].fds[seg_idx];
 263
 264         /* upgrade lock to exclusive to see if we can remove the lockfile */
 265         ret = lock(fd, LOCK_EX);
 266         if (ret == 1) {
 267                 /* we've succeeded in taking exclusive lock, this lockfile may
 268                  * be removed.
 269                  */
 270                 char path[PATH_MAX] = {0};
 271                 eal_get_hugefile_lock_path(path, sizeof(path),
 272                                 list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
 273                 if (unlink(path)) {
 274                         RTE_LOG(ERR, EAL, "%s(): error removing lockfile '%s': %s\n",
 275                                         __func__, path, strerror(errno));
 276                 }
 277         }
 278         /* we don't want to leak the fd, so even if we fail to lock, close fd
 279          * and remove it from list anyway.
 280          */
 281         close(fd);
 282         fd_list[list_idx].fds[seg_idx] = -1;
 283         fd_list[list_idx].count--;
 284
 285         if (ret < 0)
 286                 return -1;
 287         return 0;
 288 }
 289
 290 static int
 291 get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
 292                 unsigned int list_idx, unsigned int seg_idx)
 293 {
 294         int fd;
 295
 296         if (internal_config.single_file_segments) {
 297                 /* create a hugepage file path */
 298                 eal_get_hugefile_path(path, buflen, hi->hugedir, list_idx);
 299
 300                 fd = fd_list[list_idx].memseg_list_fd;
 301
 302                 if (fd < 0) {
 303                         fd = open(path, O_CREAT | O_RDWR, 0600);
 304                         if (fd < 0) {
 305                                 RTE_LOG(ERR, EAL, "%s(): open failed: %s\n",
 306                                         __func__, strerror(errno));
 307                                 return -1;
 308                         }
 309                         /* take out a read lock and keep it indefinitely */
 310                         if (lock(fd, LOCK_SH) < 0) {
 311                                 RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n",
 312                                         __func__, strerror(errno));
 313                                 close(fd);
 314                                 return -1;
 315                         }
 316                         fd_list[list_idx].memseg_list_fd = fd;
 317                 }
 318         } else {
 319                 /* create a hugepage file path */
 320                 eal_get_hugefile_path(path, buflen, hi->hugedir,
 321                                 list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
 322
 323                 fd = fd_list[list_idx].fds[seg_idx];
 324
 325                 if (fd < 0) {
 326                         fd = open(path, O_CREAT | O_RDWR, 0600);
 327                         if (fd < 0) {
 328                                 RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n",
 329                                         __func__, strerror(errno));
 330                                 return -1;
 331                         }
 332                         /* take out a read lock */
 333                         if (lock(fd, LOCK_SH) < 0) {
 334                                 RTE_LOG(ERR, EAL, "%s(): lock failed: %s\n",
 335                                         __func__, strerror(errno));
 336                                 close(fd);
 337                                 return -1;
 338                         }
 339                         fd_list[list_idx].fds[seg_idx] = fd;
 340                 }
 341         }
 342         return fd;
 343 }
 344
 345 static int
 346 resize_hugefile(int fd, char *path, int list_idx, int seg_idx,
 347                 uint64_t fa_offset, uint64_t page_sz, bool grow)
 348 {
 349         bool again = false;
 350         do {
 351                 if (fallocate_supported == 0) {
 352                         /* we cannot deallocate memory if fallocate() is not
 353                          * supported, and hugepage file is already locked at
 354                          * creation, so no further synchronization needed.
 355                          */
 356
 357                         if (!grow) {
 358                                 RTE_LOG(DEBUG, EAL, "%s(): fallocate not supported, not freeing page back to the system\n",
 359                                         __func__);
 360                                 return -1;
 361                         }
 362                         uint64_t new_size = fa_offset + page_sz;
 363                         uint64_t cur_size = get_file_size(fd);
 364
 365                         /* fallocate isn't supported, fall back to ftruncate */
 366                         if (new_size > cur_size &&
 367                                         ftruncate(fd, new_size) < 0) {
 368                                 RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
 369                                         __func__, strerror(errno));
 370                                 return -1;
 371                         }
 372                 } else {
 373                         int flags = grow ? 0 : FALLOC_FL_PUNCH_HOLE |
 374                                         FALLOC_FL_KEEP_SIZE;
 375                         int ret, lock_fd;
 376
 377                         /* if fallocate() is supported, we need to take out a
 378                          * read lock on allocate (to prevent other processes
 379                          * from deallocating this page), and take out a write
 380                          * lock on deallocate (to ensure nobody else is using
 381                          * this page).
 382                          *
 383                          * read locks on page itself are already taken out at
 384                          * file creation, in get_seg_fd().
 385                          *
 386                          * we cannot rely on simple use of flock() call, because
 387                          * we need to be able to lock a section of the file,
 388                          * and we cannot use fcntl() locks, because of numerous
 389                          * problems with their semantics, so we will use
 390                          * deterministically named lock files for each section
 391                          * of the file.
 392                          *
 393                          * if we're shrinking the file, we want to upgrade our
 394                          * lock from shared to exclusive.
 395                          *
 396                          * lock_fd is an fd for a lockfile, not for the segment
 397                          * list.
 398                          */
 399                         lock_fd = get_segment_lock_fd(list_idx, seg_idx);
 400
 401                         if (!grow) {
 402                                 /* we are using this lockfile to determine
 403                                  * whether this particular page is locked, as we
 404                                  * are in single file segments mode and thus
 405                                  * cannot use regular flock() to get this info.
 406                                  *
 407                                  * we want to try and take out an exclusive lock
 408                                  * on the lock file to determine if we're the
 409                                  * last ones using this page, and if not, we
 410                                  * won't be shrinking it, and will instead exit
 411                                  * prematurely.
 412                                  */
 413                                 ret = lock(lock_fd, LOCK_EX);
 414
 415                                 /* drop the lock on the lockfile, so that even
 416                                  * if we couldn't shrink the file ourselves, we
 417                                  * are signalling to other processes that we're
 418                                  * no longer using this page.
 419                                  */
 420                                 if (unlock_segment(list_idx, seg_idx))
 421                                         RTE_LOG(ERR, EAL, "Could not unlock segment\n");
 422
 423                                 /* additionally, if this was the last lock on
 424                                  * this segment list, we can safely close the
 425                                  * page file fd, so that one of the processes
 426                                  * could then delete the file after shrinking.
 427                                  */
 428                                 if (ret < 1 && fd_list[list_idx].count == 0) {
 429                                         close(fd);
 430                                         fd_list[list_idx].memseg_list_fd = -1;
 431                                 }
 432
 433                                 if (ret < 0) {
 434                                         RTE_LOG(ERR, EAL, "Could not lock segment\n");
 435                                         return -1;
 436                                 }
 437                                 if (ret == 0)
 438                                         /* failed to lock, not an error. */
 439                                         return 0;
 440                         }
 441
 442                         /* grow or shrink the file */
 443                         ret = fallocate(fd, flags, fa_offset, page_sz);
 444
 445                         if (ret < 0) {
 446                                 if (fallocate_supported == -1 &&
 447                                                 errno == ENOTSUP) {
 448                                         RTE_LOG(ERR, EAL, "%s(): fallocate() not supported, hugepage deallocation will be disabled\n",
 449                                                 __func__);
 450                                         again = true;
 451                                         fallocate_supported = 0;
 452                                 } else {
 453                                         RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
 454                                                 __func__,
 455                                                 strerror(errno));
 456                                         return -1;
 457                                 }
 458                         } else {
 459                                 fallocate_supported = 1;
 460
 461                                 /* we've grew/shrunk the file, and we hold an
 462                                  * exclusive lock now. check if there are no
 463                                  * more segments active in this segment list,
 464                                  * and remove the file if there aren't.
 465                                  */
 466                                 if (fd_list[list_idx].count == 0) {
 467                                         if (unlink(path))
 468                                                 RTE_LOG(ERR, EAL, "%s(): unlinking '%s' failed: %s\n",
 469                                                         __func__, path,
 470                                                         strerror(errno));
 471                                         close(fd);
 472                                         fd_list[list_idx].memseg_list_fd = -1;
 473                                 }
 474                         }
 475                 }
 476         } while (again);
 477         return 0;
 478 }
 479
 /*
  * Map one hugepage at the fixed address 'addr' for segment 'seg_idx' of
  * memseg list 'list_idx', check that it is really usable, and describe it
  * in 'ms'.
  *
  * ms        - segment descriptor, filled in only on success.
  * addr      - address the page must be mapped at (MAP_FIXED is used).
  * socket_id - socket recorded in 'ms'; when RTE_EAL_NUMA_AWARE_HUGEPAGES is
  *             defined, the allocation is rejected if the page ended up on a
  *             different socket.
  * hi        - hugepage info; supplies the page size and is passed on to
  *             get_seg_fd().
  * list_idx, seg_idx - select the fd slot and, in single-file segments mode,
  *             the offset of the page within the list's file.
  *
  * Returns 0 on success, -1 on failure. The failure labels form a chain:
  * 'mapped' unmaps the page, 'unmapped' tries to re-reserve the address
  * range, and 'resized' releases the backing file resources.
  */
 480 static int
 481 alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
 482                 struct hugepage_info *hi, unsigned int list_idx,
 483                 unsigned int seg_idx)
 484 {
 485 #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
 486         int cur_socket_id = 0;
 487 #endif
 488         uint64_t map_offset;
 489         rte_iova_t iova;
 490         void *va;
 491         char path[PATH_MAX];
 492         int ret = 0;
 493         int fd;
 494         size_t alloc_sz;
 495         int flags;
 496         void *new_addr;
 497
 498         alloc_sz = hi->hugepage_sz;
             /* in-memory mode with anonymous hugepage support: map the page
              * anonymously, without any backing file (fd stays -1).
              */
 499         if (!internal_config.single_file_segments &&
 500                         internal_config.in_memory &&
 501                         anonymous_hugepages_supported) {
                     /* note: this 'flags' shadows the function-scope 'flags'
                      * that is used at the 'unmapped' label below.
                      */
 502                 int log2, flags;
 503
 504                 log2 = rte_log2_u32(alloc_sz);
 505                 /* as per mmap() manpage, all page sizes are log2 of page size
 506                  * shifted by MAP_HUGE_SHIFT
 507                  */
 508                 flags = (log2 << RTE_MAP_HUGE_SHIFT) | MAP_HUGETLB | MAP_FIXED |
 509                                 MAP_PRIVATE | MAP_ANONYMOUS;
 510                 fd = -1;
 511                 va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE, flags, -1, 0);
 512
 513                 /* single-file segments codepath will never be active because
 514                  * in-memory mode is incompatible with it and it's stopped at
 515                  * EAL initialization stage, however the compiler doesn't know
 516                  * that and complains about map_offset being used uninitialized
 517                  * on failure codepaths while having in-memory mode enabled. so,
 518                  * assign a value here.
 519                  */
 520                 map_offset = 0;
 521         } else {
 522                 /* takes out a read lock on segment or segment list */
 523                 fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
 524                 if (fd < 0) {
 525                         RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n");
 526                         return -1;
 527                 }
 528
 529                 if (internal_config.single_file_segments) {
                             /* one file per list: the page lives at its own
                              * offset, and the file is grown to cover it.
                              */
 530                         map_offset = seg_idx * alloc_sz;
 531                         ret = resize_hugefile(fd, path, list_idx, seg_idx,
 532                                         map_offset, alloc_sz, true);
 533                         if (ret < 0)
 534                                 goto resized;
 535                 } else {
                             /* one file per page: size it to a single page */
 536                         map_offset = 0;
 537                         if (ftruncate(fd, alloc_sz) < 0) {
 538                                 RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
 539                                         __func__, strerror(errno));
 540                                 goto resized;
 541                         }
 542                         if (internal_config.hugepage_unlink) {
 543                                 if (unlink(path)) {
 544                                         RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n",
 545                                                 __func__, strerror(errno));
 546                                         goto resized;
 547                                 }
 548                         }
 549                 }
 550
 551                 /*
 552                  * map the segment, and populate page tables, the kernel fills
 553                  * this segment with zeros if it's a new page.
 554                  */
 555                 va = mmap(addr, alloc_sz, PROT_READ | PROT_WRITE,
 556                                 MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd,
 557                                 map_offset);
 558         }
 559
 560         if (va == MAP_FAILED) {
 561                 RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__,
 562                         strerror(errno));
 563                 /* mmap failed, but the previous region might have been
 564                  * unmapped anyway. try to remap it
 565                  */
 566                 goto unmapped;
 567         }
 568         if (va != addr) {
 569                 RTE_LOG(DEBUG, EAL, "%s(): wrong mmap() address\n", __func__);
 570                 munmap(va, alloc_sz);
 571                 goto resized;
 572         }
 573
 574         /* In linux, hugetlb limitations, like cgroup, are
 575          * enforced at fault time instead of mmap(), even
 576          * with the option of MAP_POPULATE. Kernel will send
 577          * a SIGBUS signal. To avoid to be killed, save stack
 578          * environment here, if SIGBUS happens, we can jump
 579          * back here.
 580          */
             /* a non-zero return means we got here from huge_sigbus_handler() */
 581         if (huge_wrap_sigsetjmp()) {
 582                 RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more hugepages of size %uMB\n",
 583                         (unsigned int)(alloc_sz >> 20));
 584                 goto mapped;
 585         }
 586
 587         /* we need to trigger a write to the page to enforce page fault and
 588          * ensure that page is accessible to us, but we can't overwrite value
 589          * that is already there, so read the old value, and write it back.
 590          * kernel populates the page with zeroes initially.
 591          */
 592         *(volatile int *)addr = *(volatile int *)addr;
 593
 594         iova = rte_mem_virt2iova(addr);
 595         if (iova == RTE_BAD_PHYS_ADDR) {
 596                 RTE_LOG(DEBUG, EAL, "%s(): can't get IOVA addr\n",
 597                         __func__);
 598                 goto mapped;
 599         }
 600
 601 #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
             /* NOTE(review): the return value of move_pages() is not checked;
              * cur_socket_id starts out as 0 - confirm what it holds when the
              * call fails.
              */
 602         move_pages(getpid(), 1, &addr, NULL, &cur_socket_id, 0);
 603
 604         if (cur_socket_id != socket_id) {
 605                 RTE_LOG(DEBUG, EAL,
 606                                 "%s(): allocation happened on wrong socket (wanted %d, got %d)\n",
 607                         __func__, socket_id, cur_socket_id);
 608                 goto mapped;
 609         }
 610 #endif
 611
             /* the page is usable: publish it in the segment descriptor */
 612         ms->addr = addr;
 613         ms->hugepage_sz = alloc_sz;
 614         ms->len = alloc_sz;
 615         ms->nchannel = rte_memory_get_nchannel();
 616         ms->nrank = rte_memory_get_nrank();
 617         ms->iova = iova;
 618         ms->socket_id = socket_id;
 619
 620         return 0;
 621
             /* failure after a successful mapping: remove the page mapping */
 622 mapped:
 623         munmap(addr, alloc_sz);
             /* the address range is (or may be) unmapped now: try to reserve it
              * again through eal_get_virtual_area() at the very same address.
              */
 624 unmapped:
 625         flags = MAP_FIXED;
 626 #ifdef RTE_ARCH_PPC_64
 627         flags |= MAP_HUGETLB;
 628 #endif
 629         new_addr = eal_get_virtual_area(addr, &alloc_sz, alloc_sz, 0, flags);
 630         if (new_addr != addr) {
 631                 if (new_addr != NULL)
 632                         munmap(new_addr, alloc_sz);
 633                 /* we're leaving a hole in our virtual address space. if
 634                  * somebody else maps this hole now, we could accidentally
 635                  * override it in the future.
 636                  */
 637                 RTE_LOG(CRIT, EAL, "Can't mmap holes in our virtual address space\n");
 638         }
             /* release whatever was set up for the backing file */
 639 resized:
 640         /* some codepaths will return negative fd, so exit early */
 641         if (fd < 0)
 642                 return -1;
 643
 644         if (internal_config.single_file_segments) {
 645                 resize_hugefile(fd, path, list_idx, seg_idx, map_offset,
 646                                 alloc_sz, false);
 647                 /* ignore failure, can't make it any worse */
 648         } else {
 649                 /* only remove file if we can take out a write lock */
 650                 if (internal_config.hugepage_unlink == 0 &&
 651                                 internal_config.in_memory == 0 &&
 652                                 lock(fd, LOCK_EX) == 1)
 653                         unlink(path);
 654                 close(fd);
 655                 fd_list[list_idx].fds[seg_idx] = -1;
 656         }
 657         return -1;
 658 }
 659
 660 static int
 661 free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
 662                 unsigned int list_idx, unsigned int seg_idx)
 663 {
 664         uint64_t map_offset;
 665         char path[PATH_MAX];
 666         int fd, ret;
 667
 668         /* erase page data */
 669         memset(ms->addr, 0, ms->len);
 670
 671         if (mmap(ms->addr, ms->len, PROT_READ,
 672                         MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) ==
 673                                 MAP_FAILED) {
 674                 RTE_LOG(DEBUG, EAL, "couldn't unmap page\n");
 675                 return -1;
 676         }
 677
 678         /* if we've already unlinked the page, nothing needs to be done */
 679         if (internal_config.hugepage_unlink) {
 680                 memset(ms, 0, sizeof(*ms));
 681                 return 0;
 682         }
 683
 684         /* if we are not in single file segments mode, we're going to unmap the
 685          * segment and thus drop the lock on original fd, but hugepage dir is
 686          * now locked so we can take out another one without races.
 687          */
 688         fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
 689         if (fd < 0)
 690                 return -1;
 691
 692         if (internal_config.single_file_segments) {
 693                 map_offset = seg_idx * ms->len;
 694                 if (resize_hugefile(fd, path, list_idx, seg_idx, map_offset,
 695                                 ms->len, false))
 696                         return -1;
 697                 ret = 0;
 698         } else {
 699                 /* if we're able to take out a write lock, we're the last one
 700                  * holding onto this page.
 701                  */
 702                 ret = lock(fd, LOCK_EX);
 703                 if (ret >= 0) {
 704                         /* no one else is using this page */
 705                         if (ret == 1)
 706                                 unlink(path);
 707                 }
 708                 /* closing fd will drop the lock */
 709                 close(fd);
 710                 fd_list[list_idx].fds[seg_idx] = -1;
 711         }
 712
 713         memset(ms, 0, sizeof(*ms));
 714
 715         return ret < 0 ? -1 : 0;
 716 }
 717
 /* parameters for alloc_seg_walk(), which receives them through its 'arg' pointer */
 718 struct alloc_walk_param {
 719         struct hugepage_info *hi; /* hugepage info: its hugedir is locked during the walk, and it is passed to alloc_seg() */
 720         struct rte_memseg **ms; /* presumably receives the allocated segments - not used in the visible code, confirm */
 721         size_t page_sz; /* only memseg lists with this page size are considered */
 722         unsigned int segs_allocated; /* presumably the number of segments allocated so far - confirm */
 723         unsigned int n_segs; /* number of contiguous free segments to find and allocate */
 724         int socket; /* only memseg lists on this socket are considered; also passed to alloc_seg() */
 725         bool exact; /* NOTE(review): meaning not visible in this part of the file - confirm */
 726 };
 727 static int
 728 alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
 729 {
 730         struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
 731         struct alloc_walk_param *wa = arg;
 732         struct rte_memseg_list *cur_msl;
 733         size_t page_sz;
 734         int cur_idx, start_idx, j, dir_fd = -1;
 735         unsigned int msl_idx, need, i;
 736
 737         if (msl->page_sz != wa->page_sz)
 738                 return 0;
 739         if (msl->socket_id != wa->socket)
 740                 return 0;
 741
 742         page_sz = (size_t)msl->page_sz;
 743
 744         msl_idx = msl - mcfg->memsegs;
 745         cur_msl = &mcfg->memsegs[msl_idx];
 746
 747         need = wa->n_segs;
 748
 749         /* try finding space in memseg list */
 750         cur_idx = rte_fbarray_find_next_n_free(&cur_msl->memseg_arr, 0, need);
 751         if (cur_idx < 0)
 752                 return 0;
 753         start_idx = cur_idx;
 754
 755         /* do not allow any page allocations during the time we're allocating,
 756          * because file creation and locking operations are not atomic,
 757          * and we might be the first or the last ones to use a particular page,
 758          * so we need to ensure atomicity of every operation.
 759          *
 760          * during init, we already hold a write lock, so don't try to take out
 761          * another one.
 762          */
 763         if (wa->hi->lock_descriptor == -1 && !internal_config.in_memory) {
 764                 dir_fd = open(wa->hi->hugedir, O_RDONLY);
 765                 if (dir_fd < 0) {
 766                         RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
 767                                 __func__, wa->hi->hugedir, strerror(errno));
 768                         return -1;
 769                 }
 770                 /* blocking writelock */
 771                 if (flock(dir_fd, LOCK_EX)) {
 772                         RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n",
 773                                 __func__, wa->hi->hugedir, strerror(errno));
 774                         close(dir_fd);
 775                         return -1;
 776                 }
 777         }
 778
 779         for (i = 0; i < need; i++, cur_idx++) {
 780                 struct rte_memseg *cur;
 781                 void *map_addr;
 782
 783                 cur = rte_fbarray_get(&cur_msl->memseg_arr, cur_idx);
 784                 map_addr = RTE_PTR_ADD(cur_msl->base_va,
 785                                 cur_idx * page_sz);
 786
 787                 if (alloc_seg(cur, map_addr, wa->socket, wa->hi,
 788                                 msl_idx, cur_idx)) {
 789                         RTE_LOG(DEBUG, EAL, "attempted to allocate %i segments, but only %i were allocated\n",
 790                                 need, i);
 791
 792                         /* if exact number wasn't requested, stop */
 793                         if (!wa->exact)
 794                                 goto out;
 795
 796                         /* clean up */
 797                         for (j = start_idx; j < cur_idx; j++) {
 798                                 struct rte_memseg *tmp;
 799                                 struct rte_fbarray *arr =
 800                                                 &cur_msl->memseg_arr;
 801
 802                                 tmp = rte_fbarray_get(arr, j);
 803                                 rte_fbarray_set_free(arr, j);
 804
 805                                 /* free_seg may attempt to create a file, which
 806                                  * may fail.
 807                                  */
 808                                 if (free_seg(tmp, wa->hi, msl_idx, j))
 809                                         RTE_LOG(DEBUG, EAL, "Cannot free page\n");
 810                         }
 811                         /* clear the list */
 812                         if (wa->ms)
 813                                 memset(wa->ms, 0, sizeof(*wa->ms) * wa->n_segs);
 814
 815                         if (dir_fd >= 0)
 816                                 close(dir_fd);
 817                         return -1;
 818                 }
 819                 if (wa->ms)
 820                         wa->ms[i] = cur;
 821
 822                 rte_fbarray_set_used(&cur_msl->memseg_arr, cur_idx);
 823         }
 824 out:
 825         wa->segs_allocated = i;
 826         if (i > 0)
 827                 cur_msl->version++;
 828         if (dir_fd >= 0)
 829                 close(dir_fd);
 830         return 1;
 831 }
 832
 833 struct free_walk_param {
 834         struct hugepage_info *hi;
 835         struct rte_memseg *ms;
 836 };
 837 static int
 838 free_seg_walk(const struct rte_memseg_list *msl, void *arg)
 839 {
 840         struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
 841         struct rte_memseg_list *found_msl;
 842         struct free_walk_param *wa = arg;
 843         uintptr_t start_addr, end_addr;
 844         int msl_idx, seg_idx, ret, dir_fd = -1;
 845
 846         start_addr = (uintptr_t) msl->base_va;
 847         end_addr = start_addr + msl->memseg_arr.len * (size_t)msl->page_sz;
 848
 849         if ((uintptr_t)wa->ms->addr < start_addr ||
 850                         (uintptr_t)wa->ms->addr >= end_addr)
 851                 return 0;
 852
 853         msl_idx = msl - mcfg->memsegs;
 854         seg_idx = RTE_PTR_DIFF(wa->ms->addr, start_addr) / msl->page_sz;
 855
 856         /* msl is const */
 857         found_msl = &mcfg->memsegs[msl_idx];
 858
 859         /* do not allow any page allocations during the time we're freeing,
 860          * because file creation and locking operations are not atomic,
 861          * and we might be the first or the last ones to use a particular page,
 862          * so we need to ensure atomicity of every operation.
 863          *
 864          * during init, we already hold a write lock, so don't try to take out
 865          * another one.
 866          */
 867         if (wa->hi->lock_descriptor == -1 && !internal_config.in_memory) {
 868                 dir_fd = open(wa->hi->hugedir, O_RDONLY);
 869                 if (dir_fd < 0) {
 870                         RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
 871                                 __func__, wa->hi->hugedir, strerror(errno));
 872                         return -1;
 873                 }
 874                 /* blocking writelock */
 875                 if (flock(dir_fd, LOCK_EX)) {
 876                         RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n",
 877                                 __func__, wa->hi->hugedir, strerror(errno));
 878                         close(dir_fd);
 879                         return -1;
 880                 }
 881         }
 882
 883         found_msl->version++;
 884
 885         rte_fbarray_set_free(&found_msl->memseg_arr, seg_idx);
 886
 887         ret = free_seg(wa->ms, wa->hi, msl_idx, seg_idx);
 888
 889         if (dir_fd >= 0)
 890                 close(dir_fd);
 891
 892         if (ret < 0)
 893                 return -1;
 894
 895         return 1;
 896 }
 897
 898 int
 899 eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz,
 900                 int socket, bool exact)
 901 {
 902         int i, ret = -1;
 903 #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
 904         bool have_numa = false;
 905         int oldpolicy;
 906         struct bitmask *oldmask;
 907 #endif
 908         struct alloc_walk_param wa;
 909         struct hugepage_info *hi = NULL;
 910
 911         memset(&wa, 0, sizeof(wa));
 912
 913         /* dynamic allocation not supported in legacy mode */
 914         if (internal_config.legacy_mem)
 915                 return -1;
 916
 917         for (i = 0; i < (int) RTE_DIM(internal_config.hugepage_info); i++) {
 918                 if (page_sz ==
 919                                 internal_config.hugepage_info[i].hugepage_sz) {
 920                         hi = &internal_config.hugepage_info[i];
 921                         break;
 922                 }
 923         }
 924         if (!hi) {
 925                 RTE_LOG(ERR, EAL, "%s(): can't find relevant hugepage_info entry\n",
 926                         __func__);
 927                 return -1;
 928         }
 929
 930 #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
 931         if (check_numa()) {
 932                 oldmask = numa_allocate_nodemask();
 933                 prepare_numa(&oldpolicy, oldmask, socket);
 934                 have_numa = true;
 935         }
 936 #endif
 937
 938         wa.exact = exact;
 939         wa.hi = hi;
 940         wa.ms = ms;
 941         wa.n_segs = n_segs;
 942         wa.page_sz = page_sz;
 943         wa.socket = socket;
 944         wa.segs_allocated = 0;
 945
 946         /* memalloc is locked, so it's safe to use thread-unsafe version */
 947         ret = rte_memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa);
 948         if (ret == 0) {
 949                 RTE_LOG(ERR, EAL, "%s(): couldn't find suitable memseg_list\n",
 950                         __func__);
 951                 ret = -1;
 952         } else if (ret > 0) {
 953                 ret = (int)wa.segs_allocated;
 954         }
 955
 956 #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
 957         if (have_numa)
 958                 restore_numa(&oldpolicy, oldmask);
 959 #endif
 960         return ret;
 961 }
 962
 963 struct rte_memseg *
 964 eal_memalloc_alloc_seg(size_t page_sz, int socket)
 965 {
 966         struct rte_memseg *ms;
 967         if (eal_memalloc_alloc_seg_bulk(&ms, 1, page_sz, socket, true) < 0)
 968                 return NULL;
 969         /* return pointer to newly allocated memseg */
 970         return ms;
 971 }
 972
 973 int
 974 eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs)
 975 {
 976         int seg, ret = 0;
 977
 978         /* dynamic free not supported in legacy mode */
 979         if (internal_config.legacy_mem)
 980                 return -1;
 981
 982         for (seg = 0; seg < n_segs; seg++) {
 983                 struct rte_memseg *cur = ms[seg];
 984                 struct hugepage_info *hi = NULL;
 985                 struct free_walk_param wa;
 986                 int i, walk_res;
 987
 988                 /* if this page is marked as unfreeable, fail */
 989                 if (cur->flags & RTE_MEMSEG_FLAG_DO_NOT_FREE) {
 990                         RTE_LOG(DEBUG, EAL, "Page is not allowed to be freed\n");
 991                         ret = -1;
 992                         continue;
 993                 }
 994
 995                 memset(&wa, 0, sizeof(wa));
 996
 997                 for (i = 0; i < (int)RTE_DIM(internal_config.hugepage_info);
 998                                 i++) {
 999                         hi = &internal_config.hugepage_info[i];
1000                         if (cur->hugepage_sz == hi->hugepage_sz)
1001                                 break;
1002                 }
1003                 if (i == (int)RTE_DIM(internal_config.hugepage_info)) {
1004                         RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
1005                         ret = -1;
1006                         continue;
1007                 }
1008
1009                 wa.ms = cur;
1010                 wa.hi = hi;
1011
1012                 /* memalloc is locked, so it's safe to use thread-unsafe version
1013                  */
1014                 walk_res = rte_memseg_list_walk_thread_unsafe(free_seg_walk,
1015                                 &wa);
1016                 if (walk_res == 1)
1017                         continue;
1018                 if (walk_res == 0)
1019                         RTE_LOG(ERR, EAL, "Couldn't find memseg list\n");
1020                 ret = -1;
1021         }
1022         return ret;
1023 }
1024
1025 int
1026 eal_memalloc_free_seg(struct rte_memseg *ms)
1027 {
1028         /* dynamic free not supported in legacy mode */
1029         if (internal_config.legacy_mem)
1030                 return -1;
1031
1032         return eal_memalloc_free_seg_bulk(&ms, 1);
1033 }
1034
1035 static int
1036 sync_chunk(struct rte_memseg_list *primary_msl,
1037                 struct rte_memseg_list *local_msl, struct hugepage_info *hi,
1038                 unsigned int msl_idx, bool used, int start, int end)
1039 {
1040         struct rte_fbarray *l_arr, *p_arr;
1041         int i, ret, chunk_len, diff_len;
1042
1043         l_arr = &local_msl->memseg_arr;
1044         p_arr = &primary_msl->memseg_arr;
1045
1046         /* we need to aggregate allocations/deallocations into bigger chunks,
1047          * as we don't want to spam the user with per-page callbacks.
1048          *
1049          * to avoid any potential issues, we also want to trigger
1050          * deallocation callbacks *before* we actually deallocate
1051          * memory, so that the user application could wrap up its use
1052          * before it goes away.
1053          */
1054
1055         chunk_len = end - start;
1056
1057         /* find how many contiguous pages we can map/unmap for this chunk */
1058         diff_len = used ?
1059                         rte_fbarray_find_contig_free(l_arr, start) :
1060                         rte_fbarray_find_contig_used(l_arr, start);
1061
1062         /* has to be at least one page */
1063         if (diff_len < 1)
1064                 return -1;
1065
1066         diff_len = RTE_MIN(chunk_len, diff_len);
1067
1068         /* if we are freeing memory, notify the application */
1069         if (!used) {
1070                 struct rte_memseg *ms;
1071                 void *start_va;
1072                 size_t len, page_sz;
1073
1074                 ms = rte_fbarray_get(l_arr, start);
1075                 start_va = ms->addr;
1076                 page_sz = (size_t)primary_msl->page_sz;
1077                 len = page_sz * diff_len;
1078
1079                 eal_memalloc_mem_event_notify(RTE_MEM_EVENT_FREE,
1080                                 start_va, len);
1081         }
1082
1083         for (i = 0; i < diff_len; i++) {
1084                 struct rte_memseg *p_ms, *l_ms;
1085                 int seg_idx = start + i;
1086
1087                 l_ms = rte_fbarray_get(l_arr, seg_idx);
1088                 p_ms = rte_fbarray_get(p_arr, seg_idx);
1089
1090                 if (l_ms == NULL || p_ms == NULL)
1091                         return -1;
1092
1093                 if (used) {
1094                         ret = alloc_seg(l_ms, p_ms->addr,
1095                                         p_ms->socket_id, hi,
1096                                         msl_idx, seg_idx);
1097                         if (ret < 0)
1098                                 return -1;
1099                         rte_fbarray_set_used(l_arr, seg_idx);
1100                 } else {
1101                         ret = free_seg(l_ms, hi, msl_idx, seg_idx);
1102                         rte_fbarray_set_free(l_arr, seg_idx);
1103                         if (ret < 0)
1104                                 return -1;
1105                 }
1106         }
1107
1108         /* if we just allocated memory, notify the application */
1109         if (used) {
1110                 struct rte_memseg *ms;
1111                 void *start_va;
1112                 size_t len, page_sz;
1113
1114                 ms = rte_fbarray_get(l_arr, start);
1115                 start_va = ms->addr;
1116                 page_sz = (size_t)primary_msl->page_sz;
1117                 len = page_sz * diff_len;
1118
1119                 eal_memalloc_mem_event_notify(RTE_MEM_EVENT_ALLOC,
1120                                 start_va, len);
1121         }
1122
1123         /* calculate how much we can advance until next chunk */
1124         diff_len = used ?
1125                         rte_fbarray_find_contig_used(l_arr, start) :
1126                         rte_fbarray_find_contig_free(l_arr, start);
1127         ret = RTE_MIN(chunk_len, diff_len);
1128
1129         return ret;
1130 }
1131
1132 static int
1133 sync_status(struct rte_memseg_list *primary_msl,
1134                 struct rte_memseg_list *local_msl, struct hugepage_info *hi,
1135                 unsigned int msl_idx, bool used)
1136 {
1137         struct rte_fbarray *l_arr, *p_arr;
1138         int p_idx, l_chunk_len, p_chunk_len, ret;
1139         int start, end;
1140
1141         /* this is a little bit tricky, but the basic idea is - walk both lists
1142          * and spot any places where there are discrepancies. walking both lists
1143          * and noting discrepancies in a single go is a hard problem, so we do
1144          * it in two passes - first we spot any places where allocated segments
1145          * mismatch (i.e. ensure that everything that's allocated in the primary
1146          * is also allocated in the secondary), and then we do it by looking at
1147          * free segments instead.
1148          *
1149          * we also need to aggregate changes into chunks, as we have to call
1150          * callbacks per allocation, not per page.
1151          */
1152         l_arr = &local_msl->memseg_arr;
1153         p_arr = &primary_msl->memseg_arr;
1154
1155         if (used)
1156                 p_idx = rte_fbarray_find_next_used(p_arr, 0);
1157         else
1158                 p_idx = rte_fbarray_find_next_free(p_arr, 0);
1159
1160         while (p_idx >= 0) {
1161                 int next_chunk_search_idx;
1162
1163                 if (used) {
1164                         p_chunk_len = rte_fbarray_find_contig_used(p_arr,
1165                                         p_idx);
1166                         l_chunk_len = rte_fbarray_find_contig_used(l_arr,
1167                                         p_idx);
1168                 } else {
1169                         p_chunk_len = rte_fbarray_find_contig_free(p_arr,
1170                                         p_idx);
1171                         l_chunk_len = rte_fbarray_find_contig_free(l_arr,
1172                                         p_idx);
1173                 }
1174                 /* best case scenario - no differences (or bigger, which will be
1175                  * fixed during next iteration), look for next chunk
1176                  */
1177                 if (l_chunk_len >= p_chunk_len) {
1178                         next_chunk_search_idx = p_idx + p_chunk_len;
1179                         goto next_chunk;
1180                 }
1181
1182                 /* if both chunks start at the same point, skip parts we know
1183                  * are identical, and sync the rest. each call to sync_chunk
1184                  * will only sync contiguous segments, so we need to call this
1185                  * until we are sure there are no more differences in this
1186                  * chunk.
1187                  */
1188                 start = p_idx + l_chunk_len;
1189                 end = p_idx + p_chunk_len;
1190                 do {
1191                         ret = sync_chunk(primary_msl, local_msl, hi, msl_idx,
1192                                         used, start, end);
1193                         start += ret;
1194                 } while (start < end && ret >= 0);
1195                 /* if ret is negative, something went wrong */
1196                 if (ret < 0)
1197                         return -1;
1198
1199                 next_chunk_search_idx = p_idx + p_chunk_len;
1200 next_chunk:
1201                 /* skip to end of this chunk */
1202                 if (used) {
1203                         p_idx = rte_fbarray_find_next_used(p_arr,
1204                                         next_chunk_search_idx);
1205                 } else {
1206                         p_idx = rte_fbarray_find_next_free(p_arr,
1207                                         next_chunk_search_idx);
1208                 }
1209         }
1210         return 0;
1211 }
1212
1213 static int
1214 sync_existing(struct rte_memseg_list *primary_msl,
1215                 struct rte_memseg_list *local_msl, struct hugepage_info *hi,
1216                 unsigned int msl_idx)
1217 {
1218         int ret, dir_fd;
1219
1220         /* do not allow any page allocations during the time we're allocating,
1221          * because file creation and locking operations are not atomic,
1222          * and we might be the first or the last ones to use a particular page,
1223          * so we need to ensure atomicity of every operation.
1224          */
1225         dir_fd = open(hi->hugedir, O_RDONLY);
1226         if (dir_fd < 0) {
1227                 RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n", __func__,
1228                         hi->hugedir, strerror(errno));
1229                 return -1;
1230         }
1231         /* blocking writelock */
1232         if (flock(dir_fd, LOCK_EX)) {
1233                 RTE_LOG(ERR, EAL, "%s(): Cannot lock '%s': %s\n", __func__,
1234                         hi->hugedir, strerror(errno));
1235                 close(dir_fd);
1236                 return -1;
1237         }
1238
1239         /* ensure all allocated space is the same in both lists */
1240         ret = sync_status(primary_msl, local_msl, hi, msl_idx, true);
1241         if (ret < 0)
1242                 goto fail;
1243
1244         /* ensure all unallocated space is the same in both lists */
1245         ret = sync_status(primary_msl, local_msl, hi, msl_idx, false);
1246         if (ret < 0)
1247                 goto fail;
1248
1249         /* update version number */
1250         local_msl->version = primary_msl->version;
1251
1252         close(dir_fd);
1253
1254         return 0;
1255 fail:
1256         close(dir_fd);
1257         return -1;
1258 }
1259
1260 static int
1261 sync_walk(const struct rte_memseg_list *msl, void *arg __rte_unused)
1262 {
1263         struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1264         struct rte_memseg_list *primary_msl, *local_msl;
1265         struct hugepage_info *hi = NULL;
1266         unsigned int i;
1267         int msl_idx;
1268
1269         msl_idx = msl - mcfg->memsegs;
1270         primary_msl = &mcfg->memsegs[msl_idx];
1271         local_msl = &local_memsegs[msl_idx];
1272
1273         for (i = 0; i < RTE_DIM(internal_config.hugepage_info); i++) {
1274                 uint64_t cur_sz =
1275                         internal_config.hugepage_info[i].hugepage_sz;
1276                 uint64_t msl_sz = primary_msl->page_sz;
1277                 if (msl_sz == cur_sz) {
1278                         hi = &internal_config.hugepage_info[i];
1279                         break;
1280                 }
1281         }
1282         if (!hi) {
1283                 RTE_LOG(ERR, EAL, "Can't find relevant hugepage_info entry\n");
1284                 return -1;
1285         }
1286
1287         /* if versions don't match, synchronize everything */
1288         if (local_msl->version != primary_msl->version &&
1289                         sync_existing(primary_msl, local_msl, hi, msl_idx))
1290                 return -1;
1291         return 0;
1292 }
1293
1294
1295 int
1296 eal_memalloc_sync_with_primary(void)
1297 {
1298         /* nothing to be done in primary */
1299         if (rte_eal_process_type() == RTE_PROC_PRIMARY)
1300                 return 0;
1301
1302         /* memalloc is locked, so it's safe to call thread-unsafe version */
1303         if (rte_memseg_list_walk_thread_unsafe(sync_walk, NULL))
1304                 return -1;
1305         return 0;
1306 }
1307
1308 static int
1309 secondary_msl_create_walk(const struct rte_memseg_list *msl,
1310                 void *arg __rte_unused)
1311 {
1312         struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1313         struct rte_memseg_list *primary_msl, *local_msl;
1314         char name[PATH_MAX];
1315         int msl_idx, ret;
1316
1317         msl_idx = msl - mcfg->memsegs;
1318         primary_msl = &mcfg->memsegs[msl_idx];
1319         local_msl = &local_memsegs[msl_idx];
1320
1321         /* create distinct fbarrays for each secondary */
1322         snprintf(name, RTE_FBARRAY_NAME_LEN, "%s_%i",
1323                 primary_msl->memseg_arr.name, getpid());
1324
1325         ret = rte_fbarray_init(&local_msl->memseg_arr, name,
1326                 primary_msl->memseg_arr.len,
1327                 primary_msl->memseg_arr.elt_sz);
1328         if (ret < 0) {
1329                 RTE_LOG(ERR, EAL, "Cannot initialize local memory map\n");
1330                 return -1;
1331         }
1332         local_msl->base_va = primary_msl->base_va;
1333
1334         return 0;
1335 }
1336
1337 static int
1338 alloc_list(int list_idx, int len)
1339 {
1340         int *data;
1341         int i;
1342
1343         /* ensure we have space to store fd per each possible segment */
1344         data = malloc(sizeof(int) * len);
1345         if (data == NULL) {
1346                 RTE_LOG(ERR, EAL, "Unable to allocate space for file descriptors\n");
1347                 return -1;
1348         }
1349         /* set all fd's as invalid */
1350         for (i = 0; i < len; i++)
1351                 data[i] = -1;
1352
1353         fd_list[list_idx].fds = data;
1354         fd_list[list_idx].len = len;
1355         fd_list[list_idx].count = 0;
1356         fd_list[list_idx].memseg_list_fd = -1;
1357
1358         return 0;
1359 }
1360
1361 static int
1362 fd_list_create_walk(const struct rte_memseg_list *msl,
1363                 void *arg __rte_unused)
1364 {
1365         struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1366         unsigned int len;
1367         int msl_idx;
1368
1369         msl_idx = msl - mcfg->memsegs;
1370         len = msl->memseg_arr.len;
1371
1372         return alloc_list(msl_idx, len);
1373 }
1374
1375 int
1376 eal_memalloc_set_seg_fd(int list_idx, int seg_idx, int fd)
1377 {
1378         struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1379
1380         /* if list is not allocated, allocate it */
1381         if (fd_list[list_idx].len == 0) {
1382                 int len = mcfg->memsegs[list_idx].memseg_arr.len;
1383
1384                 if (alloc_list(list_idx, len) < 0)
1385                         return -ENOMEM;
1386         }
1387         fd_list[list_idx].fds[seg_idx] = fd;
1388
1389         return 0;
1390 }
1391
1392 int
1393 eal_memalloc_get_seg_fd(int list_idx, int seg_idx)
1394 {
1395         int fd;
1396         if (internal_config.single_file_segments) {
1397                 fd = fd_list[list_idx].memseg_list_fd;
1398         } else if (fd_list[list_idx].len == 0) {
1399                 /* list not initialized */
1400                 fd = -1;
1401         } else {
1402                 fd = fd_list[list_idx].fds[seg_idx];
1403         }
1404         if (fd < 0)
1405                 return -ENODEV;
1406         return fd;
1407 }
1408
1409 int
1410 eal_memalloc_init(void)
1411 {
1412         if (rte_eal_process_type() == RTE_PROC_SECONDARY)
1413                 if (rte_memseg_list_walk(secondary_msl_create_walk, NULL) < 0)
1414                         return -1;
1415
1416         /* initialize all of the fd lists */
1417         if (rte_memseg_list_walk(fd_list_create_walk, NULL))
1418                 return -1;
1419         return 0;
1420 }