tailq: remove unneeded inclusions
dpdk.git: lib/librte_eal/linuxapp/eal/eal_memory.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 /*   BSD LICENSE
34  *
35  *   Copyright(c) 2013 6WIND.
36  *
37  *   Redistribution and use in source and binary forms, with or without
38  *   modification, are permitted provided that the following conditions
39  *   are met:
40  *
41  *     * Redistributions of source code must retain the above copyright
42  *       notice, this list of conditions and the following disclaimer.
43  *     * Redistributions in binary form must reproduce the above copyright
44  *       notice, this list of conditions and the following disclaimer in
45  *       the documentation and/or other materials provided with the
46  *       distribution.
47  *     * Neither the name of 6WIND S.A. nor the names of its
48  *       contributors may be used to endorse or promote products derived
49  *       from this software without specific prior written permission.
50  *
51  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
52  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
53  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
54  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
55  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
56  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
57  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
58  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
59  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
60  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
61  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62  */
63
64 #define _FILE_OFFSET_BITS 64
65 #include <errno.h>
66 #include <stdarg.h>
67 #include <stdlib.h>
68 #include <stdio.h>
69 #include <stdint.h>
70 #include <inttypes.h>
71 #include <string.h>
72 #include <stdarg.h>
73 #include <sys/mman.h>
74 #include <sys/types.h>
75 #include <sys/stat.h>
76 #include <sys/queue.h>
77 #include <sys/file.h>
78 #include <unistd.h>
79 #include <limits.h>
80 #include <errno.h>
81 #include <sys/ioctl.h>
82 #include <sys/time.h>
83
84 #include <rte_log.h>
85 #include <rte_memory.h>
86 #include <rte_memzone.h>
87 #include <rte_launch.h>
88 #include <rte_eal.h>
89 #include <rte_eal_memconfig.h>
90 #include <rte_per_lcore.h>
91 #include <rte_lcore.h>
92 #include <rte_common.h>
93 #include <rte_string_fns.h>
94
95 #include "eal_private.h"
96 #include "eal_internal_cfg.h"
97 #include "eal_filesystem.h"
98 #include "eal_hugepages.h"
99
100 /**
101  * @file
102  * Huge page mapping under Linux
103  *
104  * To reserve a large, contiguous amount of memory, we use the hugepage
105  * feature of Linux. For that, we need to have hugetlbfs mounted. This
106  * code will create many files in that directory (one per page) and
107  * map them into virtual memory. For each page, we will retrieve its
108  * physical address and remap it in order to have a virtually contiguous
109  * zone as well as a physically contiguous zone.
110  */
111
112 static uint64_t baseaddr_offset;
113
114 #define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space"
115
116 /* Lock a page in physical memory and prevent it from being swapped. */
117 int
118 rte_mem_lock_page(const void *virt)
119 {
120         unsigned long virtual = (unsigned long)virt;
121         int page_size = getpagesize();
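            /* align the address down to the start of its page before locking */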
122         unsigned long aligned = (virtual & ~ (page_size - 1));
123         return mlock((void*)aligned, page_size);
124 }
125
126 /*
127  * Get physical address of any mapped virtual address in the current process.
128  */
129 phys_addr_t
130 rte_mem_virt2phy(const void *virtaddr)
131 {
132         int fd;
133         uint64_t page, physaddr;
134         unsigned long virt_pfn;
135         int page_size;
136         off_t offset;
137
138         /* standard page size */
139         page_size = getpagesize();
140
141         fd = open("/proc/self/pagemap", O_RDONLY);
142         if (fd < 0) {
143                 RTE_LOG(ERR, EAL, "%s(): cannot open /proc/self/pagemap: %s\n",
144                         __func__, strerror(errno));
145                 return RTE_BAD_PHYS_ADDR;
146         }
147
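            /* /proc/self/pagemap is an array of 64-bit entries, one per virtual
             * page; seek to the entry for the page containing virtaddr */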
148         virt_pfn = (unsigned long)virtaddr / page_size;
149         offset = sizeof(uint64_t) * virt_pfn;
150         if (lseek(fd, offset, SEEK_SET) == (off_t) -1) {
151                 RTE_LOG(ERR, EAL, "%s(): seek error in /proc/self/pagemap: %s\n",
152                                 __func__, strerror(errno));
153                 close(fd);
154                 return RTE_BAD_PHYS_ADDR;
155         }
156         if (read(fd, &page, sizeof(uint64_t)) != (ssize_t)sizeof(uint64_t)) {
157                 RTE_LOG(ERR, EAL, "%s(): cannot read /proc/self/pagemap: %s\n",
158                                 __func__, strerror(errno));
159                 close(fd);
160                 return RTE_BAD_PHYS_ADDR;
161         }
162
163         /*
164          * the pfn (page frame number) is in bits 0-54 (see
165          * pagemap.txt in the Linux Documentation)
166          */
167         physaddr = ((page & 0x7fffffffffffffULL) * page_size)
168                 + ((unsigned long)virtaddr % page_size);
169         close(fd);
170         return physaddr;
171 }
172
173 /*
174  * For each hugepage in hugepg_tbl, fill the physaddr value. We find
175  * it by browsing the /proc/self/pagemap special file.
176  */
177 static int
178 find_physaddrs(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
179 {
180         unsigned i;
181         phys_addr_t addr;
182
183         for (i = 0; i < hpi->num_pages[0]; i++) {
184                 addr = rte_mem_virt2phy(hugepg_tbl[i].orig_va);
185                 if (addr == RTE_BAD_PHYS_ADDR)
186                         return -1;
187                 hugepg_tbl[i].physaddr = addr;
188         }
189         return 0;
190 }
191
192 /*
193  * Check whether address-space layout randomization is enabled in
194  * the kernel. This is important for multi-process, as it can prevent
195  * two processes from mapping data to the same virtual address.
196  * Returns:
197  *    0 - address space randomization disabled
198  *    1/2 - address space randomization enabled
199  *    negative error code on error
200  */
201 static int
202 aslr_enabled(void)
203 {
204         char c;
205         int retval, fd = open(RANDOMIZE_VA_SPACE_FILE, O_RDONLY);
206         if (fd < 0)
207                 return -errno;
208         retval = read(fd, &c, 1);
209         close(fd);
210         if (retval < 0)
211                 return -errno;
212         if (retval == 0)
213                 return -EIO;
214         switch (c) {
215                 case '0' : return 0;
216                 case '1' : return 1;
217                 case '2' : return 2;
218                 default: return -EINVAL;
219         }
220 }
221
222 /*
223  * Try to mmap *size bytes in /dev/zero. If it is successful, return the
224  * pointer to the mmap'd area and keep *size unmodified. Else, retry
225  * with a smaller zone: decrease *size by hugepage_sz until it reaches
226  * 0. In this case, return NULL. Note: this function returns an address
227  * which is a multiple of hugepage size.
228  */
229 static void *
230 get_virtual_area(size_t *size, size_t hugepage_sz)
231 {
232         void *addr;
233         int fd;
234         long aligned_addr;
235
236         if (internal_config.base_virtaddr != 0) {
237                 addr = (void*) (uintptr_t) (internal_config.base_virtaddr +
238                                 baseaddr_offset);
239         }
240         else addr = NULL;
241
242         RTE_LOG(INFO, EAL, "Asking for a virtual area of 0x%zx bytes\n", *size);
243
244         fd = open("/dev/zero", O_RDONLY);
245         if (fd < 0){
246                 RTE_LOG(ERR, EAL, "Cannot open /dev/zero\n");
247                 return NULL;
248         }
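            /* map hugepage_sz extra bytes so that, after aligning the start up to
             * a hugepage boundary, the reservation still covers *size bytes */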
249         do {
250                 addr = mmap(addr,
251                                 (*size) + hugepage_sz, PROT_READ, MAP_PRIVATE, fd, 0);
252                 if (addr == MAP_FAILED)
253                         *size -= hugepage_sz;
254         } while (addr == MAP_FAILED && *size > 0);
255
256         if (addr == MAP_FAILED) {
257                 close(fd);
258                 RTE_LOG(INFO, EAL, "Cannot get a virtual area\n");
259                 return NULL;
260         }
261
262         munmap(addr, (*size) + hugepage_sz);
263         close(fd);
264
265         /* align addr to a huge page size boundary */
266         aligned_addr = (long)addr;
267         aligned_addr += (hugepage_sz - 1);
268         aligned_addr &= (~(hugepage_sz - 1));
269         addr = (void *)(aligned_addr);
270
271         RTE_LOG(INFO, EAL, "Virtual area found at %p (size = 0x%zx)\n",
272                 addr, *size);
273
274         /* increment offset */
275         baseaddr_offset += *size;
276
277         return addr;
278 }
279
280 /*
281  * Mmap all hugepages of the hugepage table: it first opens a file in
282  * hugetlbfs, then mmap()s hugepage_sz bytes of it. If orig is set, the
283  * virtual address is stored in hugepg_tbl[i].orig_va, else it is stored
284  * in hugepg_tbl[i].final_va. The second mapping (when orig is 0) tries to
285  * map contiguous physical blocks in contiguous virtual blocks.
286  */
287 static int
288 map_all_hugepages(struct hugepage_file *hugepg_tbl,
289                 struct hugepage_info *hpi, int orig)
290 {
291         int fd;
292         unsigned i;
293         void *virtaddr;
294         void *vma_addr = NULL;
295         size_t vma_len = 0;
296
297 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
298         RTE_SET_USED(vma_len);
299 #endif
300
301         for (i = 0; i < hpi->num_pages[0]; i++) {
302                 uint64_t hugepage_sz = hpi->hugepage_sz;
303
304                 if (orig) {
305                         hugepg_tbl[i].file_id = i;
306                         hugepg_tbl[i].size = hugepage_sz;
307 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
308                         eal_get_hugefile_temp_path(hugepg_tbl[i].filepath,
309                                         sizeof(hugepg_tbl[i].filepath), hpi->hugedir,
310                                         hugepg_tbl[i].file_id);
311 #else
312                         eal_get_hugefile_path(hugepg_tbl[i].filepath,
313                                         sizeof(hugepg_tbl[i].filepath), hpi->hugedir,
314                                         hugepg_tbl[i].file_id);
315 #endif
316                         hugepg_tbl[i].filepath[sizeof(hugepg_tbl[i].filepath) - 1] = '\0';
317                 }
318 #ifndef RTE_ARCH_64
319                 /* for 32-bit systems, don't remap 1G and 16G pages, just reuse
320                  * original map address as final map address.
321                  */
322                 else if ((hugepage_sz == RTE_PGSIZE_1G)
323                         || (hugepage_sz == RTE_PGSIZE_16G)) {
324                         hugepg_tbl[i].final_va = hugepg_tbl[i].orig_va;
325                         hugepg_tbl[i].orig_va = NULL;
326                         continue;
327                 }
328 #endif
329
330 #ifndef RTE_EAL_SINGLE_FILE_SEGMENTS
331                 else if (vma_len == 0) {
332                         unsigned j, num_pages;
333
334                         /* reserve a virtual area for next contiguous
335                          * physical block: count the number of
336                          * contiguous physical pages. */
337                         for (j = i+1; j < hpi->num_pages[0] ; j++) {
338 #ifdef RTE_ARCH_PPC_64
339                                 /* The physical addresses are sorted in
340                                  * descending order on PPC64 */
341                                 if (hugepg_tbl[j].physaddr !=
342                                     hugepg_tbl[j-1].physaddr - hugepage_sz)
343                                         break;
344 #else
345                                 if (hugepg_tbl[j].physaddr !=
346                                     hugepg_tbl[j-1].physaddr + hugepage_sz)
347                                         break;
348 #endif
349                         }
350                         num_pages = j - i;
351                         vma_len = num_pages * hugepage_sz;
352
353                         /* get the biggest virtual memory area up to
354                          * vma_len. If it fails, vma_addr is NULL, so
355                          * let the kernel provide the address. */
356                         vma_addr = get_virtual_area(&vma_len, hpi->hugepage_sz);
357                         if (vma_addr == NULL)
358                                 vma_len = hugepage_sz;
359                 }
360 #endif
361
362                 /* try to create hugepage file */
363                 fd = open(hugepg_tbl[i].filepath, O_CREAT | O_RDWR, 0755);
364                 if (fd < 0) {
365                         RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
366                                         strerror(errno));
367                         return -1;
368                 }
369
370                 virtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE,
371                                 MAP_SHARED, fd, 0);
372                 if (virtaddr == MAP_FAILED) {
373                         RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
374                                         strerror(errno));
375                         close(fd);
376                         return -1;
377                 }
378
379                 if (orig) {
380                         hugepg_tbl[i].orig_va = virtaddr;
381                         memset(virtaddr, 0, hugepage_sz);
382                 }
383                 else {
384                         hugepg_tbl[i].final_va = virtaddr;
385                 }
386
387                 /* set shared flock on the file. */
388                 if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
389                         RTE_LOG(ERR, EAL, "%s(): Locking file failed: %s\n",
390                                 __func__, strerror(errno));
391                         close(fd);
392                         return -1;
393                 }
394
395                 close(fd);
396
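                    /* advance within the reserved virtual area so that the next
                     * page of this contiguous block maps right after this one */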
397                 vma_addr = (char *)vma_addr + hugepage_sz;
398                 vma_len -= hugepage_sz;
399         }
400         return 0;
401 }
402
403 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
404
405 /*
406  * Remaps all hugepages into single file segments
407  */
408 static int
409 remap_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
410 {
411         int fd;
412         unsigned i = 0, j, num_pages, page_idx = 0;
413         void *vma_addr = NULL, *old_addr = NULL, *page_addr = NULL;
414         size_t vma_len = 0;
415         size_t hugepage_sz = hpi->hugepage_sz;
416         size_t total_size, offset;
417         char filepath[MAX_HUGEPAGE_PATH];
418         phys_addr_t physaddr;
419         int socket;
420
421         while (i < hpi->num_pages[0]) {
422
423 #ifndef RTE_ARCH_64
424                 /* for 32-bit systems, don't remap 1G pages and 16G pages,
425                  * just reuse original map address as final map address.
426                  */
427                 if ((hugepage_sz == RTE_PGSIZE_1G)
428                         || (hugepage_sz == RTE_PGSIZE_16G)) {
429                         hugepg_tbl[i].final_va = hugepg_tbl[i].orig_va;
430                         hugepg_tbl[i].orig_va = NULL;
431                         i++;
432                         continue;
433                 }
434 #endif
435
436                 /* reserve a virtual area for next contiguous
437                  * physical block: count the number of
438                  * contiguous physical pages. */
439                 for (j = i+1; j < hpi->num_pages[0] ; j++) {
440 #ifdef RTE_ARCH_PPC_64
441                         /* The physical addresses are sorted in descending
442                          * order on PPC64 */
443                         if (hugepg_tbl[j].physaddr !=
444                                 hugepg_tbl[j-1].physaddr - hugepage_sz)
445                                 break;
446 #else
447                         if (hugepg_tbl[j].physaddr !=
448                                 hugepg_tbl[j-1].physaddr + hugepage_sz)
449                                 break;
450 #endif
451                 }
452                 num_pages = j - i;
453                 vma_len = num_pages * hugepage_sz;
454
455                 socket = hugepg_tbl[i].socket_id;
456
457                 /* get the biggest virtual memory area up to
458                  * vma_len. If it fails, vma_addr is NULL, so
459                  * let the kernel provide the address. */
460                 vma_addr = get_virtual_area(&vma_len, hpi->hugepage_sz);
461
462                 /* If we can't find a big enough virtual area, work out how many pages
463                  * we are going to get */
464                 if (vma_addr == NULL)
465                         j = i + 1;
466                 else if (vma_len != num_pages * hugepage_sz) {
467                         num_pages = vma_len / hugepage_sz;
468                         j = i + num_pages;
469
470                 }
471
472                 hugepg_tbl[page_idx].file_id = page_idx;
473                 eal_get_hugefile_path(filepath,
474                                 sizeof(filepath),
475                                 hpi->hugedir,
476                                 hugepg_tbl[page_idx].file_id);
477
478                 /* try to create hugepage file */
479                 fd = open(filepath, O_CREAT | O_RDWR, 0755);
480                 if (fd < 0) {
481                         RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__, strerror(errno));
482                         return -1;
483                 }
484
485                 total_size = 0;
486                 for (;i < j; i++) {
487
488                         /* unmap current segment */
489                         if (total_size > 0)
490                                 munmap(vma_addr, total_size);
491
492                         /* unmap original page */
493                         munmap(hugepg_tbl[i].orig_va, hugepage_sz);
494                         unlink(hugepg_tbl[i].filepath);
495
496                         total_size += hugepage_sz;
497
498                         old_addr = vma_addr;
499
500                         /* map new, bigger segment */
501                         vma_addr = mmap(vma_addr, total_size,
502                                         PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
503
504                         if (vma_addr == MAP_FAILED || vma_addr != old_addr) {
505                                 RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__, strerror(errno));
506                                 close(fd);
507                                 return -1;
508                         }
509
510                         /* Touch the page. This is needed because the kernel postpones mapping
511                          * creation until the first page fault. With this, we pin down the
512                          * page so it is marked as used and gets into the process' pagemap.
513                          */
514                         for (offset = 0; offset < total_size; offset += hugepage_sz)
515                                 *((volatile uint8_t*) RTE_PTR_ADD(vma_addr, offset));
516                 }
517
518                 /* set shared flock on the file. */
519                 if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
520                         RTE_LOG(ERR, EAL, "%s(): Locking file failed: %s\n",
521                                 __func__, strerror(errno));
522                         close(fd);
523                         return -1;
524                 }
525
526                 snprintf(hugepg_tbl[page_idx].filepath, MAX_HUGEPAGE_PATH, "%s",
527                                 filepath);
528
529                 physaddr = rte_mem_virt2phy(vma_addr);
530
531                 if (physaddr == RTE_BAD_PHYS_ADDR)
532                         return -1;
533
534                 hugepg_tbl[page_idx].final_va = vma_addr;
535
536                 hugepg_tbl[page_idx].physaddr = physaddr;
537
538                 hugepg_tbl[page_idx].repeated = num_pages;
539
540                 hugepg_tbl[page_idx].socket_id = socket;
541
542                 close(fd);
543
544                 /* verify the memory segment - that is, check that every VA corresponds
545                  * to the physical address we expect to see
546                  */
547                 for (offset = 0; offset < vma_len; offset += hugepage_sz) {
548                         uint64_t expected_physaddr;
549
550                         expected_physaddr = hugepg_tbl[page_idx].physaddr + offset;
551                         page_addr = RTE_PTR_ADD(vma_addr, offset);
552                         physaddr = rte_mem_virt2phy(page_addr);
553
554                         if (physaddr != expected_physaddr) {
555                                 RTE_LOG(ERR, EAL, "Segment sanity check failed: wrong physaddr "
556                                                 "at %p (offset 0x%" PRIx64 ": 0x%" PRIx64
557                                                 " (expected 0x%" PRIx64 ")\n",
558                                                 page_addr, offset, physaddr, expected_physaddr);
559                                 return -1;
560                         }
561                 }
562
563                 /* zero out the whole segment */
564                 memset(hugepg_tbl[page_idx].final_va, 0, total_size);
565
566                 page_idx++;
567         }
568
569         /* zero out the rest */
570         memset(&hugepg_tbl[page_idx], 0, (hpi->num_pages[0] - page_idx) * sizeof(struct hugepage_file));
571         return page_idx;
572 }
573 #else /* RTE_EAL_SINGLE_FILE_SEGMENTS=n */
574
575 /* Unmap all hugepages from original mapping */
576 static int
577 unmap_all_hugepages_orig(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
578 {
579         unsigned i;
580         for (i = 0; i < hpi->num_pages[0]; i++) {
581                 if (hugepg_tbl[i].orig_va) {
582                         munmap(hugepg_tbl[i].orig_va, hpi->hugepage_sz);
583                         hugepg_tbl[i].orig_va = NULL;
584                 }
585         }
586         return 0;
587 }
588 #endif /* RTE_EAL_SINGLE_FILE_SEGMENTS */
589
590 /*
591  * Parse /proc/self/numa_maps to get the NUMA socket ID for each huge
592  * page.
593  */
594 static int
595 find_numasocket(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
596 {
597         int socket_id;
598         char *end, *nodestr;
599         unsigned i, hp_count = 0;
600         uint64_t virt_addr;
601         char buf[BUFSIZ];
602         char hugedir_str[PATH_MAX];
603         FILE *f;
604
605         f = fopen("/proc/self/numa_maps", "r");
606         if (f == NULL) {
607                 RTE_LOG(INFO, EAL, "cannot open /proc/self/numa_maps,"
608                                 " assuming all memory is on socket 0\n");
609                 return 0;
610         }
611
612         snprintf(hugedir_str, sizeof(hugedir_str),
613                         "%s/%s", hpi->hugedir, internal_config.hugefile_prefix);
614
615         /* parse numa map */
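            /* a numa_maps line looks roughly like (format may vary by kernel):
             *   2aaaaac00000 default file=/mnt/huge/rtemap_0 huge dirty=1 N0=1
             * below we extract the start address and the "N<node>=" field */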
616         while (fgets(buf, sizeof(buf), f) != NULL) {
617
618                 /* ignore non huge page */
619                 if (strstr(buf, " huge ") == NULL &&
620                                 strstr(buf, hugedir_str) == NULL)
621                         continue;
622
623                 /* get zone addr */
624                 virt_addr = strtoull(buf, &end, 16);
625                 if (virt_addr == 0 || end == buf) {
626                         RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
627                         goto error;
628                 }
629
630                 /* get node id (socket id) */
631                 nodestr = strstr(buf, " N");
632                 if (nodestr == NULL) {
633                         RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
634                         goto error;
635                 }
636                 nodestr += 2;
637                 end = strstr(nodestr, "=");
638                 if (end == NULL) {
639                         RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
640                         goto error;
641                 }
642                 end[0] = '\0';
643                 end = NULL;
644
645                 socket_id = strtoul(nodestr, &end, 0);
646                 if ((nodestr[0] == '\0') || (end == NULL) || (*end != '\0')) {
647                         RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
648                         goto error;
649                 }
650
651                 /* if we find this page in our mappings, set socket_id */
652                 for (i = 0; i < hpi->num_pages[0]; i++) {
653                         void *va = (void *)(unsigned long)virt_addr;
654                         if (hugepg_tbl[i].orig_va == va) {
655                                 hugepg_tbl[i].socket_id = socket_id;
656                                 hp_count++;
657                         }
658                 }
659         }
660
661         if (hp_count < hpi->num_pages[0])
662                 goto error;
663
664         fclose(f);
665         return 0;
666
667 error:
668         fclose(f);
669         return -1;
670 }
671
672 /*
673  * Sort the hugepg_tbl by physical address (lower addresses first on x86,
674  * higher addresses first on powerpc). We use a slow algorithm, but we won't
675  * have millions of pages, and this is only done at init time.
676  */
677 static int
678 sort_by_physaddr(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
679 {
680         unsigned i, j;
681         int compare_idx;
682         uint64_t compare_addr;
683         struct hugepage_file tmp;
684
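            /* simple selection sort: for each position i, find the entry with the
             * lowest (highest on PPC64) physical address among the remaining ones
             * and swap it into place */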
685         for (i = 0; i < hpi->num_pages[0]; i++) {
686                 compare_addr = 0;
687                 compare_idx = -1;
688
689                 /*
690                  * browse all entries starting at 'i', and find the
691                  * entry with the smallest addr
692                  */
693                 for (j = i; j < hpi->num_pages[0]; j++) {
694
695                         if (compare_addr == 0 ||
696 #ifdef RTE_ARCH_PPC_64
697                                 hugepg_tbl[j].physaddr > compare_addr) {
698 #else
699                                 hugepg_tbl[j].physaddr < compare_addr) {
700 #endif
701                                 compare_addr = hugepg_tbl[j].physaddr;
702                                 compare_idx = j;
703                         }
704                 }
705
706                 /* should not happen */
707                 if (compare_idx == -1) {
708                         RTE_LOG(ERR, EAL, "%s(): error in physaddr sorting\n", __func__);
709                         return -1;
710                 }
711
712                 /* swap the 2 entries in the table */
713                 memcpy(&tmp, &hugepg_tbl[compare_idx],
714                         sizeof(struct hugepage_file));
715                 memcpy(&hugepg_tbl[compare_idx], &hugepg_tbl[i],
716                         sizeof(struct hugepage_file));
717                 memcpy(&hugepg_tbl[i], &tmp, sizeof(struct hugepage_file));
718         }
719         return 0;
720 }
721
722 /*
723  * Uses mmap to create a shared memory area for storage of data
724  * Used in this file to store the hugepage file map on disk
725  */
726 static void *
727 create_shared_memory(const char *filename, const size_t mem_size)
728 {
729         void *retval;
730         int fd = open(filename, O_CREAT | O_RDWR, 0666);
731         if (fd < 0)
732                 return NULL;
733         if (ftruncate(fd, mem_size) < 0) {
734                 close(fd);
735                 return NULL;
736         }
737         retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
738         close(fd);
739         return retval;
740 }
741
742 /*
743  * this copies *active* hugepages from one hugepage table to another.
744  * destination is typically the shared memory.
745  */
746 static int
747 copy_hugepages_to_shared_mem(struct hugepage_file * dst, int dest_size,
748                 const struct hugepage_file * src, int src_size)
749 {
750         int src_pos, dst_pos = 0;
751
752         for (src_pos = 0; src_pos < src_size; src_pos++) {
753                 if (src[src_pos].final_va != NULL) {
754                         /* error on overflow attempt */
755                         if (dst_pos == dest_size)
756                                 return -1;
757                         memcpy(&dst[dst_pos], &src[src_pos], sizeof(struct hugepage_file));
758                         dst_pos++;
759                 }
760         }
761         return 0;
762 }
763
764 /*
765  * unmaps hugepages that are not going to be used. since we originally allocate
766  * ALL hugepages (not just those we need), additional unmapping needs to be done.
767  */
768 static int
769 unmap_unneeded_hugepages(struct hugepage_file *hugepg_tbl,
770                 struct hugepage_info *hpi,
771                 unsigned num_hp_info)
772 {
773         unsigned socket, size;
774         int page, nrpages = 0;
775
776         /* get total number of hugepages */
777         for (size = 0; size < num_hp_info; size++)
778                 for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++)
779                         nrpages += internal_config.hugepage_info[size].num_pages[socket];
780
781         for (size = 0; size < num_hp_info; size++) {
782                 for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) {
783                         unsigned pages_found = 0;
784
785                         /* traverse until we have unmapped all the unused pages */
786                         for (page = 0; page < nrpages; page++) {
787                                 struct hugepage_file *hp = &hugepg_tbl[page];
788
789 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
790                                 /* if this page was already cleared */
791                                 if (hp->final_va == NULL)
792                                         continue;
793 #endif
794
795                                 /* find a page that matches the criteria */
796                                 if ((hp->size == hpi[size].hugepage_sz) &&
797                                                 (hp->socket_id == (int) socket)) {
798
799                                         /* if we skipped enough pages, unmap the rest */
800                                         if (pages_found == hpi[size].num_pages[socket]) {
801                                                 uint64_t unmap_len;
802
803 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
804                                                 unmap_len = hp->size * hp->repeated;
805 #else
806                                                 unmap_len = hp->size;
807 #endif
808
809                                                 /* get start addr and len of the remaining segment */
810                                                 munmap(hp->final_va, (size_t) unmap_len);
811
812                                                 hp->final_va = NULL;
813                                                 if (unlink(hp->filepath) == -1) {
814                                                         RTE_LOG(ERR, EAL, "%s(): Removing %s failed: %s\n",
815                                                                         __func__, hp->filepath, strerror(errno));
816                                                         return -1;
817                                                 }
818                                         }
819 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
820                                         /* else, check how much we need to map */
821                                         else {
822                                                 int nr_pg_left =
823                                                                 hpi[size].num_pages[socket] - pages_found;
824
825                                                 /* if the whole segment fits within the pages we still need */
826                                                 if (hp->repeated <= nr_pg_left) {
827                                                         pages_found += hp->repeated;
828                                                 }
829                                                 /* truncate the segment */
830                                                 else {
831                                                         uint64_t final_size = nr_pg_left * hp->size;
832                                                         uint64_t seg_size = hp->repeated * hp->size;
833
834                                                         void * unmap_va = RTE_PTR_ADD(hp->final_va,
835                                                                         final_size);
836                                                         int fd;
837
838                                                         munmap(unmap_va, seg_size - final_size);
839
840                                                         fd = open(hp->filepath, O_RDWR);
841                                                         if (fd < 0) {
842                                                                 RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
843                                                                                 hp->filepath, strerror(errno));
844                                                                 return -1;
845                                                         }
846                                                         if (ftruncate(fd, final_size) < 0) {
847                                                                 RTE_LOG(ERR, EAL, "Cannot truncate %s: %s\n",
848                                                                                 hp->filepath, strerror(errno));
849                                                                 return -1;
850                                                         }
851                                                         close(fd);
852
853                                                         pages_found += nr_pg_left;
854                                                         hp->repeated = nr_pg_left;
855                                                 }
856                                         }
857 #else
858                                         /* else, this page is needed, so count it and continue */
859                                         else
860                                                 pages_found++;
861 #endif
862
863                                 } /* match page */
864                         } /* foreach page */
865                 } /* foreach socket */
866         } /* foreach pagesize */
867
868         return 0;
869 }
870
871 static inline uint64_t
872 get_socket_mem_size(int socket)
873 {
874         uint64_t size = 0;
875         unsigned i;
876
877         for (i = 0; i < internal_config.num_hugepage_sizes; i++){
878                 struct hugepage_info *hpi = &internal_config.hugepage_info[i];
879                 if (hpi->hugedir != NULL)
880                         size += hpi->hugepage_sz * hpi->num_pages[socket];
881         }
882
883         return (size);
884 }
885
886 /*
887  * This function is a NUMA-aware equivalent of calc_num_pages.
888  * It takes in the list of hugepage sizes and the
889  * number of pages thereof, and calculates the best number of
890  * pages of each size to fulfill the request for <memory> RAM.
891  */
892 static int
893 calc_num_pages_per_socket(uint64_t * memory,
894                 struct hugepage_info *hp_info,
895                 struct hugepage_info *hp_used,
896                 unsigned num_hp_info)
897 {
898         unsigned socket, j, i = 0;
899         unsigned requested, available;
900         int total_num_pages = 0;
901         uint64_t remaining_mem, cur_mem;
902         uint64_t total_mem = internal_config.memory;
903
904         if (num_hp_info == 0)
905                 return -1;
906
907         /* if specific memory amounts per socket weren't requested */
908         if (internal_config.force_sockets == 0) {
909                 int cpu_per_socket[RTE_MAX_NUMA_NODES];
910                 size_t default_size, total_size;
911                 unsigned lcore_id;
912
913                 /* Compute number of cores per socket */
914                 memset(cpu_per_socket, 0, sizeof(cpu_per_socket));
915                 RTE_LCORE_FOREACH(lcore_id) {
916                         cpu_per_socket[rte_lcore_to_socket_id(lcore_id)]++;
917                 }
918
919                 /*
920                  * Automatically spread requested memory amongst detected sockets according
921                  * to number of cores from cpu mask present on each socket
922                  */
923                 total_size = internal_config.memory;
924                 for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) {
925
926                         /* Set memory amount per socket */
927                         default_size = (internal_config.memory * cpu_per_socket[socket])
928                                         / rte_lcore_count();
929
930                         /* Limit to maximum available memory on socket */
931                         default_size = RTE_MIN(default_size, get_socket_mem_size(socket));
932
933                         /* Update sizes */
934                         memory[socket] = default_size;
935                         total_size -= default_size;
936                 }
937
938                 /*
939                  * If some memory is remaining, try to allocate it by getting all
940                  * available memory from sockets, one after the other
941                  */
942                 for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0; socket++) {
943                         /* take whatever is available */
944                         default_size = RTE_MIN(get_socket_mem_size(socket) - memory[socket],
945                                                total_size);
946
947                         /* Update sizes */
948                         memory[socket] += default_size;
949                         total_size -= default_size;
950                 }
951         }
952
953         for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) {
954                 /* skip this socket if no memory was requested for it */
955                 for (i = 0; i < num_hp_info && memory[socket] != 0; i++){
956                         hp_used[i].hugedir = hp_info[i].hugedir;
957                         hp_used[i].num_pages[socket] = RTE_MIN(
958                                         memory[socket] / hp_info[i].hugepage_sz,
959                                         hp_info[i].num_pages[socket]);
960
961                         cur_mem = hp_used[i].num_pages[socket] *
962                                         hp_used[i].hugepage_sz;
963
964                         memory[socket] -= cur_mem;
965                         total_mem -= cur_mem;
966
967                         total_num_pages += hp_used[i].num_pages[socket];
968
969                         /* check if we have met all memory requests */
970                         if (memory[socket] == 0)
971                                 break;
972
973                         /* check if we have any more pages left at this size, if so
974                          * move on to next size */
975                         if (hp_used[i].num_pages[socket] == hp_info[i].num_pages[socket])
976                                 continue;
977                         /* At this point we know that there are still pages of this size
978                          * available, but each is bigger than the memory we still need,
979                          * so let's see if we can get enough from other page sizes.
980                          */
981                         remaining_mem = 0;
982                         for (j = i+1; j < num_hp_info; j++)
983                                 remaining_mem += hp_info[j].hugepage_sz *
984                                 hp_info[j].num_pages[socket];
985
986                         /* is there enough other memory, if not allocate another page and quit */
987                         if (remaining_mem < memory[socket]){
988                                 cur_mem = RTE_MIN(memory[socket],
989                                                 hp_info[i].hugepage_sz);
990                                 memory[socket] -= cur_mem;
991                                 total_mem -= cur_mem;
992                                 hp_used[i].num_pages[socket]++;
993                                 total_num_pages++;
994                                 break; /* we are done with this socket*/
995                         }
996                 }
997                 /* if we didn't satisfy all memory requirements per socket */
998                 if (memory[socket] > 0) {
999                         /* to prevent icc errors */
1000                         requested = (unsigned) (internal_config.socket_mem[socket] /
1001                                         0x100000);
1002                         available = requested -
1003                                         ((unsigned) (memory[socket] / 0x100000));
1004                         RTE_LOG(INFO, EAL, "Not enough memory available on socket %u! "
1005                                         "Requested: %uMB, available: %uMB\n", socket,
1006                                         requested, available);
1007                         return -1;
1008                 }
1009         }
1010
1011         /* if we didn't satisfy total memory requirements */
1012         if (total_mem > 0) {
1013                 requested = (unsigned) (internal_config.memory / 0x100000);
1014                 available = requested - (unsigned) (total_mem / 0x100000);
1015                 RTE_LOG(INFO, EAL, "Not enough memory available! Requested: %uMB,"
1016                                 " available: %uMB\n", requested, available);
1017                 return -1;
1018         }
1019         return total_num_pages;
1020 }
1021
1022 /*
1023  * Prepare physical memory mapping: fill the configuration structure with
1024  * this information, return 0 on success.
1025  *  1. map N huge pages in separate files in hugetlbfs
1026  *  2. find associated physical addr
1027  *  3. find associated NUMA socket ID
1028  *  4. sort all huge pages by physical address
1029  *  5. remap these N huge pages in the correct order
1030  *  6. unmap the first mapping
1031  *  7. fill memsegs in configuration with contiguous zones
1032  */
1033 static int
1034 rte_eal_hugepage_init(void)
1035 {
1036         struct rte_mem_config *mcfg;
1037         struct hugepage_file *hugepage, *tmp_hp = NULL;
1038         struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
1039
1040         uint64_t memory[RTE_MAX_NUMA_NODES];
1041
1042         unsigned hp_offset;
1043         int i, j, new_memseg;
1044         int nr_hugefiles, nr_hugepages = 0;
1045         void *addr;
1046 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
1047         int new_pages_count[MAX_HUGEPAGE_SIZES];
1048 #endif
1049
1050         memset(used_hp, 0, sizeof(used_hp));
1051
1052         /* get pointer to global configuration */
1053         mcfg = rte_eal_get_configuration()->mem_config;
1054
1055         /* hugetlbfs can be disabled */
1056         if (internal_config.no_hugetlbfs) {
1057                 addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE,
1058                                 MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
1059                 if (addr == MAP_FAILED) {
1060                         RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__,
1061                                         strerror(errno));
1062                         return -1;
1063                 }
1064                 mcfg->memseg[0].phys_addr = (phys_addr_t)(uintptr_t)addr;
1065                 mcfg->memseg[0].addr = addr;
1066                 mcfg->memseg[0].len = internal_config.memory;
1067                 mcfg->memseg[0].socket_id = SOCKET_ID_ANY;
1068                 return 0;
1069         }
1070
1071         /* check if app runs on Xen Dom0 */
1072         if (internal_config.xen_dom0_support) {
1073 #ifdef RTE_LIBRTE_XEN_DOM0
1074                 /* use dom0_mm kernel driver to init memory */
1075                 if (rte_xen_dom0_memory_init() < 0)
1076                         return -1;
1077                 else
1078                         return 0;
1079 #endif
1080         }
1081
1082
1083         /* calculate the total number of hugepages available. At this point we haven't
1084          * yet started sorting them, so they are all on socket 0 */
1085         for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
1086                 /* meanwhile, also initialize used_hp hugepage sizes in used_hp */
1087                 used_hp[i].hugepage_sz = internal_config.hugepage_info[i].hugepage_sz;
1088
1089                 nr_hugepages += internal_config.hugepage_info[i].num_pages[0];
1090         }
1091
1092         /*
1093          * allocate a memory area for the hugepage table.
1094          * This isn't shared memory yet; because we need to do some
1095          * processing on these pages, shared memory will be created
1096          * at a later stage.
1097          */
1098         tmp_hp = malloc(nr_hugepages * sizeof(struct hugepage_file));
1099         if (tmp_hp == NULL)
1100                 goto fail;
1101
1102         memset(tmp_hp, 0, nr_hugepages * sizeof(struct hugepage_file));
1103
1104         hp_offset = 0; /* where we start the current page size entries */
1105
1106         /* map all hugepages and sort them */
1107         for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
1108                 struct hugepage_info *hpi;
1109
1110                 /*
1111                  * we don't yet mark hugepages as used at this stage, so
1112                  * we just map all hugepages available to the system;
1113                  * all hugepages are still located on socket 0
1114                  */
1115                 hpi = &internal_config.hugepage_info[i];
1116
1117                 if (hpi->num_pages[0] == 0)
1118                         continue;
1119
1120                 /* map all hugepages available */
1121                 if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
1122                         RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
1123                                         (unsigned)(hpi->hugepage_sz / 0x100000));
1124                         goto fail;
1125                 }
1126
1127                 /* find physical addresses and sockets for each hugepage */
1128                 if (find_physaddrs(&tmp_hp[hp_offset], hpi) < 0){
1129                         RTE_LOG(DEBUG, EAL, "Failed to find phys addr for %u MB pages\n",
1130                                         (unsigned)(hpi->hugepage_sz / 0x100000));
1131                         goto fail;
1132                 }
1133
1134                 if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0){
1135                         RTE_LOG(DEBUG, EAL, "Failed to find NUMA socket for %u MB pages\n",
1136                                         (unsigned)(hpi->hugepage_sz / 0x100000));
1137                         goto fail;
1138                 }
1139
1140                 if (sort_by_physaddr(&tmp_hp[hp_offset], hpi) < 0)
1141                         goto fail;
1142
1143 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
1144                 /* remap all hugepages into single file segments */
1145                 new_pages_count[i] = remap_all_hugepages(&tmp_hp[hp_offset], hpi);
1146                 if (new_pages_count[i] < 0){
1147                         RTE_LOG(DEBUG, EAL, "Failed to remap %u MB pages\n",
1148                                         (unsigned)(hpi->hugepage_sz / 0x100000));
1149                         goto fail;
1150                 }
1151
1152                 /* we have processed a num of hugepages of this size, so inc offset */
1153                 hp_offset += new_pages_count[i];
1154 #else
1155                 /* remap all hugepages */
1156                 if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) < 0){
1157                         RTE_LOG(DEBUG, EAL, "Failed to remap %u MB pages\n",
1158                                         (unsigned)(hpi->hugepage_sz / 0x100000));
1159                         goto fail;
1160                 }
1161
1162                 /* unmap original mappings */
1163                 if (unmap_all_hugepages_orig(&tmp_hp[hp_offset], hpi) < 0)
1164                         goto fail;
1165
1166                 /* we have processed a num of hugepages of this size, so inc offset */
1167                 hp_offset += hpi->num_pages[0];
1168 #endif
1169         }
1170
1171 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
1172         nr_hugefiles = 0;
1173         for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
1174                 nr_hugefiles += new_pages_count[i];
1175         }
1176 #else
1177         nr_hugefiles = nr_hugepages;
1178 #endif
1179
1180
1181         /* clean out the numbers of pages */
1182         for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++)
1183                 for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
1184                         internal_config.hugepage_info[i].num_pages[j] = 0;
1185
1186         /* get hugepages for each socket */
1187         for (i = 0; i < nr_hugefiles; i++) {
1188                 int socket = tmp_hp[i].socket_id;
1189
1190                 /* find a hugepage info with right size and increment num_pages */
1191                 for (j = 0; j < (int) internal_config.num_hugepage_sizes; j++) {
1192                         if (tmp_hp[i].size ==
1193                                         internal_config.hugepage_info[j].hugepage_sz) {
1194 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
1195                                         internal_config.hugepage_info[j].num_pages[socket] +=
1196                                                 tmp_hp[i].repeated;
1197 #else
1198                                 internal_config.hugepage_info[j].num_pages[socket]++;
1199 #endif
1200                         }
1201                 }
1202         }
1203
1204         /* make a copy of socket_mem, needed for number of pages calculation */
1205         for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
1206                 memory[i] = internal_config.socket_mem[i];
1207
1208         /* calculate final number of pages */
1209         nr_hugepages = calc_num_pages_per_socket(memory,
1210                         internal_config.hugepage_info, used_hp,
1211                         internal_config.num_hugepage_sizes);
1212
1213         /* error if not enough memory available */
1214         if (nr_hugepages < 0)
1215                 goto fail;
1216
1217         /* reporting in! */
1218         for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
1219                 for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
1220                         if (used_hp[i].num_pages[j] > 0) {
1221                                 RTE_LOG(INFO, EAL,
1222                                                 "Requesting %u pages of size %uMB"
1223                                                 " from socket %i\n",
1224                                                 used_hp[i].num_pages[j],
1225                                                 (unsigned)
1226                                                         (used_hp[i].hugepage_sz / 0x100000),
1227                                                 j);
1228                         }
1229                 }
1230         }
1231
1232         /* create shared memory */
1233         hugepage = create_shared_memory(eal_hugepage_info_path(),
1234                         nr_hugefiles * sizeof(struct hugepage_file));
1235
1236         if (hugepage == NULL) {
1237                 RTE_LOG(ERR, EAL, "Failed to create shared memory!\n");
1238                 goto fail;
1239         }
1240         memset(hugepage, 0, nr_hugefiles * sizeof(struct hugepage_file));
1241
1242         /*
1243          * unmap pages that we won't need (looks at used_hp).
1244          * also, sets final_va to NULL on pages that were unmapped.
1245          */
1246         if (unmap_unneeded_hugepages(tmp_hp, used_hp,
1247                         internal_config.num_hugepage_sizes) < 0) {
1248                 RTE_LOG(ERR, EAL, "Unmapping and locking hugepages failed!\n");
1249                 goto fail;
1250         }
1251
1252         /*
1253          * copy stuff from malloc'd hugepage* to the actual shared memory.
1254          * this procedure only copies those hugepages that have final_va
1255          * not NULL. has overflow protection.
1256          */
1257         if (copy_hugepages_to_shared_mem(hugepage, nr_hugefiles,
1258                         tmp_hp, nr_hugefiles) < 0) {
1259                 RTE_LOG(ERR, EAL, "Copying tables to shared memory failed!\n");
1260                 goto fail;
1261         }
1262
1263         /* free the temporary hugepage table */
1264         free(tmp_hp);
1265         tmp_hp = NULL;
1266
1267         /* find earliest free memseg - this is needed because in case of IVSHMEM,
1268          * segments might have already been initialized */
1269         for (j = 0; j < RTE_MAX_MEMSEG; j++)
1270                 if (mcfg->memseg[j].addr == NULL) {
1271                         /* move to previous segment and exit loop */
1272                         j--;
1273                         break;
1274                 }
1275
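             /* walk the sorted hugepage array and coalesce pages that are contiguous
              * in both physical and virtual address space (and that share the same
              * size and socket) into memory segments */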
1276         for (i = 0; i < nr_hugefiles; i++) {
1277                 new_memseg = 0;
1278
1279                 /* if this is a new section, create a new memseg */
1280                 if (i == 0)
1281                         new_memseg = 1;
1282                 else if (hugepage[i].socket_id != hugepage[i-1].socket_id)
1283                         new_memseg = 1;
1284                 else if (hugepage[i].size != hugepage[i-1].size)
1285                         new_memseg = 1;
1286
1287 #ifdef RTE_ARCH_PPC_64
1288                 /* On the PPC64 architecture, mmap always proceeds from higher
1289                  * virtual addresses to lower ones. Here, both the physical
1290                  * address and virtual address are in descending order */
1291                 else if ((hugepage[i-1].physaddr - hugepage[i].physaddr) !=
1292                     hugepage[i].size)
1293                         new_memseg = 1;
1294                 else if (((unsigned long)hugepage[i-1].final_va -
1295                     (unsigned long)hugepage[i].final_va) != hugepage[i].size)
1296                         new_memseg = 1;
1297 #else
1298                 else if ((hugepage[i].physaddr - hugepage[i-1].physaddr) !=
1299                     hugepage[i].size)
1300                         new_memseg = 1;
1301                 else if (((unsigned long)hugepage[i].final_va -
1302                     (unsigned long)hugepage[i-1].final_va) != hugepage[i].size)
1303                         new_memseg = 1;
1304 #endif
1305
1306                 if (new_memseg) {
1307                         j += 1;
1308                         if (j == RTE_MAX_MEMSEG)
1309                                 break;
1310
1311                         mcfg->memseg[j].phys_addr = hugepage[i].physaddr;
1312                         mcfg->memseg[j].addr = hugepage[i].final_va;
1313 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
1314                         mcfg->memseg[j].len = hugepage[i].size * hugepage[i].repeated;
1315 #else
1316                         mcfg->memseg[j].len = hugepage[i].size;
1317 #endif
1318                         mcfg->memseg[j].socket_id = hugepage[i].socket_id;
1319                         mcfg->memseg[j].hugepage_sz = hugepage[i].size;
1320                 }
1321                 /* continuation of previous memseg */
1322                 else {
1323 #ifdef RTE_ARCH_PPC_64
1324                 /* Use the physical and virtual addresses of the last page as
1325                  * the segment address on the IBM Power architecture */
1326                         mcfg->memseg[j].phys_addr = hugepage[i].physaddr;
1327                         mcfg->memseg[j].addr = hugepage[i].final_va;
1328 #endif
1329                         mcfg->memseg[j].len += mcfg->memseg[j].hugepage_sz;
1330                 }
1331                 hugepage[i].memseg_id = j;
1332         }
1333
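        /*
         * the loop above breaks out early once all memseg slots are used; if
         * that happened, some hugepages could not be placed in any memseg.
         */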
1334         if (i < nr_hugefiles) {
1335                 RTE_LOG(ERR, EAL, "Can only reserve %d pages "
1336                         "from %d requested\n"
1337                         "Current %s=%d is not enough\n"
1338                         "Please either increase it or request a smaller "
1339                         "amount of memory.\n",
1340                         i, nr_hugefiles, RTE_STR(CONFIG_RTE_MAX_MEMSEG),
1341                         RTE_MAX_MEMSEG);
1342                 return (-ENOMEM);
1343         }
1344
1345         return 0;
1346
1347 fail:
1348         if (tmp_hp)
1349                 free(tmp_hp);
1350         return -1;
1351 }
1352
1353 /*
1354  * uses fstat to report the size of a file on disk
1355  */
1356 static off_t
1357 getFileSize(int fd)
1358 {
1359         struct stat st;
1360         if (fstat(fd, &st) < 0)
1361                 return 0;
1362         return st.st_size;
1363 }
1364
1365 /*
1366  * This creates the memory mappings in the secondary process to match those of
1367  * the primary process. It goes through each memory segment in the DPDK runtime
1368  * configuration and finds the hugepages which form that segment, mapping them
1369  * in order to form a contiguous block in the virtual memory space
1370  */
1371 static int
1372 rte_eal_hugepage_attach(void)
1373 {
1374         const struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1375         const struct hugepage_file *hp = NULL;
1376         unsigned num_hp = 0;
1377         unsigned i, s = 0; /* s used to track the segment number */
1378         off_t size;
1379         int fd, fd_zero = -1, fd_hugepage = -1;
1380
1381         if (aslr_enabled() > 0) {
1382                 RTE_LOG(WARNING, EAL, "WARNING: Address Space Layout Randomization "
1383                                 "(ASLR) is enabled in the kernel.\n");
1384                 RTE_LOG(WARNING, EAL, "   This may cause issues with mapping memory "
1385                                 "into secondary processes\n");
1386         }
1387
1388         if (internal_config.xen_dom0_support) {
1389 #ifdef RTE_LIBRTE_XEN_DOM0
1390                 if (rte_xen_dom0_memory_attach() < 0) {
1391                         RTE_LOG(ERR, EAL, "Failed to attach memory segments of primary "
1392                                         "process\n");
1393                         return -1;
1394                 }
1395                 return 0;
1396 #endif
1397         }
1398
1399         fd_zero = open("/dev/zero", O_RDONLY);
1400         if (fd_zero < 0) {
1401                 RTE_LOG(ERR, EAL, "Could not open /dev/zero\n");
1402                 goto error;
1403         }
1404         fd_hugepage = open(eal_hugepage_info_path(), O_RDONLY);
1405         if (fd_hugepage < 0) {
1406                 RTE_LOG(ERR, EAL, "Could not open %s\n", eal_hugepage_info_path());
1407                 goto error;
1408         }
1409
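        /*
         * first pass: reserve each memseg's virtual address range at the same
         * address used by the primary process, by mapping /dev/zero read-only
         * and private. these placeholder mappings are replaced with the real
         * hugepage files in the second pass below.
         */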
1410         /* map all segments into memory to make sure we get the addresses */
1411         for (s = 0; s < RTE_MAX_MEMSEG; ++s) {
1412                 void *base_addr;
1413
1414                 /*
1415                  * the first memory segment with len==0 is the one that
1416                  * follows the last valid segment.
1417                  */
1418                 if (mcfg->memseg[s].len == 0)
1419                         break;
1420
1421 #ifdef RTE_LIBRTE_IVSHMEM
1422                 /*
1423                  * if segment has ioremap address set, it's an IVSHMEM segment and
1424                  * doesn't need mapping as it was already mapped earlier
1425                  */
1426                 if (mcfg->memseg[s].ioremap_addr != 0)
1427                         continue;
1428 #endif
1429
1430                 /*
1431                  * fd_zero is mmapped to get a contiguous block of virtual
1432                  * addresses of the appropriate memseg size.
1433                  * use mmap to get the same addresses as in the primary process.
1434                  */
1435                 base_addr = mmap(mcfg->memseg[s].addr, mcfg->memseg[s].len,
1436                                  PROT_READ, MAP_PRIVATE, fd_zero, 0);
1437                 if (base_addr == MAP_FAILED ||
1438                     base_addr != mcfg->memseg[s].addr) {
1439                         RTE_LOG(ERR, EAL, "Could not mmap %llu bytes "
1440                                 "in /dev/zero to requested address [%p]: '%s'\n",
1441                                 (unsigned long long)mcfg->memseg[s].len,
1442                                 mcfg->memseg[s].addr, strerror(errno));
1443                         if (aslr_enabled() > 0) {
1444                                 RTE_LOG(ERR, EAL, "It is recommended to "
1445                                         "disable ASLR in the kernel "
1446                                         "and retry running both primary "
1447                                         "and secondary processes\n");
1448                         }
1449                         goto error;
1450                 }
1451         }
1452
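        /*
         * map the hugepage_file table written by the primary process, so we
         * know which hugetlbfs file backs each part of each memseg.
         */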
1453         size = getFileSize(fd_hugepage);
1454         hp = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd_hugepage, 0);
1455         if (hp == MAP_FAILED) {
1456                 RTE_LOG(ERR, EAL, "Could not mmap %s\n", eal_hugepage_info_path());
1457                 goto error;
1458         }
1459
1460         num_hp = size / sizeof(struct hugepage_file);
1461         RTE_LOG(DEBUG, EAL, "Analysing %u files\n", num_hp);
1462
1463         s = 0;
1464         while (s < RTE_MAX_MEMSEG && mcfg->memseg[s].len > 0) {
1465                 void *addr, *base_addr;
1466                 uintptr_t offset = 0;
1467                 size_t mapping_size;
1468 #ifdef RTE_LIBRTE_IVSHMEM
1469                 /*
1470                  * if segment has ioremap address set, it's an IVSHMEM segment and
1471                  * doesn't need mapping as it was already mapped earlier
1472                  */
1473                 if (mcfg->memseg[s].ioremap_addr != 0) {
1474                         s++;
1475                         continue;
1476                 }
1477 #endif
1478                 /*
1479                  * free previously mapped memory so we can map the
1480                  * hugepages into the space
1481                  */
1482                 base_addr = mcfg->memseg[s].addr;
1483                 munmap(base_addr, mcfg->memseg[s].len);
1484
1485                 /* find the hugepages for this segment and map them.
1486                  * we don't need to worry about order, as the primary process
1487                  * sorted the entries before it did the second mmap of them */
1488                 for (i = 0; i < num_hp && offset < mcfg->memseg[s].len; i++) {
1489                         if (hp[i].memseg_id == (int)s) {
1490                                 fd = open(hp[i].filepath, O_RDWR);
1491                                 if (fd < 0) {
1492                                         RTE_LOG(ERR, EAL, "Could not open %s\n",
1493                                                 hp[i].filepath);
1494                                         goto error;
1495                                 }
1496 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
1497                                 mapping_size = hp[i].size * hp[i].repeated;
1498 #else
1499                                 mapping_size = hp[i].size;
1500 #endif
1501                                 addr = mmap(RTE_PTR_ADD(base_addr, offset),
1502                                                 mapping_size, PROT_READ | PROT_WRITE,
1503                                                 MAP_SHARED, fd, 0);
1504                                 close(fd); /* close file both on success and on failure */
1505                                 if (addr == MAP_FAILED ||
1506                                                 addr != RTE_PTR_ADD(base_addr, offset)) {
1507                                         RTE_LOG(ERR, EAL, "Could not mmap %s\n",
1508                                                 hp[i].filepath);
1509                                         goto error;
1510                                 }
1511                                 offset += mapping_size;
1512                         }
1513                 }
1514                 RTE_LOG(DEBUG, EAL, "Mapped segment %u of size 0x%llx\n", s,
1515                                 (unsigned long long)mcfg->memseg[s].len);
1516                 s++;
1517         }
1518         /* unmap the hugepage config file, since we are done using it */
1519         munmap((void *)(uintptr_t)hp, size);
1520         close(fd_zero);
1521         close(fd_hugepage);
1522         return 0;
1523
1524 error:
1525         if (fd_zero >= 0)
1526                 close(fd_zero);
1527         if (fd_hugepage >= 0)
1528                 close(fd_hugepage);
1529         return -1;
1530 }
1531
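/*
 * Record the forced number of memory channels and ranks in the shared mem
 * config. Secondary processes return early, since the primary process has
 * already populated these values in the shared configuration.
 */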
1532 static int
1533 rte_eal_memdevice_init(void)
1534 {
1535         struct rte_config *config;
1536
1537         if (rte_eal_process_type() == RTE_PROC_SECONDARY)
1538                 return 0;
1539
1540         config = rte_eal_get_configuration();
1541         config->mem_config->nchannel = internal_config.force_nchannel;
1542         config->mem_config->nrank = internal_config.force_nrank;
1543
1544         return 0;
1545 }
1546
1547
1548 /* init memory subsystem */
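/* the primary process discovers and maps hugepages (rte_eal_hugepage_init);
 * secondary processes attach to the primary's mappings (rte_eal_hugepage_attach) */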
1549 int
1550 rte_eal_memory_init(void)
1551 {
1552         RTE_LOG(INFO, EAL, "Setting up memory...\n");
1553         const int retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
1554                         rte_eal_hugepage_init() :
1555                         rte_eal_hugepage_attach();
1556         if (retval < 0)
1557                 return -1;
1558
1559         if (internal_config.no_shconf == 0 && rte_eal_memdevice_init() < 0)
1560                 return -1;
1561
1562         return 0;
1563 }