ec43cdc3862fe4b83ed9edd13b3a498bb5be02bf
[dpdk.git] / lib / librte_eal / linuxapp / eal / eal_memory.c
1 /*-
2  *   BSD LICENSE
3  * 
4  *   Copyright(c) 2010-2012 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  * 
7  *   Redistribution and use in source and binary forms, with or without 
8  *   modification, are permitted provided that the following conditions 
9  *   are met:
10  * 
11  *     * Redistributions of source code must retain the above copyright 
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright 
14  *       notice, this list of conditions and the following disclaimer in 
15  *       the documentation and/or other materials provided with the 
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its 
18  *       contributors may be used to endorse or promote products derived 
19  *       from this software without specific prior written permission.
20  * 
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  * 
33  */
34 /*   BSD LICENSE
35  *
36  *   Copyright(c) 2013 6WIND.
37  *
38  *   Redistribution and use in source and binary forms, with or without
39  *   modification, are permitted provided that the following conditions
40  *   are met:
41  *
42  *     * Redistributions of source code must retain the above copyright
43  *       notice, this list of conditions and the following disclaimer.
44  *     * Redistributions in binary form must reproduce the above copyright
45  *       notice, this list of conditions and the following disclaimer in
46  *       the documentation and/or other materials provided with the
47  *       distribution.
48  *     * Neither the name of 6WIND S.A. nor the names of its
49  *       contributors may be used to endorse or promote products derived
50  *       from this software without specific prior written permission.
51  *
52  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
53  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
54  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
55  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
56  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
57  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
58  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
59  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
60  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
61  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
62  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
63  */
64
65 #include <errno.h>
66 #include <stdarg.h>
67 #include <stdlib.h>
68 #include <stdio.h>
69 #include <stdint.h>
70 #include <inttypes.h>
71 #include <string.h>
72 #include <stdarg.h>
73 #include <sys/mman.h>
74 #include <sys/types.h>
75 #include <sys/stat.h>
76 #include <sys/queue.h>
77 #include <sys/file.h>
78 #include <unistd.h>
79 #include <limits.h>
80 #include <errno.h>
81 #include <sys/ioctl.h>
82 #include <sys/time.h>
83 #include <sys/resource.h>
84
85 #include <rte_log.h>
86 #include <rte_memory.h>
87 #include <rte_memzone.h>
88 #include <rte_launch.h>
89 #include <rte_tailq.h>
90 #include <rte_eal.h>
91 #include <rte_eal_memconfig.h>
92 #include <rte_per_lcore.h>
93 #include <rte_lcore.h>
94 #include <rte_common.h>
95 #include <rte_string_fns.h>
96
97 #include "eal_private.h"
98 #include "eal_internal_cfg.h"
99 #include "eal_filesystem.h"
100 #include "eal_hugepages.h"
101
102 /**
103  * @file
104  * Huge page mapping under linux
105  *
106  * To reserve a big contiguous amount of memory, we use the hugepage
107  * feature of linux. For that, we need to have hugetlbfs mounted. This
108  * code will create many files in this directory (one per page) and
109  * map them in virtual memory. For each page, we will retrieve its
110  * physical address and remap it in order to have a virtual contiguous
111  * zone as well as a physical contiguous zone.
112  */
113
114
115 #define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space"
116
117 /*
118  * Check whether address-space layout randomization is enabled in
119  * the kernel. This is important for multi-process as it can prevent
120  * two processes mapping data to the same virtual address
121  * Returns:
122  *    0 - address space randomization disabled
123  *    1/2 - address space randomization enabled
124  *    negative error code on error
125  */
126 static int
127 aslr_enabled(void)
128 {
129         char c;
130         int retval, fd = open(RANDOMIZE_VA_SPACE_FILE, O_RDONLY);
131         if (fd < 0)
132                 return -errno;
133         retval = read(fd, &c, 1);
134         close(fd);
135         if (retval < 0)
136                 return -errno;
137         if (retval == 0)
138                 return -EIO;
139         switch (c) {
140                 case '0' : return 0;
141                 case '1' : return 1;
142                 case '2' : return 2;
143                 default: return -EINVAL;
144         }
145 }
146
147 /*
148  * Increase limit for open files for current process
149  */
150 static int
151 increase_open_file_limit(void)
152 {
153         struct rlimit limit;
154
155         /* read current limits */
156         if (getrlimit(RLIMIT_NOFILE, &limit) != 0) {
157                 RTE_LOG(ERR, EAL, "Error reading resource limit: %s\n",
158                                 strerror(errno));
159                 return -1;
160         }
161
162         /* check if current soft limit matches the hard limit */
163         if (limit.rlim_cur < limit.rlim_max) {
164                 /* set soft limit to match hard limit */
165                 limit.rlim_cur = limit.rlim_max;
166         }
167         else {
168                 /* we can't increase the soft limit so now we try to increase
169                  * soft and hard limit. this might fail when run as non-root.
170                  */
171                 limit.rlim_cur *= 2;
172                 limit.rlim_max *= 2;
173         }
174
175         /* set current resource limit */
176         if (setrlimit(RLIMIT_NOFILE, &limit) != 0) {
177                 RTE_LOG(ERR, EAL, "Error increasing open files limit: %s\n",
178                                 strerror(errno));
179                 return -1;
180         }
181
182         return 0;
183 }
184
185 /*
186  * Try to mmap *size bytes in /dev/zero. If it is succesful, return the
187  * pointer to the mmap'd area and keep *size unmodified. Else, retry
188  * with a smaller zone: decrease *size by hugepage_sz until it reaches
189  * 0. In this case, return NULL. Note: this function returns an address
190  * which is a multiple of hugepage size.
191  */
192 static void *
193 get_virtual_area(uint64_t *size, uint64_t hugepage_sz)
194 {
195         void *addr;
196         int fd;
197         long aligned_addr;
198
199         RTE_LOG(INFO, EAL, "Ask a virtual area of 0x%"PRIx64" bytes\n", *size);
200
201         fd = open("/dev/zero", O_RDONLY);
202         if (fd < 0){
203                 RTE_LOG(ERR, EAL, "Cannot open /dev/zero\n");
204                 return NULL;
205         }
206         do {
207                 addr = mmap(NULL, (*size) + hugepage_sz, PROT_READ, MAP_PRIVATE, fd, 0);
208                 if (addr == MAP_FAILED)
209                         *size -= hugepage_sz;
210         } while (addr == MAP_FAILED && *size > 0);
211
212         if (addr == MAP_FAILED) {
213                 close(fd);
214                 RTE_LOG(INFO, EAL, "Cannot get a virtual area\n");
215                 return NULL;
216         }
217
218         munmap(addr, (*size) + hugepage_sz);
219         close(fd);
220
221         /* align addr to a huge page size boundary */
222         aligned_addr = (long)addr;
223         aligned_addr += (hugepage_sz - 1);
224         aligned_addr &= (~(hugepage_sz - 1));
225         addr = (void *)(aligned_addr);
226
227         RTE_LOG(INFO, EAL, "Virtual area found at %p (size = 0x%"PRIx64")\n",
228                 addr, *size);
229
230         return addr;
231 }
232
233 /*
234  * Mmap all hugepages of hugepage table: it first open a file in
235  * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
236  * virtual address is stored in hugepg_tbl[i].orig_va, else it is stored
237  * in hugepg_tbl[i].final_va. The second mapping (when orig is 0) tries to
238  * map continguous physical blocks in contiguous virtual blocks.
239  */
240 static int
241 map_all_hugepages(struct hugepage *hugepg_tbl,
242                 struct hugepage_info *hpi, int orig)
243 {
244         int fd;
245         unsigned i;
246         void *virtaddr;
247         void *vma_addr = NULL;
248         uint64_t vma_len = 0;
249
250         for (i = 0; i < hpi->num_pages[0]; i++) {
251                 uint64_t hugepage_sz = hpi->hugepage_sz;
252
253                 if (orig) {
254                         hugepg_tbl[i].file_id = i;
255                         hugepg_tbl[i].size = hugepage_sz;
256                         eal_get_hugefile_path(hugepg_tbl[i].filepath,
257                                         sizeof(hugepg_tbl[i].filepath), hpi->hugedir,
258                                         hugepg_tbl[i].file_id);
259                         hugepg_tbl[i].filepath[sizeof(hugepg_tbl[i].filepath) - 1] = '\0';
260                 }
261 #ifndef RTE_ARCH_X86_64
262                 /* for 32-bit systems, don't remap 1G pages, just reuse original
263                  * map address as final map address.
264                  */
265                 else if (hugepage_sz == RTE_PGSIZE_1G){
266                         hugepg_tbl[i].final_va = hugepg_tbl[i].orig_va;
267                         hugepg_tbl[i].orig_va = NULL;
268                         continue;
269                 }
270 #endif
271                 else if (vma_len == 0) {
272                         unsigned j, num_pages;
273
274                         /* reserve a virtual area for next contiguous
275                          * physical block: count the number of
276                          * contiguous physical pages. */
277                         for (j = i+1; j < hpi->num_pages[0] ; j++) {
278                                 if (hugepg_tbl[j].physaddr !=
279                                     hugepg_tbl[j-1].physaddr + hugepage_sz)
280                                         break;
281                         }
282                         num_pages = j - i;
283                         vma_len = num_pages * hugepage_sz;
284
285                         /* get the biggest virtual memory area up to
286                          * vma_len. If it fails, vma_addr is NULL, so
287                          * let the kernel provide the address. */
288                         vma_addr = get_virtual_area(&vma_len, hpi->hugepage_sz);
289                         if (vma_addr == NULL)
290                                 vma_len = hugepage_sz;
291                 }
292
293                 /* try to create hugepage file */
294                 fd = open(hugepg_tbl[i].filepath, O_CREAT | O_RDWR, 0755);
295                 if (fd < 0) {
296                         RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
297                                         strerror(errno));
298                         return -1;
299                 }
300
301                 virtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE,
302                                 MAP_SHARED, fd, 0);
303                 if (virtaddr == MAP_FAILED) {
304                         RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
305                                         strerror(errno));
306                         close(fd);
307                         return -1;
308                 }
309
310                 if (orig) {
311                         hugepg_tbl[i].orig_va = virtaddr;
312                         memset(virtaddr, 0, hugepage_sz);
313                 }
314                 else {
315                         hugepg_tbl[i].final_va = virtaddr;
316                 }
317
318                 /* close the file descriptor, files will be locked later */
319                 close(fd);
320
321                 vma_addr = (char *)vma_addr + hugepage_sz;
322                 vma_len -= hugepage_sz;
323         }
324         return 0;
325 }
326
327 /* Unmap all hugepages from original mapping. */
328 static int
329 unmap_all_hugepages_orig(struct hugepage *hugepg_tbl, struct hugepage_info *hpi)
330 {
331         unsigned i;
332         for (i = 0; i < hpi->num_pages[0]; i++) {
333                 if (hugepg_tbl[i].orig_va) {
334                         munmap(hugepg_tbl[i].orig_va, hpi->hugepage_sz);
335                         hugepg_tbl[i].orig_va = NULL;
336                 }
337         }
338         return 0;
339 }
340
341 /*
342  * For each hugepage in hugepg_tbl, fill the physaddr value. We find
343  * it by browsing the /proc/self/pagemap special file.
344  */
345 static int
346 find_physaddr(struct hugepage *hugepg_tbl, struct hugepage_info *hpi)
347 {
348         int fd;
349         unsigned i;
350         uint64_t page;
351         unsigned long virt_pfn;
352         int page_size;
353
354         /* standard page size */
355         page_size = getpagesize();
356
357         fd = open("/proc/self/pagemap", O_RDONLY);
358         if (fd < 0) {
359                 RTE_LOG(ERR, EAL, "%s(): cannot open /proc/self/pagemap: %s\n",
360                         __func__, strerror(errno));
361                 return -1;
362         }
363
364         for (i = 0; i < hpi->num_pages[0]; i++) {
365                 off_t offset;
366                 virt_pfn = (unsigned long)hugepg_tbl[i].orig_va /
367                         page_size;
368                 offset = sizeof(uint64_t) * virt_pfn;
369                 if (lseek(fd, offset, SEEK_SET) == (off_t) -1) {
370                         RTE_LOG(ERR, EAL, "%s(): seek error in /proc/self/pagemap: %s\n",
371                                         __func__, strerror(errno));
372                         close(fd);
373                         return -1;
374                 }
375                 if (read(fd, &page, sizeof(uint64_t)) < 0) {
376                         RTE_LOG(ERR, EAL, "%s(): cannot read /proc/self/pagemap: %s\n",
377                                         __func__, strerror(errno));
378                         close(fd);
379                         return -1;
380                 }
381
382                 /*
383                  * the pfn (page frame number) are bits 0-54 (see
384                  * pagemap.txt in linux Documentation)
385                  */
386                 hugepg_tbl[i].physaddr = ((page & 0x7fffffffffffffULL) * page_size);
387         }
388         close(fd);
389         return 0;
390 }
391
392 /*
393  * Parse /proc/self/numa_maps to get the NUMA socket ID for each huge
394  * page.
395  */
396 static int
397 find_numasocket(struct hugepage *hugepg_tbl, struct hugepage_info *hpi)
398 {
399         int socket_id;
400         char *end, *nodestr;
401         unsigned i, hp_count = 0;
402         uint64_t virt_addr;
403         char buf[BUFSIZ];
404         char hugedir_str[PATH_MAX];
405         FILE *f;
406
407         f = fopen("/proc/self/numa_maps", "r");
408         if (f == NULL) {
409                 RTE_LOG(INFO, EAL, "cannot open /proc/self/numa_maps,"
410                                 " consider that all memory is in socket_id 0\n");
411                 return 0;
412         }
413
414         rte_snprintf(hugedir_str, sizeof(hugedir_str),
415                         "%s/", hpi->hugedir);
416
417         /* parse numa map */
418         while (fgets(buf, sizeof(buf), f) != NULL) {
419
420                 /* ignore non huge page */
421                 if (strstr(buf, " huge ") == NULL &&
422                                 strstr(buf, hugedir_str) == NULL)
423                         continue;
424
425                 /* get zone addr */
426                 virt_addr = strtoull(buf, &end, 16);
427                 if (virt_addr == 0 || end == buf) {
428                         RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
429                         goto error;
430                 }
431
432                 /* get node id (socket id) */
433                 nodestr = strstr(buf, " N");
434                 if (nodestr == NULL) {
435                         RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
436                         goto error;
437                 }
438                 nodestr += 2;
439                 end = strstr(nodestr, "=");
440                 if (end == NULL) {
441                         RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
442                         goto error;
443                 }
444                 end[0] = '\0';
445                 end = NULL;
446
447                 socket_id = strtoul(nodestr, &end, 0);
448                 if ((nodestr[0] == '\0') || (end == NULL) || (*end != '\0')) {
449                         RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
450                         goto error;
451                 }
452
453                 /* if we find this page in our mappings, set socket_id */
454                 for (i = 0; i < hpi->num_pages[0]; i++) {
455                         void *va = (void *)(unsigned long)virt_addr;
456                         if (hugepg_tbl[i].orig_va == va) {
457                                 hugepg_tbl[i].socket_id = socket_id;
458                                 hp_count++;
459                         }
460                 }
461         }
462
463         if (hp_count < hpi->num_pages[0])
464                 goto error;
465
466         fclose(f);
467         return 0;
468
469 error:
470         fclose(f);
471         return -1;
472 }
473
474 /*
475  * Sort the hugepg_tbl by physical address (lower addresses first). We
476  * use a slow algorithm, but we won't have millions of pages, and this
477  * is only done at init time.
478  */
479 static int
480 sort_by_physaddr(struct hugepage *hugepg_tbl, struct hugepage_info *hpi)
481 {
482         unsigned i, j;
483         int smallest_idx;
484         uint64_t smallest_addr;
485         struct hugepage tmp;
486
487         for (i = 0; i < hpi->num_pages[0]; i++) {
488                 smallest_addr = 0;
489                 smallest_idx = -1;
490
491                 /*
492                  * browse all entries starting at 'i', and find the
493                  * entry with the smallest addr
494                  */
495                 for (j=i; j< hpi->num_pages[0]; j++) {
496
497                         if (smallest_addr == 0 ||
498                             hugepg_tbl[j].physaddr < smallest_addr) {
499                                 smallest_addr = hugepg_tbl[j].physaddr;
500                                 smallest_idx = j;
501                         }
502                 }
503
504                 /* should not happen */
505                 if (smallest_idx == -1) {
506                         RTE_LOG(ERR, EAL, "%s(): error in physaddr sorting\n", __func__);
507                         return -1;
508                 }
509
510                 /* swap the 2 entries in the table */
511                 memcpy(&tmp, &hugepg_tbl[smallest_idx], sizeof(struct hugepage));
512                 memcpy(&hugepg_tbl[smallest_idx], &hugepg_tbl[i],
513                                 sizeof(struct hugepage));
514                 memcpy(&hugepg_tbl[i], &tmp, sizeof(struct hugepage));
515         }
516         return 0;
517 }
518
/*
 * Uses mmap to create a shared memory area for storage of data.
 * Used in this file to store the hugepage file map on disk.
 *
 * The file is opened (created with mode 0666 if missing), resized to
 * mem_size bytes and mapped read/write with MAP_SHARED. The descriptor
 * is closed before returning; the mapping stays valid.
 *
 * Returns the mapped address, or NULL on any failure (open, ftruncate
 * or mmap), so that callers have a single error value to test.
 */
static void *
create_shared_memory(const char *filename, const size_t mem_size)
{
	void *retval;
	int fd = open(filename, O_CREAT | O_RDWR, 0666);

	if (fd < 0)
		return NULL;
	if (ftruncate(fd, mem_size) < 0) {
		close(fd);
		return NULL;
	}
	retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	close(fd);

	/* mmap reports failure with MAP_FAILED, not NULL: normalize it */
	if (retval == MAP_FAILED)
		return NULL;
	return retval;
}
538
539 /*
540  * this copies *active* hugepages from one hugepage table to another.
541  * destination is typically the shared memory.
542  */
543 static int
544 copy_hugepages_to_shared_mem(struct hugepage * dst, int dest_size,
545                 const struct hugepage * src, int src_size)
546 {
547         int src_pos, dst_pos = 0;
548
549         for (src_pos = 0; src_pos < src_size; src_pos++) {
550                 if (src[src_pos].final_va != NULL) {
551                         /* error on overflow attempt */
552                         if (dst_pos == dest_size)
553                                 return -1;
554                         memcpy(&dst[dst_pos], &src[src_pos], sizeof(struct hugepage));
555                         dst_pos++;
556                 }
557         }
558         return 0;
559 }
560
561 /*
562  * unmaps hugepages that are not going to be used. since we originally allocate
563  * ALL hugepages (not just those we need), additional unmapping needs to be done.
564  */
565 static int
566 unmap_unneeded_hugepages(struct hugepage *hugepg_tbl,
567                 struct hugepage_info *hpi,
568                 unsigned num_hp_info)
569 {
570         unsigned socket, size;
571         int page, nrpages = 0;
572         int fd;
573
574         /* get total number of hugepages */
575         for (size = 0; size < num_hp_info; size++)
576                 for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++)
577                         nrpages += internal_config.hugepage_info[size].num_pages[socket];
578
579         for (size = 0; size < num_hp_info; size++) {
580                 for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) {
581                         unsigned pages_found = 0;
582                         /* traverse until we have unmapped all the unused pages */
583                         for (page = 0; page < nrpages; page++) {
584                                 struct hugepage *hp = &hugepg_tbl[page];
585
586                                 /* find a page that matches the criteria */
587                                 if ((hp->size == hpi[size].hugepage_sz) &&
588                                                 (hp->socket_id == (int) socket)) {
589
590                                         /* if we skipped enough pages, unmap the rest */
591                                         if (pages_found == hpi[size].num_pages[socket]) {
592                                                 munmap(hp->final_va, hp->size);
593                                                 hp->final_va = NULL;
594                                         }
595                                         /* lock the page and skip */
596                                         else {
597                                                 /* try and open the hugepage file */
598                                                 while ((fd = open(hp->filepath, O_CREAT | O_RDWR, 0755)) < 0) {
599                                                         /* if we can't open due to resource limits */
600                                                         if (errno == EMFILE) {
601                                                                 RTE_LOG(INFO, EAL, "Increasing open file limit\n");
602
603                                                                 /* if we manage to increase resource limit, try again */
604                                                                 if (increase_open_file_limit() == 0)
605                                                                         continue;
606                                                         }
607                                                         else
608                                                                 RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
609                                                                                 strerror(errno));
610                                                         return -1;
611                                                 }
612                                                 /* try and lock the hugepage */
613                                                 if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
614                                                         RTE_LOG(ERR, EAL, "Locking hugepage file failed!\n");
615                                                         close(fd);
616                                                         return -1;
617                                                 }
618                                                 hp->page_lock = fd;
619                                                 pages_found++;
620                                         }
621                                 } /* match page */
622                         } /* foreach page */
623                 } /* foreach socket */
624         } /* foreach pagesize */
625
626         return 0;
627 }
628
629 static inline uint64_t
630 get_socket_mem_size(int socket)
631 {
632         uint64_t size = 0;
633         unsigned i;
634
635         for (i = 0; i < internal_config.num_hugepage_sizes; i++){
636                 struct hugepage_info *hpi = &internal_config.hugepage_info[i];
637                 if (hpi->hugedir != NULL)
638                         size += hpi->hugepage_sz * hpi->num_pages[socket];
639         }
640
641         return (size);
642 }
643
644 /*
645  * This function is a NUMA-aware equivalent of calc_num_pages.
646  * It takes in the list of hugepage sizes and the
647  * number of pages thereof, and calculates the best number of
648  * pages of each size to fulfill the request for <memory> ram
649  */
650 static int
651 calc_num_pages_per_socket(uint64_t * memory,
652                 struct hugepage_info *hp_info,
653                 struct hugepage_info *hp_used,
654                 unsigned num_hp_info)
655 {
656         unsigned socket, j, i = 0;
657         unsigned requested, available;
658         int total_num_pages = 0;
659         uint64_t remaining_mem, cur_mem;
660         uint64_t total_mem = internal_config.memory;
661
662         if (num_hp_info == 0)
663                 return -1;
664
665         for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) {
666                 /* if specific memory amounts per socket weren't requested */
667                 if (internal_config.force_sockets == 0) {
668                         /* take whatever is available */
669                         memory[socket] = RTE_MIN(get_socket_mem_size(socket),
670                                         total_mem);
671                 }
672                 /* skips if the memory on specific socket wasn't requested */
673                 for (i = 0; i < num_hp_info && memory[socket] != 0; i++){
674                         hp_used[i].hugedir = hp_info[i].hugedir;
675                         hp_used[i].num_pages[socket] = RTE_MIN(
676                                         memory[socket] / hp_info[i].hugepage_sz,
677                                         hp_info[i].num_pages[socket]);
678
679                         cur_mem = hp_used[i].num_pages[socket] *
680                                         hp_used[i].hugepage_sz;
681
682                         memory[socket] -= cur_mem;
683                         total_mem -= cur_mem;
684
685                         total_num_pages += hp_used[i].num_pages[socket];
686
687                         /* check if we have met all memory requests */
688                         if (memory[socket] == 0)
689                                 break;
690
691                         /* check if we have any more pages left at this size, if so
692                          * move on to next size */
693                         if (hp_used[i].num_pages[socket] == hp_info[i].num_pages[socket])
694                                 continue;
695                         /* At this point we know that there are more pages available that are
696                          * bigger than the memory we want, so lets see if we can get enough
697                          * from other page sizes.
698                          */
699                         remaining_mem = 0;
700                         for (j = i+1; j < num_hp_info; j++)
701                                 remaining_mem += hp_info[j].hugepage_sz *
702                                 hp_info[j].num_pages[socket];
703
704                         /* is there enough other memory, if not allocate another page and quit */
705                         if (remaining_mem < memory[socket]){
706                                 cur_mem = RTE_MIN(memory[socket],
707                                                 hp_info[i].hugepage_sz);
708                                 memory[socket] -= cur_mem;
709                                 total_mem -= cur_mem;
710                                 hp_used[i].num_pages[socket]++;
711                                 total_num_pages++;
712                                 break; /* we are done with this socket*/
713                         }
714                 }
715                 /* if we didn't satisfy all memory requirements per socket */
716                 if (memory[socket] > 0) {
717                         /* to prevent icc errors */
718                         requested = (unsigned) (internal_config.socket_mem[socket] /
719                                         0x100000);
720                         available = requested -
721                                         ((unsigned) (memory[socket] / 0x100000));
722                         RTE_LOG(INFO, EAL, "Not enough memory available on socket %u! "
723                                         "Requested: %uMB, available: %uMB\n", socket,
724                                         requested, available);
725                         return -1;
726                 }
727         }
728
729         /* if we didn't satisfy total memory requirements */
730         if (total_mem > 0) {
731                 requested = (unsigned) (internal_config.memory / 0x100000);
732                 available = requested - (unsigned) (total_mem / 0x100000);
733                 RTE_LOG(INFO, EAL, "Not enough memory available! Requested: %uMB,"
734                                 " available: %uMB\n", requested, available);
735                 return -1;
736         }
737         return total_num_pages;
738 }
739
740 /*
741  * Prepare physical memory mapping: fill configuration structure with
742  * these infos, return 0 on success.
743  *  1. map N huge pages in separate files in hugetlbfs
744  *  2. find associated physical addr
745  *  3. find associated NUMA socket ID
746  *  4. sort all huge pages by physical address
747  *  5. remap these N huge pages in the correct order
748  *  6. unmap the first mapping
749  *  7. fill memsegs in configuration with contiguous zones
750  */
751 static int
752 rte_eal_hugepage_init(void)
753 {
754         struct rte_mem_config *mcfg;
755         struct hugepage *hugepage, *tmp_hp = NULL;
756         struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
757
758         uint64_t memory[RTE_MAX_NUMA_NODES];
759
760         unsigned hp_offset;
761         int i, j, new_memseg;
762         int nrpages, total_pages = 0;
763         void *addr;
764
765         memset(used_hp, 0, sizeof(used_hp));
766
767         /* get pointer to global configuration */
768         mcfg = rte_eal_get_configuration()->mem_config;
769
770         /* for debug purposes, hugetlbfs can be disabled */
771         if (internal_config.no_hugetlbfs) {
772                 addr = malloc(internal_config.memory);
773                 mcfg->memseg[0].phys_addr = (phys_addr_t)(uintptr_t)addr;
774                 mcfg->memseg[0].addr = addr;
775                 mcfg->memseg[0].len = internal_config.memory;
776                 mcfg->memseg[0].socket_id = 0;
777                 return 0;
778         }
779
780
781         /* calculate total number of hugepages available. at this point we haven't
782          * yet started sorting them so they all are on socket 0 */
783         for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
784                 /* meanwhile, also initialize used_hp hugepage sizes in used_hp */
785                 used_hp[i].hugepage_sz = internal_config.hugepage_info[i].hugepage_sz;
786
787                 total_pages += internal_config.hugepage_info[i].num_pages[0];
788         }
789
790         /*
791          * allocate a memory area for hugepage table.
792          * this isn't shared memory yet. due to the fact that we need some
793          * processing done on these pages, shared memory will be created
794          * at a later stage.
795          */
796         tmp_hp = malloc(total_pages * sizeof(struct hugepage));
797         if (tmp_hp == NULL)
798                 goto fail;
799
800         memset(tmp_hp, 0, total_pages * sizeof(struct hugepage));
801
802         hp_offset = 0; /* where we start the current page size entries */
803
804         /* map all hugepages and sort them */
805         for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
806                 struct hugepage_info *hpi;
807
808                 /*
809                  * we don't yet mark hugepages as used at this stage, so
810                  * we just map all hugepages available to the system
811                  * all hugepages are still located on socket 0
812                  */
813                 hpi = &internal_config.hugepage_info[i];
814
815                 if (hpi->num_pages == 0)
816                         continue;
817
818                 /* map all hugepages available */
819                 if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
820                         RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
821                                         (unsigned)(hpi->hugepage_sz / 0x100000));
822                         goto fail;
823                 }
824
825                 /* find physical addresses and sockets for each hugepage */
826                 if (find_physaddr(&tmp_hp[hp_offset], hpi) < 0){
827                         RTE_LOG(DEBUG, EAL, "Failed to find phys addr for %u MB pages\n",
828                                         (unsigned)(hpi->hugepage_sz / 0x100000));
829                         goto fail;
830                 }
831
832                 if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0){
833                         RTE_LOG(DEBUG, EAL, "Failed to find NUMA socket for %u MB pages\n",
834                                         (unsigned)(hpi->hugepage_sz / 0x100000));
835                         goto fail;
836                 }
837
838                 if (sort_by_physaddr(&tmp_hp[hp_offset], hpi) < 0)
839                         goto fail;
840
841                 /* remap all hugepages */
842                 if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) < 0){
843                         RTE_LOG(DEBUG, EAL, "Failed to remap %u MB pages\n",
844                                         (unsigned)(hpi->hugepage_sz / 0x100000));
845                         goto fail;
846                 }
847
848                 /* unmap original mappings */
849                 if (unmap_all_hugepages_orig(&tmp_hp[hp_offset], hpi) < 0)
850                         goto fail;
851
852                 /* we have processed a num of hugepages of this size, so inc offset */
853                 hp_offset += hpi->num_pages[0];
854         }
855
856         /* clean out the numbers of pages */
857         for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++)
858                 for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
859                         internal_config.hugepage_info[i].num_pages[j] = 0;
860
861         /* get hugepages for each socket */
862         for (i = 0; i < total_pages; i++) {
863                 int socket = tmp_hp[i].socket_id;
864
865                 /* find a hugepage info with right size and increment num_pages */
866                 for (j = 0; j < (int) internal_config.num_hugepage_sizes; j++) {
867                         if (tmp_hp[i].size ==
868                                         internal_config.hugepage_info[j].hugepage_sz) {
869                                 internal_config.hugepage_info[j].num_pages[socket]++;
870                         }
871                 }
872         }
873
874         /* make a copy of socket_mem, needed for number of pages calculation */
875         for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
876                 memory[i] = internal_config.socket_mem[i];
877
878         /* calculate final number of pages */
879         nrpages = calc_num_pages_per_socket(memory,
880                         internal_config.hugepage_info, used_hp,
881                         internal_config.num_hugepage_sizes);
882
883         /* error if not enough memory available */
884         if (nrpages < 0)
885                 goto fail;
886
887         /* reporting in! */
888         for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
889                 for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
890                         if (used_hp[i].num_pages[j] > 0) {
891                                 RTE_LOG(INFO, EAL,
892                                                 "Requesting %u pages of size %uMB"
893                                                 " from socket %i\n",
894                                                 used_hp[i].num_pages[j],
895                                                 (unsigned)
896                                                         (used_hp[i].hugepage_sz / 0x100000),
897                                                 j);
898                         }
899                 }
900         }
901
902         /* create shared memory */
903         hugepage = create_shared_memory(eal_hugepage_info_path(),
904                                         nrpages * sizeof(struct hugepage));
905
906         if (hugepage == NULL) {
907                 RTE_LOG(ERR, EAL, "Failed to create shared memory!\n");
908                 goto fail;
909         }
910
911         /*
912          * unmap pages that we won't need (looks at used_hp).
913          * also, sets final_va to NULL on pages that were unmapped.
914          */
915         if (unmap_unneeded_hugepages(tmp_hp, used_hp,
916                         internal_config.num_hugepage_sizes) < 0) {
917                 RTE_LOG(ERR, EAL, "Unmapping and locking hugepages failed!\n");
918                 goto fail;
919         }
920
921         /*
922          * copy stuff from malloc'd hugepage* to the actual shared memory.
923          * this procedure only copies those hugepages that have final_va
924          * not NULL. has overflow protection.
925          */
926         if (copy_hugepages_to_shared_mem(hugepage, nrpages,
927                         tmp_hp, total_pages) < 0) {
928                 RTE_LOG(ERR, EAL, "Copying tables to shared memory failed!\n");
929                 goto fail;
930         }
931
932         /* free the temporary hugepage table */
933         free(tmp_hp);
934         tmp_hp = NULL;
935
936         memset(mcfg->memseg, 0, sizeof(mcfg->memseg));
937         j = -1;
938         for (i = 0; i < nrpages; i++) {
939                 new_memseg = 0;
940
941                 /* if this is a new section, create a new memseg */
942                 if (i == 0)
943                         new_memseg = 1;
944                 else if (hugepage[i].socket_id != hugepage[i-1].socket_id)
945                         new_memseg = 1;
946                 else if (hugepage[i].size != hugepage[i-1].size)
947                         new_memseg = 1;
948                 else if ((hugepage[i].physaddr - hugepage[i-1].physaddr) !=
949                     hugepage[i].size)
950                         new_memseg = 1;
951                 else if (((unsigned long)hugepage[i].final_va -
952                     (unsigned long)hugepage[i-1].final_va) != hugepage[i].size)
953                         new_memseg = 1;
954
955                 if (new_memseg) {
956                         j += 1;
957                         if (j == RTE_MAX_MEMSEG)
958                                 break;
959
960                         mcfg->memseg[j].phys_addr = hugepage[i].physaddr;
961                         mcfg->memseg[j].addr = hugepage[i].final_va;
962                         mcfg->memseg[j].len = hugepage[i].size;
963                         mcfg->memseg[j].socket_id = hugepage[i].socket_id;
964                         mcfg->memseg[j].hugepage_sz = hugepage[i].size;
965                 }
966                 /* continuation of previous memseg */
967                 else {
968                         mcfg->memseg[j].len += mcfg->memseg[j].hugepage_sz;
969                 }
970                 hugepage[i].memseg_id = j;
971         }
972
973         if (i < nrpages) {
974                 RTE_LOG(ERR, EAL, "Can only reserve %d pages "
975                         "from %d requested\n"
976                         "Current %s=%d is not enough\n"
977                         "Please either increase it or request less amount "
978                         "of memory.\n",
979                         i, nrpages, RTE_STR(CONFIG_RTE_MAX_MEMSEG),
980                         RTE_MAX_MEMSEG);
981                 return (-ENOMEM);
982         }
983         
984
985         return 0;
986
987
988 fail:
989         if (tmp_hp)
990                 free(tmp_hp);
991         return -1;
992 }
993
/*
 * Report the size, in bytes, of the file behind an open descriptor.
 * The size is taken from fstat(); when fstat() fails, 0 is returned.
 */
static off_t
getFileSize(int fd)
{
	struct stat info;
	const int rc = fstat(fd, &info);

	return (rc < 0) ? 0 : info.st_size;
}
1005
1006 /*
1007  * This creates the memory mappings in the secondary process to match that of
1008  * the server process. It goes through each memory segment in the DPDK runtime
1009  * configuration and finds the hugepages which form that segment, mapping them
1010  * in order to form a contiguous block in the virtual memory space
1011  */
1012 static int
1013 rte_eal_hugepage_attach(void)
1014 {
1015         const struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
1016         const struct hugepage *hp = NULL;
1017         unsigned num_hp = 0;
1018         unsigned i, s = 0; /* s used to track the segment number */
1019         off_t size;
1020         int fd, fd_zero = -1, fd_hugepage = -1;
1021
1022         if (aslr_enabled() > 0) {
1023                 RTE_LOG(WARNING, EAL, "WARNING: Address Space Layout Randomization "
1024                                 "(ASLR) is enabled in the kernel.\n");
1025                 RTE_LOG(WARNING, EAL, "   This may cause issues with mapping memory "
1026                                 "into secondary processes\n");
1027         }
1028
1029         fd_zero = open("/dev/zero", O_RDONLY);
1030         if (fd_zero < 0) {
1031                 RTE_LOG(ERR, EAL, "Could not open /dev/zero\n");
1032                 goto error;
1033         }
1034         fd_hugepage = open(eal_hugepage_info_path(), O_RDONLY);
1035         if (fd_hugepage < 0) {
1036                 RTE_LOG(ERR, EAL, "Could not open %s\n", eal_hugepage_info_path());
1037                 goto error;
1038         }
1039
1040         size = getFileSize(fd_hugepage);
1041         hp = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd_hugepage, 0);
1042         if (hp == NULL) {
1043                 RTE_LOG(ERR, EAL, "Could not mmap %s\n", eal_hugepage_info_path());
1044                 goto error;
1045         }
1046
1047         num_hp = size / sizeof(struct hugepage);
1048         RTE_LOG(DEBUG, EAL, "Analysing %u hugepages\n", num_hp);
1049
1050         while (s < RTE_MAX_MEMSEG && mcfg->memseg[s].len > 0){
1051                 void *addr, *base_addr;
1052                 uintptr_t offset = 0;
1053
1054                 /* fdzero is mmapped to get a contiguous block of virtual addresses
1055                  * get a block of free memory of the appropriate size -
1056                  * use mmap to attempt to get an identical address as server.
1057                  */
1058                 base_addr = mmap(mcfg->memseg[s].addr, mcfg->memseg[s].len,
1059                                 PROT_READ, MAP_PRIVATE, fd_zero, 0);
1060                 if (base_addr == MAP_FAILED || base_addr != mcfg->memseg[s].addr) {
1061                         RTE_LOG(ERR, EAL, "Could not mmap %llu bytes "
1062                                 "in /dev/zero to requested address [%p]\n",
1063                                 (unsigned long long)mcfg->memseg[s].len,
1064                                 mcfg->memseg[s].addr);
1065                         if (aslr_enabled() > 0)
1066                                 RTE_LOG(ERR, EAL, "It is recommended to disable ASLR in the kernel "
1067                                                 "and retry running both primary and secondary processes\n");
1068                         goto error;
1069                 }
1070                 /* free memory so we can map the hugepages into the space */
1071                 munmap(base_addr, mcfg->memseg[s].len);
1072
1073                 /* find the hugepages for this segment and map them
1074                  * we don't need to worry about order, as the server sorted the
1075                  * entries before it did the second mmap of them */
1076                 for (i = 0; i < num_hp && offset < mcfg->memseg[s].len; i++){
1077                         if (hp[i].memseg_id == (int)s){
1078                                 fd = open(hp[i].filepath, O_RDWR);
1079                                 if (fd < 0) {
1080                                         RTE_LOG(ERR, EAL, "Could not open %s\n",
1081                                                 hp[i].filepath);
1082                                         goto error;
1083                                 }
1084                                 addr = mmap(RTE_PTR_ADD(base_addr, offset),
1085                                                 hp[i].size, PROT_READ | PROT_WRITE,
1086                                                 MAP_SHARED | MAP_FIXED, fd, 0);
1087                                 close(fd); /* close file both on success and on failure */
1088                                 if (addr == MAP_FAILED) {
1089                                         RTE_LOG(ERR, EAL, "Could not mmap %s\n",
1090                                                 hp[i].filepath);
1091                                         goto error;
1092                                 }
1093                                 offset+=hp[i].size;
1094                         }
1095                 }
1096                 RTE_LOG(DEBUG, EAL, "Mapped segment %u of size 0x%llx\n", s,
1097                                 (unsigned long long)mcfg->memseg[s].len);
1098                 s++;
1099         }
1100         close(fd_zero);
1101         close(fd_hugepage);
1102         return 0;
1103
1104 error:
1105         if (fd_zero >= 0)
1106                 close(fd_zero);
1107         if (fd_hugepage >= 0)
1108                 close(fd_hugepage);
1109         return -1;
1110 }
1111
1112 static int
1113 rte_eal_memdevice_init(void)
1114 {
1115         struct rte_config *config;
1116
1117         if (rte_eal_process_type() == RTE_PROC_SECONDARY)
1118                 return 0;
1119
1120         config = rte_eal_get_configuration();
1121         config->mem_config->nchannel = internal_config.force_nchannel;
1122         config->mem_config->nrank = internal_config.force_nrank;
1123
1124         return 0;
1125 }
1126
1127
1128 /* init memory subsystem */
1129 int
1130 rte_eal_memory_init(void)
1131 {
1132         RTE_LOG(INFO, EAL, "Setting up hugepage memory...\n");
1133         const int retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
1134                         rte_eal_hugepage_init() :
1135                         rte_eal_hugepage_attach();
1136         if (retval < 0)
1137                 return -1;
1138
1139         if (internal_config.no_shconf == 0 && rte_eal_memdevice_init() < 0)
1140                 return -1;
1141
1142         return 0;
1143 }