memory: fix for multi process support
[dpdk.git] / lib / librte_eal / linuxapp / eal / eal_memory.c
1 /*-
2  *   BSD LICENSE
3  * 
4  *   Copyright(c) 2010-2012 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  * 
7  *   Redistribution and use in source and binary forms, with or without 
8  *   modification, are permitted provided that the following conditions 
9  *   are met:
10  * 
11  *     * Redistributions of source code must retain the above copyright 
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright 
14  *       notice, this list of conditions and the following disclaimer in 
15  *       the documentation and/or other materials provided with the 
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its 
18  *       contributors may be used to endorse or promote products derived 
19  *       from this software without specific prior written permission.
20  * 
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  * 
33  */
34
35 #include <errno.h>
36 #include <stdarg.h>
37 #include <stdlib.h>
38 #include <stdio.h>
39 #include <stdint.h>
40 #include <inttypes.h>
41 #include <string.h>
42 #include <stdarg.h>
43 #include <sys/mman.h>
44 #include <sys/types.h>
45 #include <sys/stat.h>
46 #include <sys/queue.h>
47 #include <sys/file.h>
48 #include <unistd.h>
49 #include <limits.h>
50 #include <errno.h>
51 #include <sys/ioctl.h>
52 #include <sys/time.h>
53 #include <sys/resource.h>
54
55 #include <rte_log.h>
56 #include <rte_memory.h>
57 #include <rte_memzone.h>
58 #include <rte_launch.h>
59 #include <rte_tailq.h>
60 #include <rte_eal.h>
61 #include <rte_eal_memconfig.h>
62 #include <rte_per_lcore.h>
63 #include <rte_lcore.h>
64 #include <rte_common.h>
65 #include <rte_string_fns.h>
66
67 #include "eal_private.h"
68 #include "eal_internal_cfg.h"
69 #include "eal_filesystem.h"
70 #include "eal_hugepages.h"
71
72 /**
73  * @file
74  * Huge page mapping under linux
75  *
76  * To reserve a big contiguous amount of memory, we use the hugepage
77  * feature of linux. For that, we need to have hugetlbfs mounted. This
78  * code will create many files in this directory (one per page) and
79  * map them in virtual memory. For each page, we will retrieve its
80  * physical address and remap it in order to have a virtual contiguous
81  * zone as well as a physical contiguous zone.
82  */
83
84
#define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space"

/*
 * Check whether address-space layout randomization (ASLR) is enabled in
 * the kernel. This is important for multi-process as ASLR can prevent
 * two processes mapping data to the same virtual address.
 *
 * The first byte of RANDOMIZE_VA_SPACE_FILE is read and interpreted as
 * the ASLR mode.
 *
 * Returns:
 *    0 - address space randomization disabled
 *    1/2 - address space randomization enabled
 *    negative error code on error: -errno if the file cannot be opened or
 *    read, -EIO if the file is empty, -EINVAL if the byte read is not one
 *    of '0', '1' or '2'
 */
static int
aslr_enabled(void)
{
	char c;
	int retval, saved_errno;
	int fd = open(RANDOMIZE_VA_SPACE_FILE, O_RDONLY);

	if (fd < 0)
		return -errno;

	retval = read(fd, &c, 1);
	/* close() may overwrite errno, so capture read()'s error code first */
	saved_errno = errno;
	close(fd);

	if (retval < 0)
		return -saved_errno;
	if (retval == 0)
		return -EIO;

	switch (c) {
	case '0':
		return 0;
	case '1':
		return 1;
	case '2':
		return 2;
	default:
		return -EINVAL;
	}
}
116
117 /*
118  * Increase limit for open files for current process
119  */
120 static int
121 increase_open_file_limit(void)
122 {
123         struct rlimit limit;
124
125         /* read current limits */
126         if (getrlimit(RLIMIT_NOFILE, &limit) != 0) {
127                 RTE_LOG(ERR, EAL, "Error reading resource limit: %s\n",
128                                 strerror(errno));
129                 return -1;
130         }
131
132         /* check if current soft limit matches the hard limit */
133         if (limit.rlim_cur < limit.rlim_max) {
134                 /* set soft limit to match hard limit */
135                 limit.rlim_cur = limit.rlim_max;
136         }
137         else {
138                 /* we can't increase the soft limit so now we try to increase
139                  * soft and hard limit. this might fail when run as non-root.
140                  */
141                 limit.rlim_cur *= 2;
142                 limit.rlim_max *= 2;
143         }
144
145         /* set current resource limit */
146         if (setrlimit(RLIMIT_NOFILE, &limit) != 0) {
147                 RTE_LOG(ERR, EAL, "Error increasing open files limit: %s\n",
148                                 strerror(errno));
149                 return -1;
150         }
151
152         return 0;
153 }
154
155 /*
156  * Try to mmap *size bytes in /dev/zero. If it is succesful, return the
157  * pointer to the mmap'd area and keep *size unmodified. Else, retry
158  * with a smaller zone: decrease *size by hugepage_sz until it reaches
159  * 0. In this case, return NULL. Note: this function returns an address
160  * which is a multiple of hugepage size.
161  */
162 static void *
163 get_virtual_area(uint64_t *size, uint64_t hugepage_sz)
164 {
165         void *addr;
166         int fd;
167         long aligned_addr;
168
169         RTE_LOG(INFO, EAL, "Ask a virtual area of 0x%"PRIx64" bytes\n", *size);
170
171         fd = open("/dev/zero", O_RDONLY);
172         if (fd < 0){
173                 RTE_LOG(ERR, EAL, "Cannot open /dev/zero\n");
174                 return NULL;
175         }
176         do {
177                 addr = mmap(NULL, (*size) + hugepage_sz, PROT_READ, MAP_PRIVATE, fd, 0);
178                 if (addr == MAP_FAILED)
179                         *size -= hugepage_sz;
180         } while (addr == MAP_FAILED && *size > 0);
181
182         if (addr == MAP_FAILED) {
183                 close(fd);
184                 RTE_LOG(INFO, EAL, "Cannot get a virtual area\n");
185                 return NULL;
186         }
187
188         munmap(addr, (*size) + hugepage_sz);
189         close(fd);
190
191         /* align addr to a huge page size boundary */
192         aligned_addr = (long)addr;
193         aligned_addr += (hugepage_sz - 1);
194         aligned_addr &= (~(hugepage_sz - 1));
195         addr = (void *)(aligned_addr);
196
197         RTE_LOG(INFO, EAL, "Virtual area found at %p (size = 0x%"PRIx64")\n",
198                 addr, *size);
199
200         return addr;
201 }
202
203 /*
204  * Mmap all hugepages of hugepage table: it first open a file in
205  * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
206  * virtual address is stored in hugepg_tbl[i].orig_va, else it is stored
207  * in hugepg_tbl[i].final_va. The second mapping (when orig is 0) tries to
208  * map continguous physical blocks in contiguous virtual blocks.
209  */
210 static int
211 map_all_hugepages(struct hugepage *hugepg_tbl,
212                 struct hugepage_info *hpi, int orig)
213 {
214         int fd;
215         unsigned i;
216         void *virtaddr;
217         void *vma_addr = NULL;
218         uint64_t vma_len = 0;
219
220         for (i = 0; i < hpi->num_pages[0]; i++) {
221                 uint64_t hugepage_sz = hpi->hugepage_sz;
222
223                 if (orig) {
224                         hugepg_tbl[i].file_id = i;
225                         hugepg_tbl[i].size = hugepage_sz;
226                         eal_get_hugefile_path(hugepg_tbl[i].filepath,
227                                         sizeof(hugepg_tbl[i].filepath), hpi->hugedir,
228                                         hugepg_tbl[i].file_id);
229                         hugepg_tbl[i].filepath[sizeof(hugepg_tbl[i].filepath) - 1] = '\0';
230                 }
231 #ifndef RTE_ARCH_X86_64
232                 /* for 32-bit systems, don't remap 1G pages, just reuse original
233                  * map address as final map address.
234                  */
235                 else if (hugepage_sz == RTE_PGSIZE_1G){
236                         hugepg_tbl[i].final_va = hugepg_tbl[i].orig_va;
237                         hugepg_tbl[i].orig_va = NULL;
238                         continue;
239                 }
240 #endif
241                 else if (vma_len == 0) {
242                         unsigned j, num_pages;
243
244                         /* reserve a virtual area for next contiguous
245                          * physical block: count the number of
246                          * contiguous physical pages. */
247                         for (j = i+1; j < hpi->num_pages[0] ; j++) {
248                                 if (hugepg_tbl[j].physaddr !=
249                                     hugepg_tbl[j-1].physaddr + hugepage_sz)
250                                         break;
251                         }
252                         num_pages = j - i;
253                         vma_len = num_pages * hugepage_sz;
254
255                         /* get the biggest virtual memory area up to
256                          * vma_len. If it fails, vma_addr is NULL, so
257                          * let the kernel provide the address. */
258                         vma_addr = get_virtual_area(&vma_len, hpi->hugepage_sz);
259                         if (vma_addr == NULL)
260                                 vma_len = hugepage_sz;
261                 }
262
263                 /* try to create hugepage file */
264                 fd = open(hugepg_tbl[i].filepath, O_CREAT | O_RDWR, 0755);
265                 if (fd < 0) {
266                         RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
267                                         strerror(errno));
268                         return -1;
269                 }
270
271                 virtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE,
272                                 MAP_SHARED, fd, 0);
273                 if (virtaddr == MAP_FAILED) {
274                         RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
275                                         strerror(errno));
276                         close(fd);
277                         return -1;
278                 }
279
280                 if (orig) {
281                         hugepg_tbl[i].orig_va = virtaddr;
282                         memset(virtaddr, 0, hugepage_sz);
283                 }
284                 else {
285                         hugepg_tbl[i].final_va = virtaddr;
286                 }
287
288                 /* close the file descriptor, files will be locked later */
289                 close(fd);
290
291                 vma_addr = (char *)vma_addr + hugepage_sz;
292                 vma_len -= hugepage_sz;
293         }
294         return 0;
295 }
296
297 /* Unmap all hugepages from original mapping. */
298 static int
299 unmap_all_hugepages_orig(struct hugepage *hugepg_tbl, struct hugepage_info *hpi)
300 {
301         unsigned i;
302         for (i = 0; i < hpi->num_pages[0]; i++) {
303                 if (hugepg_tbl[i].orig_va) {
304                         munmap(hugepg_tbl[i].orig_va, hpi->hugepage_sz);
305                         hugepg_tbl[i].orig_va = NULL;
306                 }
307         }
308         return 0;
309 }
310
311 /*
312  * For each hugepage in hugepg_tbl, fill the physaddr value. We find
313  * it by browsing the /proc/self/pagemap special file.
314  */
315 static int
316 find_physaddr(struct hugepage *hugepg_tbl, struct hugepage_info *hpi)
317 {
318         int fd;
319         unsigned i;
320         uint64_t page;
321         unsigned long virt_pfn;
322         int page_size;
323
324         /* standard page size */
325         page_size = getpagesize();
326
327         fd = open("/proc/self/pagemap", O_RDONLY);
328         if (fd < 0) {
329                 RTE_LOG(ERR, EAL, "%s(): cannot open /proc/self/pagemap: %s\n",
330                         __func__, strerror(errno));
331                 return -1;
332         }
333
334         for (i = 0; i < hpi->num_pages[0]; i++) {
335                 off_t offset;
336                 virt_pfn = (unsigned long)hugepg_tbl[i].orig_va /
337                         page_size;
338                 offset = sizeof(uint64_t) * virt_pfn;
339                 if (lseek(fd, offset, SEEK_SET) != offset){
340                         RTE_LOG(ERR, EAL, "%s(): seek error in /proc/self/pagemap: %s\n",
341                                         __func__, strerror(errno));
342                         close(fd);
343                         return -1;
344                 }
345                 if (read(fd, &page, sizeof(uint64_t)) < 0) {
346                         RTE_LOG(ERR, EAL, "%s(): cannot read /proc/self/pagemap: %s\n",
347                                         __func__, strerror(errno));
348                         close(fd);
349                         return -1;
350                 }
351
352                 /*
353                  * the pfn (page frame number) are bits 0-54 (see
354                  * pagemap.txt in linux Documentation)
355                  */
356                 hugepg_tbl[i].physaddr = ((page & 0x7fffffffffffffULL) * page_size);
357         }
358         close(fd);
359         return 0;
360 }
361
362 /*
363  * Parse /proc/self/numa_maps to get the NUMA socket ID for each huge
364  * page.
365  */
366 static int
367 find_numasocket(struct hugepage *hugepg_tbl, struct hugepage_info *hpi)
368 {
369         int socket_id;
370         char *end, *nodestr;
371         unsigned i, hp_count = 0;
372         uint64_t virt_addr;
373         char buf[BUFSIZ];
374         char hugedir_str[PATH_MAX];
375         FILE *f;
376
377         f = fopen("/proc/self/numa_maps", "r");
378         if (f == NULL) {
379                 RTE_LOG(INFO, EAL, "cannot open /proc/self/numa_maps,"
380                                 " consider that all memory is in socket_id 0\n");
381                 return 0;
382         }
383
384         rte_snprintf(hugedir_str, sizeof(hugedir_str),
385                         "%s/", hpi->hugedir);
386
387         /* parse numa map */
388         while (fgets(buf, sizeof(buf), f) != NULL) {
389
390                 /* ignore non huge page */
391                 if (strstr(buf, " huge ") == NULL &&
392                                 strstr(buf, hugedir_str) == NULL)
393                         continue;
394
395                 /* get zone addr */
396                 virt_addr = strtoull(buf, &end, 16);
397                 if (virt_addr == 0 || end == buf) {
398                         RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
399                         goto error;
400                 }
401
402                 /* get node id (socket id) */
403                 nodestr = strstr(buf, " N");
404                 if (nodestr == NULL) {
405                         RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
406                         goto error;
407                 }
408                 nodestr += 2;
409                 end = strstr(nodestr, "=");
410                 if (end == NULL) {
411                         RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
412                         goto error;
413                 }
414                 end[0] = '\0';
415                 end = NULL;
416
417                 socket_id = strtoul(nodestr, &end, 0);
418                 if ((nodestr[0] == '\0') || (end == NULL) || (*end != '\0')) {
419                         RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
420                         goto error;
421                 }
422
423                 /* if we find this page in our mappings, set socket_id */
424                 for (i = 0; i < hpi->num_pages[0]; i++) {
425                         void *va = (void *)(unsigned long)virt_addr;
426                         if (hugepg_tbl[i].orig_va == va) {
427                                 hugepg_tbl[i].socket_id = socket_id;
428                                 hp_count++;
429                         }
430                 }
431         }
432
433         if (hp_count < hpi->num_pages[0])
434                 goto error;
435
436         fclose(f);
437         return 0;
438
439 error:
440         fclose(f);
441         return -1;
442 }
443
444 /*
445  * Sort the hugepg_tbl by physical address (lower addresses first). We
446  * use a slow algorithm, but we won't have millions of pages, and this
447  * is only done at init time.
448  */
449 static int
450 sort_by_physaddr(struct hugepage *hugepg_tbl, struct hugepage_info *hpi)
451 {
452         unsigned i, j;
453         int smallest_idx;
454         uint64_t smallest_addr;
455         struct hugepage tmp;
456
457         for (i = 0; i < hpi->num_pages[0]; i++) {
458                 smallest_addr = 0;
459                 smallest_idx = -1;
460
461                 /*
462                  * browse all entries starting at 'i', and find the
463                  * entry with the smallest addr
464                  */
465                 for (j=i; j< hpi->num_pages[0]; j++) {
466
467                         if (smallest_addr == 0 ||
468                             hugepg_tbl[j].physaddr < smallest_addr) {
469                                 smallest_addr = hugepg_tbl[j].physaddr;
470                                 smallest_idx = j;
471                         }
472                 }
473
474                 /* should not happen */
475                 if (smallest_idx == -1) {
476                         RTE_LOG(ERR, EAL, "%s(): error in physaddr sorting\n", __func__);
477                         return -1;
478                 }
479
480                 /* swap the 2 entries in the table */
481                 memcpy(&tmp, &hugepg_tbl[smallest_idx], sizeof(struct hugepage));
482                 memcpy(&hugepg_tbl[smallest_idx], &hugepg_tbl[i],
483                                 sizeof(struct hugepage));
484                 memcpy(&hugepg_tbl[i], &tmp, sizeof(struct hugepage));
485         }
486         return 0;
487 }
488
/*
 * Uses mmap to create a shared memory area for storage of data.
 * Used in this file to store the hugepage file map on disk.
 *
 * The file is created if necessary (mode 0666), resized to mem_size bytes
 * and mapped read/write with MAP_SHARED. The file descriptor is closed
 * before returning; the mapping itself stays valid.
 *
 * filename: path of the backing file
 * mem_size: size of the area in bytes
 *
 * Returns a pointer to the mapped area, or NULL on any failure (open,
 * ftruncate or mmap), so callers only need a single NULL check.
 */
static void *
create_shared_memory(const char *filename, const size_t mem_size)
{
	void *retval;
	int fd = open(filename, O_CREAT | O_RDWR, 0666);
	if (fd < 0)
		return NULL;
	if (ftruncate(fd, mem_size) < 0) {
		close(fd);
		return NULL;
	}
	retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	close(fd);
	/* mmap() signals failure with MAP_FAILED, not NULL: translate it so
	 * every error path of this function reports the same value */
	if (retval == MAP_FAILED)
		return NULL;
	return retval;
}
508
509 /*
510  * this copies *active* hugepages from one hugepage table to another.
511  * destination is typically the shared memory.
512  */
513 static int
514 copy_hugepages_to_shared_mem(struct hugepage * dst, int dest_size,
515                 const struct hugepage * src, int src_size)
516 {
517         int src_pos, dst_pos = 0;
518
519         for (src_pos = 0; src_pos < src_size; src_pos++) {
520                 if (src[src_pos].final_va != NULL) {
521                         /* error on overflow attempt */
522                         if (dst_pos == dest_size)
523                                 return -1;
524                         memcpy(&dst[dst_pos], &src[src_pos], sizeof(struct hugepage));
525                         dst_pos++;
526                 }
527         }
528         return 0;
529 }
530
531 /*
532  * unmaps hugepages that are not going to be used. since we originally allocate
533  * ALL hugepages (not just those we need), additional unmapping needs to be done.
534  */
535 static int
536 unmap_unneeded_hugepages(struct hugepage *hugepg_tbl,
537                 struct hugepage_info *hpi,
538                 unsigned num_hp_info)
539 {
540         unsigned socket, size;
541         int page, nrpages = 0;
542         int fd;
543
544         /* get total number of hugepages */
545         for (size = 0; size < num_hp_info; size++)
546                 for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++)
547                         nrpages += internal_config.hugepage_info[size].num_pages[socket];
548
549         for (size = 0; size < num_hp_info; size++) {
550                 for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) {
551                         unsigned pages_found = 0;
552                         /* traverse until we have unmapped all the unused pages */
553                         for (page = 0; page < nrpages; page++) {
554                                 struct hugepage *hp = &hugepg_tbl[page];
555
556                                 /* find a page that matches the criteria */
557                                 if ((hp->size == hpi[size].hugepage_sz) &&
558                                                 (hp->socket_id == (int) socket)) {
559
560                                         /* if we skipped enough pages, unmap the rest */
561                                         if (pages_found == hpi[size].num_pages[socket]) {
562                                                 munmap(hp->final_va, hp->size);
563                                                 hp->final_va = NULL;
564                                         }
565                                         /* lock the page and skip */
566                                         else {
567                                                 /* try and open the hugepage file */
568                                                 while ((fd = open(hp->filepath, O_CREAT | O_RDWR, 0755)) < 0) {
569                                                         /* if we can't open due to resource limits */
570                                                         if (errno == EMFILE) {
571                                                                 RTE_LOG(INFO, EAL, "Increasing open file limit\n");
572
573                                                                 /* if we manage to increase resource limit, try again */
574                                                                 if (increase_open_file_limit() == 0)
575                                                                         continue;
576                                                         }
577                                                         else
578                                                                 RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
579                                                                                 strerror(errno));
580                                                         return -1;
581                                                 }
582                                                 /* try and lock the hugepage */
583                                                 if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
584                                                         RTE_LOG(ERR, EAL, "Locking hugepage file failed!\n");
585                                                         close(fd);
586                                                         return -1;
587                                                 }
588                                                 hp->page_lock = fd;
589                                                 pages_found++;
590                                         }
591                                 } /* match page */
592                         } /* foreach page */
593                 } /* foreach socket */
594         } /* foreach pagesize */
595
596         return 0;
597 }
598
599 static inline uint64_t
600 get_socket_mem_size(int socket)
601 {
602         uint64_t size = 0;
603         unsigned i;
604
605         for (i = 0; i < internal_config.num_hugepage_sizes; i++){
606                 struct hugepage_info *hpi = &internal_config.hugepage_info[i];
607                 if (hpi->hugedir != NULL)
608                         size += hpi->hugepage_sz * hpi->num_pages[socket];
609         }
610
611         return (size);
612 }
613
614 /*
615  * This function is a NUMA-aware equivalent of calc_num_pages.
616  * It takes in the list of hugepage sizes and the
617  * number of pages thereof, and calculates the best number of
618  * pages of each size to fulfill the request for <memory> ram
619  */
620 static int
621 calc_num_pages_per_socket(uint64_t * memory,
622                 struct hugepage_info *hp_info,
623                 struct hugepage_info *hp_used,
624                 unsigned num_hp_info)
625 {
626         unsigned socket, j, i = 0;
627         unsigned requested, available;
628         int total_num_pages = 0;
629         uint64_t remaining_mem, cur_mem;
630         uint64_t total_mem = internal_config.memory;
631
632         if (num_hp_info == 0)
633                 return -1;
634
635         for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) {
636                 /* if specific memory amounts per socket weren't requested */
637                 if (internal_config.force_sockets == 0) {
638                         /* take whatever is available */
639                         memory[socket] = RTE_MIN(get_socket_mem_size(socket),
640                                         total_mem);
641                 }
642                 /* skips if the memory on specific socket wasn't requested */
643                 for (i = 0; i < num_hp_info && memory[socket] != 0; i++){
644                         hp_used[i].hugedir = hp_info[i].hugedir;
645                         hp_used[i].num_pages[socket] = RTE_MIN(
646                                         memory[socket] / hp_info[i].hugepage_sz,
647                                         hp_info[i].num_pages[socket]);
648
649                         cur_mem = hp_used[i].num_pages[socket] *
650                                         hp_used[i].hugepage_sz;
651
652                         memory[socket] -= cur_mem;
653                         total_mem -= cur_mem;
654
655                         total_num_pages += hp_used[i].num_pages[socket];
656
657                         /* check if we have met all memory requests */
658                         if (memory[socket] == 0)
659                                 break;
660
661                         /* check if we have any more pages left at this size, if so
662                          * move on to next size */
663                         if (hp_used[i].num_pages[socket] == hp_info[i].num_pages[socket])
664                                 continue;
665                         /* At this point we know that there are more pages available that are
666                          * bigger than the memory we want, so lets see if we can get enough
667                          * from other page sizes.
668                          */
669                         remaining_mem = 0;
670                         for (j = i+1; j < num_hp_info; j++)
671                                 remaining_mem += hp_info[j].hugepage_sz *
672                                 hp_info[j].num_pages[socket];
673
674                         /* is there enough other memory, if not allocate another page and quit */
675                         if (remaining_mem < memory[socket]){
676                                 cur_mem = RTE_MIN(memory[socket],
677                                                 hp_info[i].hugepage_sz);
678                                 memory[socket] -= cur_mem;
679                                 total_mem -= cur_mem;
680                                 hp_used[i].num_pages[socket]++;
681                                 total_num_pages++;
682                                 break; /* we are done with this socket*/
683                         }
684                 }
685                 /* if we didn't satisfy all memory requirements per socket */
686                 if (memory[socket] > 0) {
687                         /* to prevent icc errors */
688                         requested = (unsigned) (internal_config.socket_mem[socket] /
689                                         0x100000);
690                         available = requested -
691                                         ((unsigned) (memory[socket] / 0x100000));
692                         RTE_LOG(INFO, EAL, "Not enough memory available on socket %u! "
693                                         "Requested: %uMB, available: %uMB\n", socket,
694                                         requested, available);
695                         return -1;
696                 }
697         }
698
699         /* if we didn't satisfy total memory requirements */
700         if (total_mem > 0) {
701                 requested = (unsigned) (internal_config.memory / 0x100000);
702                 available = requested - (unsigned) (total_mem / 0x100000);
703                 RTE_LOG(INFO, EAL, "Not enough memory available! Requested: %uMB,"
704                                 " available: %uMB\n", requested, available);
705                 return -1;
706         }
707         return total_num_pages;
708 }
709
710 /*
711  * Prepare physical memory mapping: fill configuration structure with
712  * these infos, return 0 on success.
713  *  1. map N huge pages in separate files in hugetlbfs
714  *  2. find associated physical addr
715  *  3. find associated NUMA socket ID
716  *  4. sort all huge pages by physical address
717  *  5. remap these N huge pages in the correct order
718  *  6. unmap the first mapping
719  *  7. fill memsegs in configuration with contiguous zones
720  */
721 static int
722 rte_eal_hugepage_init(void)
723 {
724         struct rte_mem_config *mcfg;
725         struct hugepage *hugepage, *tmp_hp = NULL;
726         struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
727
728         uint64_t memory[RTE_MAX_NUMA_NODES];
729
730         unsigned hp_offset;
731         int i, j, new_memseg;
732         int nrpages, total_pages = 0;
733         void *addr;
734
735         memset(used_hp, 0, sizeof(used_hp));
736
737         /* get pointer to global configuration */
738         mcfg = rte_eal_get_configuration()->mem_config;
739
740         /* for debug purposes, hugetlbfs can be disabled */
741         if (internal_config.no_hugetlbfs) {
742                 addr = malloc(internal_config.memory);
743                 mcfg->memseg[0].phys_addr = (phys_addr_t)(uintptr_t)addr;
744                 mcfg->memseg[0].addr = addr;
745                 mcfg->memseg[0].len = internal_config.memory;
746                 mcfg->memseg[0].socket_id = 0;
747                 return 0;
748         }
749
750
751         /* calculate total number of hugepages available. at this point we haven't
752          * yet started sorting them so they all are on socket 0 */
753         for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
754                 /* meanwhile, also initialize used_hp hugepage sizes in used_hp */
755                 used_hp[i].hugepage_sz = internal_config.hugepage_info[i].hugepage_sz;
756
757                 total_pages += internal_config.hugepage_info[i].num_pages[0];
758         }
759
760         /*
761          * allocate a memory area for hugepage table.
762          * this isn't shared memory yet. due to the fact that we need some
763          * processing done on these pages, shared memory will be created
764          * at a later stage.
765          */
766         tmp_hp = malloc(total_pages * sizeof(struct hugepage));
767         if (tmp_hp == NULL)
768                 goto fail;
769
770         memset(tmp_hp, 0, total_pages * sizeof(struct hugepage));
771
772         hp_offset = 0; /* where we start the current page size entries */
773
774         /* map all hugepages and sort them */
775         for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
776                 struct hugepage_info *hpi;
777
778                 /*
779                  * we don't yet mark hugepages as used at this stage, so
780                  * we just map all hugepages available to the system
781                  * all hugepages are still located on socket 0
782                  */
783                 hpi = &internal_config.hugepage_info[i];
784
785                 if (hpi->num_pages == 0)
786                         continue;
787
788                 /* map all hugepages available */
789                 if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
790                         RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
791                                         (unsigned)(hpi->hugepage_sz / 0x100000));
792                         goto fail;
793                 }
794
795                 /* find physical addresses and sockets for each hugepage */
796                 if (find_physaddr(&tmp_hp[hp_offset], hpi) < 0){
797                         RTE_LOG(DEBUG, EAL, "Failed to find phys addr for %u MB pages\n",
798                                         (unsigned)(hpi->hugepage_sz / 0x100000));
799                         goto fail;
800                 }
801
802                 if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0){
803                         RTE_LOG(DEBUG, EAL, "Failed to find NUMA socket for %u MB pages\n",
804                                         (unsigned)(hpi->hugepage_sz / 0x100000));
805                         goto fail;
806                 }
807
808                 if (sort_by_physaddr(&tmp_hp[hp_offset], hpi) < 0)
809                         goto fail;
810
811                 /* remap all hugepages */
812                 if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) < 0){
813                         RTE_LOG(DEBUG, EAL, "Failed to remap %u MB pages\n",
814                                         (unsigned)(hpi->hugepage_sz / 0x100000));
815                         goto fail;
816                 }
817
818                 /* unmap original mappings */
819                 if (unmap_all_hugepages_orig(&tmp_hp[hp_offset], hpi) < 0)
820                         goto fail;
821
822                 /* we have processed a num of hugepages of this size, so inc offset */
823                 hp_offset += hpi->num_pages[0];
824         }
825
826         /* clean out the numbers of pages */
827         for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++)
828                 for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
829                         internal_config.hugepage_info[i].num_pages[j] = 0;
830
831         /* get hugepages for each socket */
832         for (i = 0; i < total_pages; i++) {
833                 int socket = tmp_hp[i].socket_id;
834
835                 /* find a hugepage info with right size and increment num_pages */
836                 for (j = 0; j < (int) internal_config.num_hugepage_sizes; j++) {
837                         if (tmp_hp[i].size ==
838                                         internal_config.hugepage_info[j].hugepage_sz) {
839                                 internal_config.hugepage_info[j].num_pages[socket]++;
840                         }
841                 }
842         }
843
844         /* make a copy of socket_mem, needed for number of pages calculation */
845         for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
846                 memory[i] = internal_config.socket_mem[i];
847
848         /* calculate final number of pages */
849         nrpages = calc_num_pages_per_socket(memory,
850                         internal_config.hugepage_info, used_hp,
851                         internal_config.num_hugepage_sizes);
852
853         /* error if not enough memory available */
854         if (nrpages < 0)
855                 goto fail;
856
857         /* reporting in! */
858         for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
859                 for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
860                         if (used_hp[i].num_pages[j] > 0) {
861                                 RTE_LOG(INFO, EAL,
862                                                 "Requesting %u pages of size %uMB"
863                                                 " from socket %i\n",
864                                                 used_hp[i].num_pages[j],
865                                                 (unsigned)
866                                                         (used_hp[i].hugepage_sz / 0x100000),
867                                                 j);
868                         }
869                 }
870         }
871
872         /* create shared memory */
873         hugepage = create_shared_memory(eal_hugepage_info_path(),
874                                         nrpages * sizeof(struct hugepage));
875
876         if (hugepage == NULL) {
877                 RTE_LOG(ERR, EAL, "Failed to create shared memory!\n");
878                 goto fail;
879         }
880
881         /*
882          * unmap pages that we won't need (looks at used_hp).
883          * also, sets final_va to NULL on pages that were unmapped.
884          */
885         if (unmap_unneeded_hugepages(tmp_hp, used_hp,
886                         internal_config.num_hugepage_sizes) < 0) {
887                 RTE_LOG(ERR, EAL, "Unmapping and locking hugepages failed!\n");
888                 goto fail;
889         }
890
891         /*
892          * copy stuff from malloc'd hugepage* to the actual shared memory.
893          * this procedure only copies those hugepages that have final_va
894          * not NULL. has overflow protection.
895          */
896         if (copy_hugepages_to_shared_mem(hugepage, nrpages,
897                         tmp_hp, total_pages) < 0) {
898                 RTE_LOG(ERR, EAL, "Copying tables to shared memory failed!\n");
899                 goto fail;
900         }
901
902         /* free the temporary hugepage table */
903         free(tmp_hp);
904         tmp_hp = NULL;
905
906         memset(mcfg->memseg, 0, sizeof(mcfg->memseg));
907         j = -1;
908         for (i = 0; i < nrpages; i++) {
909                 new_memseg = 0;
910
911                 /* if this is a new section, create a new memseg */
912                 if (i == 0)
913                         new_memseg = 1;
914                 else if (hugepage[i].socket_id != hugepage[i-1].socket_id)
915                         new_memseg = 1;
916                 else if (hugepage[i].size != hugepage[i-1].size)
917                         new_memseg = 1;
918                 else if ((hugepage[i].physaddr - hugepage[i-1].physaddr) !=
919                     hugepage[i].size)
920                         new_memseg = 1;
921                 else if (((unsigned long)hugepage[i].final_va -
922                     (unsigned long)hugepage[i-1].final_va) != hugepage[i].size)
923                         new_memseg = 1;
924
925                 if (new_memseg) {
926                         j += 1;
927                         if (j == RTE_MAX_MEMSEG)
928                                 break;
929
930                         mcfg->memseg[j].phys_addr = hugepage[i].physaddr;
931                         mcfg->memseg[j].addr = hugepage[i].final_va;
932                         mcfg->memseg[j].len = hugepage[i].size;
933                         mcfg->memseg[j].socket_id = hugepage[i].socket_id;
934                         mcfg->memseg[j].hugepage_sz = hugepage[i].size;
935                 }
936                 /* continuation of previous memseg */
937                 else {
938                         mcfg->memseg[j].len += mcfg->memseg[j].hugepage_sz;
939                 }
940                 hugepage[i].memseg_id = j;
941         }
942
943         return 0;
944
945
946 fail:
947         if (tmp_hp)
948                 free(tmp_hp);
949         return -1;
950 }
951
/*
 * Return the size in bytes of the file open on fd, as reported by fstat().
 * A failing fstat() is reported as a size of 0.
 */
static off_t
getFileSize(int fd)
{
        struct stat info;

        return (fstat(fd, &info) >= 0) ? info.st_size : 0;
}
963
964 /*
965  * This creates the memory mappings in the secondary process to match that of
966  * the server process. It goes through each memory segment in the DPDK runtime
967  * configuration and finds the hugepages which form that segment, mapping them
968  * in order to form a contiguous block in the virtual memory space
969  */
970 static int
971 rte_eal_hugepage_attach(void)
972 {
973         const struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
974         const struct hugepage *hp = NULL;
975         unsigned num_hp = 0;
976         unsigned i, s = 0; /* s used to track the segment number */
977         off_t size;
978         int fd, fd_zero = -1, fd_hugepage = -1;
979
980         if (aslr_enabled() > 0) {
981                 RTE_LOG(WARNING, EAL, "WARNING: Address Space Layout Randomization "
982                                 "(ASLR) is enabled in the kernel.\n");
983                 RTE_LOG(WARNING, EAL, "   This may cause issues with mapping memory "
984                                 "into secondary processes\n");
985         }
986
987         fd_zero = open("/dev/zero", O_RDONLY);
988         if (fd_zero < 0) {
989                 RTE_LOG(ERR, EAL, "Could not open /dev/zero\n");
990                 goto error;
991         }
992         fd_hugepage = open(eal_hugepage_info_path(), O_RDONLY);
993         if (fd_hugepage < 0) {
994                 RTE_LOG(ERR, EAL, "Could not open %s\n", eal_hugepage_info_path());
995                 goto error;
996         }
997
998         size = getFileSize(fd_hugepage);
999         hp = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd_hugepage, 0);
1000         if (hp == NULL) {
1001                 RTE_LOG(ERR, EAL, "Could not mmap %s\n", eal_hugepage_info_path());
1002                 goto error;
1003         }
1004
1005         num_hp = size / sizeof(struct hugepage);
1006         RTE_LOG(DEBUG, EAL, "Analysing %u hugepages\n", num_hp);
1007
1008         while (s < RTE_MAX_MEMSEG && mcfg->memseg[s].len > 0){
1009                 void *addr, *base_addr;
1010                 uintptr_t offset = 0;
1011
1012                 /* fdzero is mmapped to get a contiguous block of virtual addresses
1013                  * get a block of free memory of the appropriate size -
1014                  * use mmap to attempt to get an identical address as server.
1015                  */
1016                 base_addr = mmap(mcfg->memseg[s].addr, mcfg->memseg[s].len,
1017                                 PROT_READ, MAP_PRIVATE, fd_zero, 0);
1018                 if (base_addr == MAP_FAILED || base_addr != mcfg->memseg[s].addr) {
1019                         RTE_LOG(ERR, EAL, "Could not mmap %llu bytes "
1020                                 "in /dev/zero to requested address [%p]\n",
1021                                 (unsigned long long)mcfg->memseg[s].len,
1022                                 mcfg->memseg[s].addr);
1023                         if (aslr_enabled() > 0)
1024                                 RTE_LOG(ERR, EAL, "It is recommended to disable ASLR in the kernel "
1025                                                 "and retry running both primary and secondary processes\n");
1026                         goto error;
1027                 }
1028                 /* free memory so we can map the hugepages into the space */
1029                 munmap(base_addr, mcfg->memseg[s].len);
1030
1031                 /* find the hugepages for this segment and map them
1032                  * we don't need to worry about order, as the server sorted the
1033                  * entries before it did the second mmap of them */
1034                 for (i = 0; i < num_hp && offset < mcfg->memseg[s].len; i++){
1035                         if (hp[i].memseg_id == (int)s){
1036                                 fd = open(hp[i].filepath, O_RDWR);
1037                                 if (fd < 0) {
1038                                         RTE_LOG(ERR, EAL, "Could not open %s\n",
1039                                                 hp[i].filepath);
1040                                         goto error;
1041                                 }
1042                                 addr = mmap(RTE_PTR_ADD(base_addr, offset),
1043                                                 hp[i].size, PROT_READ | PROT_WRITE,
1044                                                 MAP_SHARED | MAP_FIXED, fd, 0);
1045                                 close(fd); /* close file both on success and on failure */
1046                                 if (addr == MAP_FAILED) {
1047                                         RTE_LOG(ERR, EAL, "Could not mmap %s\n",
1048                                                 hp[i].filepath);
1049                                         goto error;
1050                                 }
1051                                 offset+=hp[i].size;
1052                         }
1053                 }
1054                 RTE_LOG(DEBUG, EAL, "Mapped segment %u of size 0x%llx\n", s,
1055                                 (unsigned long long)mcfg->memseg[s].len);
1056                 s++;
1057         }
1058         close(fd_zero);
1059         close(fd_hugepage);
1060         return 0;
1061
1062 error:
1063         if (fd_zero >= 0)
1064                 close(fd_zero);
1065         if (fd_hugepage >= 0)
1066                 close(fd_hugepage);
1067         return -1;
1068 }
1069
1070 static int
1071 rte_eal_memdevice_init(void)
1072 {
1073         struct rte_config *config;
1074
1075         if (rte_eal_process_type() == RTE_PROC_SECONDARY)
1076                 return 0;
1077
1078         config = rte_eal_get_configuration();
1079         config->mem_config->nchannel = internal_config.force_nchannel;
1080         config->mem_config->nrank = internal_config.force_nrank;
1081
1082         return 0;
1083 }
1084
1085
1086 /* init memory subsystem */
1087 int
1088 rte_eal_memory_init(void)
1089 {
1090         RTE_LOG(INFO, EAL, "Setting up hugepage memory...\n");
1091         const int retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
1092                         rte_eal_hugepage_init() :
1093                         rte_eal_hugepage_attach();
1094         if (retval < 0)
1095                 return -1;
1096
1097         if (internal_config.no_shconf == 0 && rte_eal_memdevice_init() < 0)
1098                 return -1;
1099
1100         return 0;
1101 }