7438b8f18f1e582c1a0b66337b0b2aaf09568bfa
[dpdk.git] / lib / librte_eal / linuxapp / eal / eal_memory.c
1 /*-
2  *   BSD LICENSE
3  * 
4  *   Copyright(c) 2010-2013 Intel Corporation. All rights reserved.
5  *   All rights reserved.
6  * 
7  *   Redistribution and use in source and binary forms, with or without 
8  *   modification, are permitted provided that the following conditions 
9  *   are met:
10  * 
11  *     * Redistributions of source code must retain the above copyright 
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright 
14  *       notice, this list of conditions and the following disclaimer in 
15  *       the documentation and/or other materials provided with the 
16  *       distribution.
17  *     * Neither the name of Intel Corporation nor the names of its 
18  *       contributors may be used to endorse or promote products derived 
19  *       from this software without specific prior written permission.
20  * 
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  * 
33  */
34 /*   BSD LICENSE
35  *
36  *   Copyright(c) 2013 6WIND.
37  *
38  *   Redistribution and use in source and binary forms, with or without
39  *   modification, are permitted provided that the following conditions
40  *   are met:
41  *
42  *     * Redistributions of source code must retain the above copyright
43  *       notice, this list of conditions and the following disclaimer.
44  *     * Redistributions in binary form must reproduce the above copyright
45  *       notice, this list of conditions and the following disclaimer in
46  *       the documentation and/or other materials provided with the
47  *       distribution.
48  *     * Neither the name of 6WIND S.A. nor the names of its
49  *       contributors may be used to endorse or promote products derived
50  *       from this software without specific prior written permission.
51  *
52  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
53  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
54  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
55  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
56  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
57  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
58  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
59  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
60  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
61  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
62  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
63  */
64
65 #include <errno.h>
66 #include <stdarg.h>
67 #include <stdlib.h>
68 #include <stdio.h>
69 #include <stdint.h>
70 #include <inttypes.h>
71 #include <string.h>
72 #include <stdarg.h>
73 #include <sys/mman.h>
74 #include <sys/types.h>
75 #include <sys/stat.h>
76 #include <sys/queue.h>
77 #include <sys/file.h>
78 #include <unistd.h>
79 #include <limits.h>
80 #include <errno.h>
81 #include <sys/ioctl.h>
82 #include <sys/time.h>
83
84 #include <rte_log.h>
85 #include <rte_memory.h>
86 #include <rte_memzone.h>
87 #include <rte_launch.h>
88 #include <rte_tailq.h>
89 #include <rte_eal.h>
90 #include <rte_eal_memconfig.h>
91 #include <rte_per_lcore.h>
92 #include <rte_lcore.h>
93 #include <rte_common.h>
94 #include <rte_string_fns.h>
95
96 #include "eal_private.h"
97 #include "eal_internal_cfg.h"
98 #include "eal_filesystem.h"
99 #include "eal_hugepages.h"
100
101 /**
102  * @file
103  * Huge page mapping under linux
104  *
105  * To reserve a big contiguous amount of memory, we use the hugepage
106  * feature of linux. For that, we need to have hugetlbfs mounted. This
107  * code will create many files in this directory (one per page) and
108  * map them in virtual memory. For each page, we will retrieve its
109  * physical address and remap it in order to have a virtual contiguous
110  * zone as well as a physical contiguous zone.
111  */
112
113
#define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space"

/*
 * Check whether address-space layout randomization is enabled in
 * the kernel, by reading the first character of
 * RANDOMIZE_VA_SPACE_FILE. This is important for multi-process as it
 * can prevent two processes mapping data to the same virtual address.
 * Returns:
 *    0 - address space randomization disabled
 *    1/2 - address space randomization enabled
 *    negative errno value on error (-EIO if the file is empty,
 *    -EINVAL if its first character is not '0', '1' or '2')
 */
static int
aslr_enabled(void)
{
        char c;
        ssize_t nread;
        int saved_errno;
        int fd;

        fd = open(RANDOMIZE_VA_SPACE_FILE, O_RDONLY);
        if (fd < 0)
                return -errno;

        nread = read(fd, &c, 1);
        /* remember the read() error now: close() is allowed to change errno */
        saved_errno = errno;
        close(fd);

        if (nread < 0)
                return -saved_errno;
        if (nread == 0)
                return -EIO;

        switch (c) {
        case '0':
                return 0;
        case '1':
                return 1;
        case '2':
                return 2;
        default:
                return -EINVAL;
        }
}
145
146 /*
147  * Try to mmap *size bytes in /dev/zero. If it is succesful, return the
148  * pointer to the mmap'd area and keep *size unmodified. Else, retry
149  * with a smaller zone: decrease *size by hugepage_sz until it reaches
150  * 0. In this case, return NULL. Note: this function returns an address
151  * which is a multiple of hugepage size.
152  */
153 static void *
154 get_virtual_area(size_t *size, size_t hugepage_sz)
155 {
156         void *addr;
157         int fd;
158         long aligned_addr;
159
160         RTE_LOG(INFO, EAL, "Ask a virtual area of 0x%zu bytes\n", *size);
161
162         fd = open("/dev/zero", O_RDONLY);
163         if (fd < 0){
164                 RTE_LOG(ERR, EAL, "Cannot open /dev/zero\n");
165                 return NULL;
166         }
167         do {
168                 addr = mmap(NULL, (*size) + hugepage_sz, PROT_READ, MAP_PRIVATE, fd, 0);
169                 if (addr == MAP_FAILED)
170                         *size -= hugepage_sz;
171         } while (addr == MAP_FAILED && *size > 0);
172
173         if (addr == MAP_FAILED) {
174                 close(fd);
175                 RTE_LOG(INFO, EAL, "Cannot get a virtual area\n");
176                 return NULL;
177         }
178
179         munmap(addr, (*size) + hugepage_sz);
180         close(fd);
181
182         /* align addr to a huge page size boundary */
183         aligned_addr = (long)addr;
184         aligned_addr += (hugepage_sz - 1);
185         aligned_addr &= (~(hugepage_sz - 1));
186         addr = (void *)(aligned_addr);
187
188         RTE_LOG(INFO, EAL, "Virtual area found at %p (size = 0x%zx)\n",
189                 addr, *size);
190
191         return addr;
192 }
193
194 /*
195  * Mmap all hugepages of hugepage table: it first open a file in
196  * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
197  * virtual address is stored in hugepg_tbl[i].orig_va, else it is stored
198  * in hugepg_tbl[i].final_va. The second mapping (when orig is 0) tries to
199  * map continguous physical blocks in contiguous virtual blocks.
200  */
201 static int
202 map_all_hugepages(struct hugepage *hugepg_tbl,
203                 struct hugepage_info *hpi, int orig)
204 {
205         int fd;
206         unsigned i;
207         void *virtaddr;
208         void *vma_addr = NULL;
209         size_t vma_len = 0;
210
211         for (i = 0; i < hpi->num_pages[0]; i++) {
212                 size_t hugepage_sz = hpi->hugepage_sz;
213
214                 if (orig) {
215                         hugepg_tbl[i].file_id = i;
216                         hugepg_tbl[i].size = hugepage_sz;
217                         eal_get_hugefile_path(hugepg_tbl[i].filepath,
218                                         sizeof(hugepg_tbl[i].filepath), hpi->hugedir,
219                                         hugepg_tbl[i].file_id);
220                         hugepg_tbl[i].filepath[sizeof(hugepg_tbl[i].filepath) - 1] = '\0';
221                 }
222 #ifndef RTE_ARCH_X86_64
223                 /* for 32-bit systems, don't remap 1G pages, just reuse original
224                  * map address as final map address.
225                  */
226                 else if (hugepage_sz == RTE_PGSIZE_1G){
227                         hugepg_tbl[i].final_va = hugepg_tbl[i].orig_va;
228                         hugepg_tbl[i].orig_va = NULL;
229                         continue;
230                 }
231 #endif
232                 else if (vma_len == 0) {
233                         unsigned j, num_pages;
234
235                         /* reserve a virtual area for next contiguous
236                          * physical block: count the number of
237                          * contiguous physical pages. */
238                         for (j = i+1; j < hpi->num_pages[0] ; j++) {
239                                 if (hugepg_tbl[j].physaddr !=
240                                     hugepg_tbl[j-1].physaddr + hugepage_sz)
241                                         break;
242                         }
243                         num_pages = j - i;
244                         vma_len = num_pages * hugepage_sz;
245
246                         /* get the biggest virtual memory area up to
247                          * vma_len. If it fails, vma_addr is NULL, so
248                          * let the kernel provide the address. */
249                         vma_addr = get_virtual_area(&vma_len, hpi->hugepage_sz);
250                         if (vma_addr == NULL)
251                                 vma_len = hugepage_sz;
252                 }
253
254                 /* try to create hugepage file */
255                 fd = open(hugepg_tbl[i].filepath, O_CREAT | O_RDWR, 0755);
256                 if (fd < 0) {
257                         RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
258                                         strerror(errno));
259                         return -1;
260                 }
261
262                 virtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE,
263                                 MAP_SHARED, fd, 0);
264                 if (virtaddr == MAP_FAILED) {
265                         RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
266                                         strerror(errno));
267                         close(fd);
268                         return -1;
269                 }
270
271                 if (orig) {
272                         hugepg_tbl[i].orig_va = virtaddr;
273                         memset(virtaddr, 0, hugepage_sz);
274                 }
275                 else {
276                         hugepg_tbl[i].final_va = virtaddr;
277                 }
278
279                 /* set shared flock on the file. */
280                 if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
281                         RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
282                                 __func__, strerror(errno));
283                         close(fd);
284                         return -1;
285                 }
286
287                 close(fd);
288
289                 vma_addr = (char *)vma_addr + hugepage_sz;
290                 vma_len -= hugepage_sz;
291         }
292         return 0;
293 }
294
295 /* Unmap all hugepages from original mapping. */
296 static int
297 unmap_all_hugepages_orig(struct hugepage *hugepg_tbl, struct hugepage_info *hpi)
298 {
299         unsigned i;
300         for (i = 0; i < hpi->num_pages[0]; i++) {
301                 if (hugepg_tbl[i].orig_va) {
302                         munmap(hugepg_tbl[i].orig_va, hpi->hugepage_sz);
303                         hugepg_tbl[i].orig_va = NULL;
304                 }
305         }
306         return 0;
307 }
308
309 /*
310  * For each hugepage in hugepg_tbl, fill the physaddr value. We find
311  * it by browsing the /proc/self/pagemap special file.
312  */
313 static int
314 find_physaddr(struct hugepage *hugepg_tbl, struct hugepage_info *hpi)
315 {
316         int fd;
317         unsigned i;
318         uint64_t page;
319         unsigned long virt_pfn;
320         int page_size;
321
322         /* standard page size */
323         page_size = getpagesize();
324
325         fd = open("/proc/self/pagemap", O_RDONLY);
326         if (fd < 0) {
327                 RTE_LOG(ERR, EAL, "%s(): cannot open /proc/self/pagemap: %s\n",
328                         __func__, strerror(errno));
329                 return -1;
330         }
331
332         for (i = 0; i < hpi->num_pages[0]; i++) {
333                 off_t offset;
334                 virt_pfn = (unsigned long)hugepg_tbl[i].orig_va /
335                         page_size;
336                 offset = sizeof(uint64_t) * virt_pfn;
337                 if (lseek(fd, offset, SEEK_SET) == (off_t) -1) {
338                         RTE_LOG(ERR, EAL, "%s(): seek error in /proc/self/pagemap: %s\n",
339                                         __func__, strerror(errno));
340                         close(fd);
341                         return -1;
342                 }
343                 if (read(fd, &page, sizeof(uint64_t)) < 0) {
344                         RTE_LOG(ERR, EAL, "%s(): cannot read /proc/self/pagemap: %s\n",
345                                         __func__, strerror(errno));
346                         close(fd);
347                         return -1;
348                 }
349
350                 /*
351                  * the pfn (page frame number) are bits 0-54 (see
352                  * pagemap.txt in linux Documentation)
353                  */
354                 hugepg_tbl[i].physaddr = ((page & 0x7fffffffffffffULL) * page_size);
355         }
356         close(fd);
357         return 0;
358 }
359
360 /*
361  * Parse /proc/self/numa_maps to get the NUMA socket ID for each huge
362  * page.
363  */
364 static int
365 find_numasocket(struct hugepage *hugepg_tbl, struct hugepage_info *hpi)
366 {
367         int socket_id;
368         char *end, *nodestr;
369         unsigned i, hp_count = 0;
370         uint64_t virt_addr;
371         char buf[BUFSIZ];
372         char hugedir_str[PATH_MAX];
373         FILE *f;
374
375         f = fopen("/proc/self/numa_maps", "r");
376         if (f == NULL) {
377                 RTE_LOG(INFO, EAL, "cannot open /proc/self/numa_maps,"
378                                 " consider that all memory is in socket_id 0\n");
379                 return 0;
380         }
381
382         rte_snprintf(hugedir_str, sizeof(hugedir_str),
383                         "%s/", hpi->hugedir);
384
385         /* parse numa map */
386         while (fgets(buf, sizeof(buf), f) != NULL) {
387
388                 /* ignore non huge page */
389                 if (strstr(buf, " huge ") == NULL &&
390                                 strstr(buf, hugedir_str) == NULL)
391                         continue;
392
393                 /* get zone addr */
394                 virt_addr = strtoull(buf, &end, 16);
395                 if (virt_addr == 0 || end == buf) {
396                         RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
397                         goto error;
398                 }
399
400                 /* get node id (socket id) */
401                 nodestr = strstr(buf, " N");
402                 if (nodestr == NULL) {
403                         RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
404                         goto error;
405                 }
406                 nodestr += 2;
407                 end = strstr(nodestr, "=");
408                 if (end == NULL) {
409                         RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
410                         goto error;
411                 }
412                 end[0] = '\0';
413                 end = NULL;
414
415                 socket_id = strtoul(nodestr, &end, 0);
416                 if ((nodestr[0] == '\0') || (end == NULL) || (*end != '\0')) {
417                         RTE_LOG(ERR, EAL, "%s(): error in numa_maps parsing\n", __func__);
418                         goto error;
419                 }
420
421                 /* if we find this page in our mappings, set socket_id */
422                 for (i = 0; i < hpi->num_pages[0]; i++) {
423                         void *va = (void *)(unsigned long)virt_addr;
424                         if (hugepg_tbl[i].orig_va == va) {
425                                 hugepg_tbl[i].socket_id = socket_id;
426                                 hp_count++;
427                         }
428                 }
429         }
430
431         if (hp_count < hpi->num_pages[0])
432                 goto error;
433
434         fclose(f);
435         return 0;
436
437 error:
438         fclose(f);
439         return -1;
440 }
441
442 /*
443  * Sort the hugepg_tbl by physical address (lower addresses first). We
444  * use a slow algorithm, but we won't have millions of pages, and this
445  * is only done at init time.
446  */
447 static int
448 sort_by_physaddr(struct hugepage *hugepg_tbl, struct hugepage_info *hpi)
449 {
450         unsigned i, j;
451         int smallest_idx;
452         uint64_t smallest_addr;
453         struct hugepage tmp;
454
455         for (i = 0; i < hpi->num_pages[0]; i++) {
456                 smallest_addr = 0;
457                 smallest_idx = -1;
458
459                 /*
460                  * browse all entries starting at 'i', and find the
461                  * entry with the smallest addr
462                  */
463                 for (j=i; j< hpi->num_pages[0]; j++) {
464
465                         if (smallest_addr == 0 ||
466                             hugepg_tbl[j].physaddr < smallest_addr) {
467                                 smallest_addr = hugepg_tbl[j].physaddr;
468                                 smallest_idx = j;
469                         }
470                 }
471
472                 /* should not happen */
473                 if (smallest_idx == -1) {
474                         RTE_LOG(ERR, EAL, "%s(): error in physaddr sorting\n", __func__);
475                         return -1;
476                 }
477
478                 /* swap the 2 entries in the table */
479                 memcpy(&tmp, &hugepg_tbl[smallest_idx], sizeof(struct hugepage));
480                 memcpy(&hugepg_tbl[smallest_idx], &hugepg_tbl[i],
481                                 sizeof(struct hugepage));
482                 memcpy(&hugepg_tbl[i], &tmp, sizeof(struct hugepage));
483         }
484         return 0;
485 }
486
/*
 * Uses mmap to create a shared memory area for storage of data.
 * Used in this file to store the hugepage file map on disk.
 *
 * The file is opened (created with mode 0666 if missing), resized to
 * mem_size bytes and mapped read/write with MAP_SHARED. The descriptor
 * is closed before returning; the mapping stays valid.
 *
 * Returns the mapped address, or NULL if the file cannot be opened,
 * resized or mapped.
 */
static void *
create_shared_memory(const char *filename, const size_t mem_size)
{
        void *retval;
        int fd = open(filename, O_CREAT | O_RDWR, 0666);
        if (fd < 0)
                return NULL;
        if (ftruncate(fd, mem_size) < 0) {
                close(fd);
                return NULL;
        }
        retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        close(fd);
        /* mmap() reports failure with MAP_FAILED, not NULL: give callers
         * a single failure value to test for */
        if (retval == MAP_FAILED)
                return NULL;
        return retval;
}
506
507 /*
508  * this copies *active* hugepages from one hugepage table to another.
509  * destination is typically the shared memory.
510  */
511 static int
512 copy_hugepages_to_shared_mem(struct hugepage * dst, int dest_size,
513                 const struct hugepage * src, int src_size)
514 {
515         int src_pos, dst_pos = 0;
516
517         for (src_pos = 0; src_pos < src_size; src_pos++) {
518                 if (src[src_pos].final_va != NULL) {
519                         /* error on overflow attempt */
520                         if (dst_pos == dest_size)
521                                 return -1;
522                         memcpy(&dst[dst_pos], &src[src_pos], sizeof(struct hugepage));
523                         dst_pos++;
524                 }
525         }
526         return 0;
527 }
528
529 /*
530  * unmaps hugepages that are not going to be used. since we originally allocate
531  * ALL hugepages (not just those we need), additional unmapping needs to be done.
532  */
533 static int
534 unmap_unneeded_hugepages(struct hugepage *hugepg_tbl,
535                 struct hugepage_info *hpi,
536                 unsigned num_hp_info)
537 {
538         unsigned socket, size;
539         int page, nrpages = 0;
540
541         /* get total number of hugepages */
542         for (size = 0; size < num_hp_info; size++)
543                 for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++)
544                         nrpages += internal_config.hugepage_info[size].num_pages[socket];
545
546         for (size = 0; size < num_hp_info; size++) {
547                 for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) {
548                         unsigned pages_found = 0;
549                         /* traverse until we have unmapped all the unused pages */
550                         for (page = 0; page < nrpages; page++) {
551                                 struct hugepage *hp = &hugepg_tbl[page];
552
553                                 /* find a page that matches the criteria */
554                                 if ((hp->size == hpi[size].hugepage_sz) &&
555                                                 (hp->socket_id == (int) socket)) {
556
557                                         /* if we skipped enough pages, unmap the rest */
558                                         if (pages_found == hpi[size].num_pages[socket]) {
559                                                 munmap(hp->final_va, hp->size);
560                                                 hp->final_va = NULL;
561                                         }
562                                         /* lock the page and skip */
563                                         else
564                                                 pages_found++;
565
566                                 } /* match page */
567                         } /* foreach page */
568                 } /* foreach socket */
569         } /* foreach pagesize */
570
571         return 0;
572 }
573
574 static inline uint64_t
575 get_socket_mem_size(int socket)
576 {
577         uint64_t size = 0;
578         unsigned i;
579
580         for (i = 0; i < internal_config.num_hugepage_sizes; i++){
581                 struct hugepage_info *hpi = &internal_config.hugepage_info[i];
582                 if (hpi->hugedir != NULL)
583                         size += hpi->hugepage_sz * hpi->num_pages[socket];
584         }
585
586         return (size);
587 }
588
589 /*
590  * This function is a NUMA-aware equivalent of calc_num_pages.
591  * It takes in the list of hugepage sizes and the
592  * number of pages thereof, and calculates the best number of
593  * pages of each size to fulfill the request for <memory> ram
594  */
595 static int
596 calc_num_pages_per_socket(uint64_t * memory,
597                 struct hugepage_info *hp_info,
598                 struct hugepage_info *hp_used,
599                 unsigned num_hp_info)
600 {
601         unsigned socket, j, i = 0;
602         unsigned requested, available;
603         int total_num_pages = 0;
604         uint64_t remaining_mem, cur_mem;
605         uint64_t total_mem = internal_config.memory;
606
607         if (num_hp_info == 0)
608                 return -1;
609
610         for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) {
611                 /* if specific memory amounts per socket weren't requested */
612                 if (internal_config.force_sockets == 0) {
613                         /* take whatever is available */
614                         memory[socket] = RTE_MIN(get_socket_mem_size(socket),
615                                         total_mem);
616                 }
617                 /* skips if the memory on specific socket wasn't requested */
618                 for (i = 0; i < num_hp_info && memory[socket] != 0; i++){
619                         hp_used[i].hugedir = hp_info[i].hugedir;
620                         hp_used[i].num_pages[socket] = RTE_MIN(
621                                         memory[socket] / hp_info[i].hugepage_sz,
622                                         hp_info[i].num_pages[socket]);
623
624                         cur_mem = hp_used[i].num_pages[socket] *
625                                         hp_used[i].hugepage_sz;
626
627                         memory[socket] -= cur_mem;
628                         total_mem -= cur_mem;
629
630                         total_num_pages += hp_used[i].num_pages[socket];
631
632                         /* check if we have met all memory requests */
633                         if (memory[socket] == 0)
634                                 break;
635
636                         /* check if we have any more pages left at this size, if so
637                          * move on to next size */
638                         if (hp_used[i].num_pages[socket] == hp_info[i].num_pages[socket])
639                                 continue;
640                         /* At this point we know that there are more pages available that are
641                          * bigger than the memory we want, so lets see if we can get enough
642                          * from other page sizes.
643                          */
644                         remaining_mem = 0;
645                         for (j = i+1; j < num_hp_info; j++)
646                                 remaining_mem += hp_info[j].hugepage_sz *
647                                 hp_info[j].num_pages[socket];
648
649                         /* is there enough other memory, if not allocate another page and quit */
650                         if (remaining_mem < memory[socket]){
651                                 cur_mem = RTE_MIN(memory[socket],
652                                                 hp_info[i].hugepage_sz);
653                                 memory[socket] -= cur_mem;
654                                 total_mem -= cur_mem;
655                                 hp_used[i].num_pages[socket]++;
656                                 total_num_pages++;
657                                 break; /* we are done with this socket*/
658                         }
659                 }
660                 /* if we didn't satisfy all memory requirements per socket */
661                 if (memory[socket] > 0) {
662                         /* to prevent icc errors */
663                         requested = (unsigned) (internal_config.socket_mem[socket] /
664                                         0x100000);
665                         available = requested -
666                                         ((unsigned) (memory[socket] / 0x100000));
667                         RTE_LOG(INFO, EAL, "Not enough memory available on socket %u! "
668                                         "Requested: %uMB, available: %uMB\n", socket,
669                                         requested, available);
670                         return -1;
671                 }
672         }
673
674         /* if we didn't satisfy total memory requirements */
675         if (total_mem > 0) {
676                 requested = (unsigned) (internal_config.memory / 0x100000);
677                 available = requested - (unsigned) (total_mem / 0x100000);
678                 RTE_LOG(INFO, EAL, "Not enough memory available! Requested: %uMB,"
679                                 " available: %uMB\n", requested, available);
680                 return -1;
681         }
682         return total_num_pages;
683 }
684
685 /*
686  * Prepare physical memory mapping: fill configuration structure with
687  * these infos, return 0 on success.
688  *  1. map N huge pages in separate files in hugetlbfs
689  *  2. find associated physical addr
690  *  3. find associated NUMA socket ID
691  *  4. sort all huge pages by physical address
692  *  5. remap these N huge pages in the correct order
693  *  6. unmap the first mapping
694  *  7. fill memsegs in configuration with contiguous zones
695  */
696 static int
697 rte_eal_hugepage_init(void)
698 {
699         struct rte_mem_config *mcfg;
700         struct hugepage *hugepage, *tmp_hp = NULL;
701         struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
702
703         uint64_t memory[RTE_MAX_NUMA_NODES];
704
705         unsigned hp_offset;
706         int i, j, new_memseg;
707         int nrpages, total_pages = 0;
708         void *addr;
709
710         memset(used_hp, 0, sizeof(used_hp));
711
712         /* get pointer to global configuration */
713         mcfg = rte_eal_get_configuration()->mem_config;
714
715         /* for debug purposes, hugetlbfs can be disabled */
716         if (internal_config.no_hugetlbfs) {
717                 addr = malloc(internal_config.memory);
718                 mcfg->memseg[0].phys_addr = (phys_addr_t)(uintptr_t)addr;
719                 mcfg->memseg[0].addr = addr;
720                 mcfg->memseg[0].len = internal_config.memory;
721                 mcfg->memseg[0].socket_id = 0;
722                 return 0;
723         }
724
725
726         /* calculate total number of hugepages available. at this point we haven't
727          * yet started sorting them so they all are on socket 0 */
728         for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
729                 /* meanwhile, also initialize used_hp hugepage sizes in used_hp */
730                 used_hp[i].hugepage_sz = internal_config.hugepage_info[i].hugepage_sz;
731
732                 total_pages += internal_config.hugepage_info[i].num_pages[0];
733         }
734
735         /*
736          * allocate a memory area for hugepage table.
737          * this isn't shared memory yet. due to the fact that we need some
738          * processing done on these pages, shared memory will be created
739          * at a later stage.
740          */
741         tmp_hp = malloc(total_pages * sizeof(struct hugepage));
742         if (tmp_hp == NULL)
743                 goto fail;
744
745         memset(tmp_hp, 0, total_pages * sizeof(struct hugepage));
746
747         hp_offset = 0; /* where we start the current page size entries */
748
749         /* map all hugepages and sort them */
750         for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
751                 struct hugepage_info *hpi;
752
753                 /*
754                  * we don't yet mark hugepages as used at this stage, so
755                  * we just map all hugepages available to the system
756                  * all hugepages are still located on socket 0
757                  */
758                 hpi = &internal_config.hugepage_info[i];
759
760                 if (hpi->num_pages == 0)
761                         continue;
762
763                 /* map all hugepages available */
764                 if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
765                         RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
766                                         (unsigned)(hpi->hugepage_sz / 0x100000));
767                         goto fail;
768                 }
769
770                 /* find physical addresses and sockets for each hugepage */
771                 if (find_physaddr(&tmp_hp[hp_offset], hpi) < 0){
772                         RTE_LOG(DEBUG, EAL, "Failed to find phys addr for %u MB pages\n",
773                                         (unsigned)(hpi->hugepage_sz / 0x100000));
774                         goto fail;
775                 }
776
777                 if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0){
778                         RTE_LOG(DEBUG, EAL, "Failed to find NUMA socket for %u MB pages\n",
779                                         (unsigned)(hpi->hugepage_sz / 0x100000));
780                         goto fail;
781                 }
782
783                 if (sort_by_physaddr(&tmp_hp[hp_offset], hpi) < 0)
784                         goto fail;
785
786                 /* remap all hugepages */
787                 if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) < 0){
788                         RTE_LOG(DEBUG, EAL, "Failed to remap %u MB pages\n",
789                                         (unsigned)(hpi->hugepage_sz / 0x100000));
790                         goto fail;
791                 }
792
793                 /* unmap original mappings */
794                 if (unmap_all_hugepages_orig(&tmp_hp[hp_offset], hpi) < 0)
795                         goto fail;
796
797                 /* we have processed a num of hugepages of this size, so inc offset */
798                 hp_offset += hpi->num_pages[0];
799         }
800
801         /* clean out the numbers of pages */
802         for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++)
803                 for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
804                         internal_config.hugepage_info[i].num_pages[j] = 0;
805
806         /* get hugepages for each socket */
807         for (i = 0; i < total_pages; i++) {
808                 int socket = tmp_hp[i].socket_id;
809
810                 /* find a hugepage info with right size and increment num_pages */
811                 for (j = 0; j < (int) internal_config.num_hugepage_sizes; j++) {
812                         if (tmp_hp[i].size ==
813                                         internal_config.hugepage_info[j].hugepage_sz) {
814                                 internal_config.hugepage_info[j].num_pages[socket]++;
815                         }
816                 }
817         }
818
819         /* make a copy of socket_mem, needed for number of pages calculation */
820         for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
821                 memory[i] = internal_config.socket_mem[i];
822
823         /* calculate final number of pages */
824         nrpages = calc_num_pages_per_socket(memory,
825                         internal_config.hugepage_info, used_hp,
826                         internal_config.num_hugepage_sizes);
827
828         /* error if not enough memory available */
829         if (nrpages < 0)
830                 goto fail;
831
832         /* reporting in! */
833         for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
834                 for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
835                         if (used_hp[i].num_pages[j] > 0) {
836                                 RTE_LOG(INFO, EAL,
837                                                 "Requesting %u pages of size %uMB"
838                                                 " from socket %i\n",
839                                                 used_hp[i].num_pages[j],
840                                                 (unsigned)
841                                                         (used_hp[i].hugepage_sz / 0x100000),
842                                                 j);
843                         }
844                 }
845         }
846
847         /* create shared memory */
848         hugepage = create_shared_memory(eal_hugepage_info_path(),
849                                         nrpages * sizeof(struct hugepage));
850
851         if (hugepage == NULL) {
852                 RTE_LOG(ERR, EAL, "Failed to create shared memory!\n");
853                 goto fail;
854         }
855
856         /*
857          * unmap pages that we won't need (looks at used_hp).
858          * also, sets final_va to NULL on pages that were unmapped.
859          */
860         if (unmap_unneeded_hugepages(tmp_hp, used_hp,
861                         internal_config.num_hugepage_sizes) < 0) {
862                 RTE_LOG(ERR, EAL, "Unmapping and locking hugepages failed!\n");
863                 goto fail;
864         }
865
866         /*
867          * copy stuff from malloc'd hugepage* to the actual shared memory.
868          * this procedure only copies those hugepages that have final_va
869          * not NULL. has overflow protection.
870          */
871         if (copy_hugepages_to_shared_mem(hugepage, nrpages,
872                         tmp_hp, total_pages) < 0) {
873                 RTE_LOG(ERR, EAL, "Copying tables to shared memory failed!\n");
874                 goto fail;
875         }
876
877         /* free the temporary hugepage table */
878         free(tmp_hp);
879         tmp_hp = NULL;
880
881         memset(mcfg->memseg, 0, sizeof(mcfg->memseg));
882         j = -1;
883         for (i = 0; i < nrpages; i++) {
884                 new_memseg = 0;
885
886                 /* if this is a new section, create a new memseg */
887                 if (i == 0)
888                         new_memseg = 1;
889                 else if (hugepage[i].socket_id != hugepage[i-1].socket_id)
890                         new_memseg = 1;
891                 else if (hugepage[i].size != hugepage[i-1].size)
892                         new_memseg = 1;
893                 else if ((hugepage[i].physaddr - hugepage[i-1].physaddr) !=
894                     hugepage[i].size)
895                         new_memseg = 1;
896                 else if (((unsigned long)hugepage[i].final_va -
897                     (unsigned long)hugepage[i-1].final_va) != hugepage[i].size)
898                         new_memseg = 1;
899
900                 if (new_memseg) {
901                         j += 1;
902                         if (j == RTE_MAX_MEMSEG)
903                                 break;
904
905                         mcfg->memseg[j].phys_addr = hugepage[i].physaddr;
906                         mcfg->memseg[j].addr = hugepage[i].final_va;
907                         mcfg->memseg[j].len = hugepage[i].size;
908                         mcfg->memseg[j].socket_id = hugepage[i].socket_id;
909                         mcfg->memseg[j].hugepage_sz = hugepage[i].size;
910                 }
911                 /* continuation of previous memseg */
912                 else {
913                         mcfg->memseg[j].len += mcfg->memseg[j].hugepage_sz;
914                 }
915                 hugepage[i].memseg_id = j;
916         }
917
918         if (i < nrpages) {
919                 RTE_LOG(ERR, EAL, "Can only reserve %d pages "
920                         "from %d requested\n"
921                         "Current %s=%d is not enough\n"
922                         "Please either increase it or request less amount "
923                         "of memory.\n",
924                         i, nrpages, RTE_STR(CONFIG_RTE_MAX_MEMSEG),
925                         RTE_MAX_MEMSEG);
926                 return (-ENOMEM);
927         }
928         
929
930         return 0;
931
932
933 fail:
934         if (tmp_hp)
935                 free(tmp_hp);
936         return -1;
937 }
938
/*
 * Report the size, in bytes, of the file behind descriptor fd, as
 * returned by fstat(). A failing fstat() is reported as a size of 0.
 */
static off_t
getFileSize(int fd)
{
        struct stat info;
        const int rc = fstat(fd, &info);

        return (rc < 0) ? 0 : info.st_size;
}
950
951 /*
952  * This creates the memory mappings in the secondary process to match that of
953  * the server process. It goes through each memory segment in the DPDK runtime
954  * configuration and finds the hugepages which form that segment, mapping them
955  * in order to form a contiguous block in the virtual memory space
956  */
957 static int
958 rte_eal_hugepage_attach(void)
959 {
960         const struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
961         const struct hugepage *hp = NULL;
962         unsigned num_hp = 0;
963         unsigned i, s = 0; /* s used to track the segment number */
964         off_t size;
965         int fd, fd_zero = -1, fd_hugepage = -1;
966
967         if (aslr_enabled() > 0) {
968                 RTE_LOG(WARNING, EAL, "WARNING: Address Space Layout Randomization "
969                                 "(ASLR) is enabled in the kernel.\n");
970                 RTE_LOG(WARNING, EAL, "   This may cause issues with mapping memory "
971                                 "into secondary processes\n");
972         }
973
974         fd_zero = open("/dev/zero", O_RDONLY);
975         if (fd_zero < 0) {
976                 RTE_LOG(ERR, EAL, "Could not open /dev/zero\n");
977                 goto error;
978         }
979         fd_hugepage = open(eal_hugepage_info_path(), O_RDONLY);
980         if (fd_hugepage < 0) {
981                 RTE_LOG(ERR, EAL, "Could not open %s\n", eal_hugepage_info_path());
982                 goto error;
983         }
984
985         /* map all segments into memory to make sure we get the addrs */
986         for (s = 0; s < RTE_MAX_MEMSEG; ++s) {
987                 void *base_addr;
988
989                 /*
990                  * the first memory segment with len==0 is the one that
991                  * follows the last valid segment.
992                  */
993                 if (mcfg->memseg[s].len == 0)
994                         break;
995
996                 /*
997                  * fdzero is mmapped to get a contiguous block of virtual
998                  * addresses of the appropriate memseg size.
999                  * use mmap to get identical addresses as the primary process.
1000                  */
1001                 base_addr = mmap(mcfg->memseg[s].addr, mcfg->memseg[s].len,
1002                                  PROT_READ, MAP_PRIVATE, fd_zero, 0);
1003                 if (base_addr == MAP_FAILED ||
1004                     base_addr != mcfg->memseg[s].addr) {
1005                         RTE_LOG(ERR, EAL, "Could not mmap %llu bytes "
1006                                 "in /dev/zero to requested address [%p]\n",
1007                                 (unsigned long long)mcfg->memseg[s].len,
1008                                 mcfg->memseg[s].addr);
1009                         if (aslr_enabled() > 0) {
1010                                 RTE_LOG(ERR, EAL, "It is recommended to "
1011                                         "disable ASLR in the kernel "
1012                                         "and retry running both primary "
1013                                         "and secondary processes\n");
1014                         }
1015                         goto error;
1016                 }
1017         }
1018
1019         size = getFileSize(fd_hugepage);
1020         hp = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd_hugepage, 0);
1021         if (hp == NULL) {
1022                 RTE_LOG(ERR, EAL, "Could not mmap %s\n", eal_hugepage_info_path());
1023                 goto error;
1024         }
1025
1026         num_hp = size / sizeof(struct hugepage);
1027         RTE_LOG(DEBUG, EAL, "Analysing %u hugepages\n", num_hp);
1028
1029         s = 0;
1030         while (s < RTE_MAX_MEMSEG && mcfg->memseg[s].len > 0){
1031                 void *addr, *base_addr;
1032                 uintptr_t offset = 0;
1033
1034                 /*
1035                  * free previously mapped memory so we can map the
1036                  * hugepages into the space
1037                  */
1038                 base_addr = mcfg->memseg[s].addr;
1039                 munmap(base_addr, mcfg->memseg[s].len);
1040
1041                 /* find the hugepages for this segment and map them
1042                  * we don't need to worry about order, as the server sorted the
1043                  * entries before it did the second mmap of them */
1044                 for (i = 0; i < num_hp && offset < mcfg->memseg[s].len; i++){
1045                         if (hp[i].memseg_id == (int)s){
1046                                 fd = open(hp[i].filepath, O_RDWR);
1047                                 if (fd < 0) {
1048                                         RTE_LOG(ERR, EAL, "Could not open %s\n",
1049                                                 hp[i].filepath);
1050                                         goto error;
1051                                 }
1052                                 addr = mmap(RTE_PTR_ADD(base_addr, offset),
1053                                                 hp[i].size, PROT_READ | PROT_WRITE,
1054                                                 MAP_SHARED | MAP_FIXED, fd, 0);
1055                                 close(fd); /* close file both on success and on failure */
1056                                 if (addr == MAP_FAILED) {
1057                                         RTE_LOG(ERR, EAL, "Could not mmap %s\n",
1058                                                 hp[i].filepath);
1059                                         goto error;
1060                                 }
1061                                 offset+=hp[i].size;
1062                         }
1063                 }
1064                 RTE_LOG(DEBUG, EAL, "Mapped segment %u of size 0x%llx\n", s,
1065                                 (unsigned long long)mcfg->memseg[s].len);
1066                 s++;
1067         }
1068         /* unmap the hugepage config file, since we are done using it */
1069         munmap((void *)(uintptr_t)hp, size);
1070         close(fd_zero);
1071         close(fd_hugepage);
1072         return 0;
1073
1074 error:
1075         if (fd_zero >= 0)
1076                 close(fd_zero);
1077         if (fd_hugepage >= 0)
1078                 close(fd_hugepage);
1079         return -1;
1080 }
1081
1082 static int
1083 rte_eal_memdevice_init(void)
1084 {
1085         struct rte_config *config;
1086
1087         if (rte_eal_process_type() == RTE_PROC_SECONDARY)
1088                 return 0;
1089
1090         config = rte_eal_get_configuration();
1091         config->mem_config->nchannel = internal_config.force_nchannel;
1092         config->mem_config->nrank = internal_config.force_nrank;
1093
1094         return 0;
1095 }
1096
1097
1098 /* init memory subsystem */
1099 int
1100 rte_eal_memory_init(void)
1101 {
1102         RTE_LOG(INFO, EAL, "Setting up hugepage memory...\n");
1103         const int retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
1104                         rte_eal_hugepage_init() :
1105                         rte_eal_hugepage_attach();
1106         if (retval < 0)
1107                 return -1;
1108
1109         if (internal_config.no_shconf == 0 && rte_eal_memdevice_init() < 0)
1110                 return -1;
1111
1112         return 0;
1113 }