examples/l3fwd: merge l3fwd-acl example
[dpdk.git] / lib / eal / linux / eal_hugepage_info.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2014 Intel Corporation
3  */
4
5 #include <string.h>
6 #include <sys/file.h>
7 #include <dirent.h>
8 #include <fcntl.h>
9 #include <stdint.h>
10 #include <stdlib.h>
11 #include <stdio.h>
12 #include <fnmatch.h>
13 #include <inttypes.h>
14 #include <unistd.h>
15 #include <errno.h>
16 #include <sys/mman.h>
17 #include <sys/stat.h>
18
19 #include <linux/mman.h> /* for hugetlb-related flags */
20
21 #include <rte_lcore.h>
22 #include <rte_debug.h>
23 #include <rte_log.h>
24 #include <rte_common.h>
25 #include "rte_string_fns.h"
26
27 #include "eal_private.h"
28 #include "eal_internal_cfg.h"
29 #include "eal_hugepages.h"
30 #include "eal_filesystem.h"
31
/* sysfs directory scanned for the per-size "hugepages-<size>" subdirectories */
static const char sys_dir_path[] = "/sys/kernel/mm/hugepages";
/* sysfs directory under which the per-node "node<N>/hugepages" paths are built */
static const char sys_pages_numa_dir_path[] = "/sys/devices/system/node";
34
/*
 * Uses mmap to create a shared memory area for storage of data.
 * Used in this file to store the hugepage file map on disk.
 *
 * The file is opened with the given open(2) flags (mode 0600 if it gets
 * created), resized to mem_size bytes and mapped read/write as MAP_SHARED.
 *
 * Returns the mapped address, or NULL if the file could not be opened,
 * resized or mapped.
 */
static void *
map_shared_memory(const char *filename, const size_t mem_size, int flags)
{
	void *retval;
	int fd = open(filename, flags, 0600);

	if (fd < 0)
		return NULL;
	if (ftruncate(fd, mem_size) < 0) {
		close(fd);
		return NULL;
	}
	retval = mmap(NULL, mem_size, PROT_READ | PROT_WRITE,
			MAP_SHARED, fd, 0);
	/* the descriptor is not needed once the mapping exists (or failed) */
	close(fd);

	/* mmap() reports failure as MAP_FAILED, but our callers test for NULL */
	return retval == MAP_FAILED ? NULL : retval;
}
55
/*
 * Map a file of mem_size bytes read/write without creating it: the file is
 * opened with O_RDWR only. Returns what map_shared_memory() returns.
 */
static void *
open_shared_memory(const char *filename, const size_t mem_size)
{
	const int open_flags = O_RDWR;

	return map_shared_memory(filename, mem_size, open_flags);
}
61
/*
 * Map a file of mem_size bytes read/write, creating it when it does not
 * exist yet (O_RDWR | O_CREAT). Returns what map_shared_memory() returns.
 */
static void *
create_shared_memory(const char *filename, const size_t mem_size)
{
	const int open_flags = O_RDWR | O_CREAT;

	return map_shared_memory(filename, mem_size, open_flags);
}
67
68 static int get_hp_sysfs_value(const char *subdir, const char *file, unsigned long *val)
69 {
70         char path[PATH_MAX];
71
72         snprintf(path, sizeof(path), "%s/%s/%s",
73                         sys_dir_path, subdir, file);
74         return eal_parse_sysfs_value(path, val);
75 }
76
77 /* this function is only called from eal_hugepage_info_init which itself
78  * is only called from a primary process */
79 static uint32_t
80 get_num_hugepages(const char *subdir, size_t sz, unsigned int reusable_pages)
81 {
82         unsigned long resv_pages, num_pages, over_pages, surplus_pages;
83         const char *nr_hp_file = "free_hugepages";
84         const char *nr_rsvd_file = "resv_hugepages";
85         const char *nr_over_file = "nr_overcommit_hugepages";
86         const char *nr_splus_file = "surplus_hugepages";
87
88         /* first, check how many reserved pages kernel reports */
89         if (get_hp_sysfs_value(subdir, nr_rsvd_file, &resv_pages) < 0)
90                 return 0;
91
92         if (get_hp_sysfs_value(subdir, nr_hp_file, &num_pages) < 0)
93                 return 0;
94
95         if (get_hp_sysfs_value(subdir, nr_over_file, &over_pages) < 0)
96                 over_pages = 0;
97
98         if (get_hp_sysfs_value(subdir, nr_splus_file, &surplus_pages) < 0)
99                 surplus_pages = 0;
100
101         /* adjust num_pages */
102         if (num_pages >= resv_pages)
103                 num_pages -= resv_pages;
104         else if (resv_pages)
105                 num_pages = 0;
106
107         if (over_pages >= surplus_pages)
108                 over_pages -= surplus_pages;
109         else
110                 over_pages = 0;
111
112         if (num_pages == 0 && over_pages == 0 && reusable_pages)
113                 RTE_LOG(WARNING, EAL, "No available %zu kB hugepages reported\n",
114                                 sz >> 10);
115
116         num_pages += over_pages;
117         if (num_pages < over_pages) /* overflow */
118                 num_pages = UINT32_MAX;
119
120         num_pages += reusable_pages;
121         if (num_pages < reusable_pages) /* overflow */
122                 num_pages = UINT32_MAX;
123
124         /* we want to return a uint32_t and more than this looks suspicious
125          * anyway ... */
126         if (num_pages > UINT32_MAX)
127                 num_pages = UINT32_MAX;
128
129         return num_pages;
130 }
131
132 static uint32_t
133 get_num_hugepages_on_node(const char *subdir, unsigned int socket, size_t sz)
134 {
135         char path[PATH_MAX], socketpath[PATH_MAX];
136         DIR *socketdir;
137         unsigned long num_pages = 0;
138         const char *nr_hp_file = "free_hugepages";
139
140         snprintf(socketpath, sizeof(socketpath), "%s/node%u/hugepages",
141                 sys_pages_numa_dir_path, socket);
142
143         socketdir = opendir(socketpath);
144         if (socketdir) {
145                 /* Keep calm and carry on */
146                 closedir(socketdir);
147         } else {
148                 /* Can't find socket dir, so ignore it */
149                 return 0;
150         }
151
152         snprintf(path, sizeof(path), "%s/%s/%s",
153                         socketpath, subdir, nr_hp_file);
154         if (eal_parse_sysfs_value(path, &num_pages) < 0)
155                 return 0;
156
157         if (num_pages == 0)
158                 RTE_LOG(WARNING, EAL, "No free %zu kB hugepages reported on node %u\n",
159                                 sz >> 10, socket);
160
161         /*
162          * we want to return a uint32_t and more than this looks suspicious
163          * anyway ...
164          */
165         if (num_pages > UINT32_MAX)
166                 num_pages = UINT32_MAX;
167
168         return num_pages;
169 }
170
/*
 * Return the size parsed by rte_str_to_size() from the first line of
 * /proc/meminfo that starts with "Hugepagesize:".
 * Panics if the file cannot be opened or the resulting size is 0.
 */
static uint64_t
get_default_hp_size(void)
{
	const char proc_meminfo[] = "/proc/meminfo";
	const char str_hugepagesz[] = "Hugepagesize:";
	const unsigned int key_len = sizeof(str_hugepagesz) - 1;
	unsigned long long hp_size = 0;
	char line[256];
	FILE *meminfo;

	meminfo = fopen(proc_meminfo, "r");
	if (meminfo == NULL)
		rte_panic("Cannot open %s\n", proc_meminfo);

	while (fgets(line, sizeof(line), meminfo) != NULL) {
		if (strncmp(line, str_hugepagesz, key_len) != 0)
			continue;
		/* only the first matching line is considered */
		hp_size = rte_str_to_size(&line[key_len]);
		break;
	}
	fclose(meminfo);

	if (hp_size == 0)
		rte_panic("Cannot get default hugepage size from %s\n", proc_meminfo);

	return hp_size;
}
194
195 static int
196 get_hugepage_dir(uint64_t hugepage_sz, char *hugedir, int len)
197 {
198         enum proc_mount_fieldnames {
199                 DEVICE = 0,
200                 MOUNTPT,
201                 FSTYPE,
202                 OPTIONS,
203                 _FIELDNAME_MAX
204         };
205         static uint64_t default_size = 0;
206         const char proc_mounts[] = "/proc/mounts";
207         const char hugetlbfs_str[] = "hugetlbfs";
208         const size_t htlbfs_str_len = sizeof(hugetlbfs_str) - 1;
209         const char pagesize_opt[] = "pagesize=";
210         const size_t pagesize_opt_len = sizeof(pagesize_opt) - 1;
211         const char split_tok = ' ';
212         char *splitstr[_FIELDNAME_MAX];
213         char found[PATH_MAX] = "";
214         char buf[BUFSIZ];
215         const struct internal_config *internal_conf =
216                 eal_get_internal_configuration();
217         struct stat st;
218
219         /*
220          * If the specified dir doesn't exist, we can't match it.
221          */
222         if (internal_conf->hugepage_dir != NULL &&
223                 stat(internal_conf->hugepage_dir, &st) != 0) {
224                 return -1;
225         }
226
227         FILE *fd = fopen(proc_mounts, "r");
228         if (fd == NULL)
229                 rte_panic("Cannot open %s\n", proc_mounts);
230
231         if (default_size == 0)
232                 default_size = get_default_hp_size();
233
234         while (fgets(buf, sizeof(buf), fd)){
235                 const char *pagesz_str;
236
237                 if (rte_strsplit(buf, sizeof(buf), splitstr, _FIELDNAME_MAX,
238                                 split_tok) != _FIELDNAME_MAX) {
239                         RTE_LOG(ERR, EAL, "Error parsing %s\n", proc_mounts);
240                         break; /* return NULL */
241                 }
242
243                 if (strncmp(splitstr[FSTYPE], hugetlbfs_str, htlbfs_str_len) != 0)
244                         continue;
245
246                 pagesz_str = strstr(splitstr[OPTIONS], pagesize_opt);
247
248                 /* if no explicit page size, the default page size is compared */
249                 if (pagesz_str == NULL) {
250                         if (hugepage_sz != default_size)
251                                 continue;
252                 }
253                 /* there is an explicit page size, so check it */
254                 else {
255                         uint64_t pagesz = rte_str_to_size(&pagesz_str[pagesize_opt_len]);
256                         if (pagesz != hugepage_sz)
257                                 continue;
258                 }
259
260                 /*
261                  * If no --huge-dir option has been given, we're done.
262                  */
263                 if (internal_conf->hugepage_dir == NULL) {
264                         strlcpy(found, splitstr[MOUNTPT], len);
265                         break;
266                 }
267
268                 /*
269                  * Ignore any mount that doesn't contain the --huge-dir
270                  * directory.
271                  */
272                 if (strncmp(internal_conf->hugepage_dir, splitstr[MOUNTPT],
273                         strlen(splitstr[MOUNTPT])) != 0) {
274                         continue;
275                 }
276
277                 /*
278                  * We found a match, but only prefer it if it's a longer match
279                  * (so /mnt/1 is preferred over /mnt for matching /mnt/1/2)).
280                  */
281                 if (strlen(splitstr[MOUNTPT]) > strlen(found))
282                         strlcpy(found, splitstr[MOUNTPT], len);
283         } /* end while fgets */
284
285         fclose(fd);
286
287         if (found[0] != '\0') {
288                 /* If needed, return the requested dir, not the mount point. */
289                 strlcpy(hugedir, internal_conf->hugepage_dir != NULL ?
290                         internal_conf->hugepage_dir : found, len);
291                 return 0;
292         }
293
294         return -1;
295 }
296
/* Per-file context handed to a walk_hugedir() callback. */
struct walk_hugedir_data {
	/* descriptor of the directory being walked */
	int dir_fd;
	/* descriptor of the current file, opened read-only by the walker */
	int file_fd;
	/* name of the current directory entry, relative to dir_fd */
	const char *file_name;
	/* opaque pointer passed through from the walk_hugedir() caller */
	void *user_data;
};

/* Callback invoked by walk_hugedir() for each file it managed to lock. */
typedef void (walk_hugedir_t)(const struct walk_hugedir_data *whd);
305
306 /*
307  * Search the hugepage directory for whatever hugepage files there are.
308  * Check if the file is in use by another DPDK process.
309  * If not, execute a callback on it.
310  */
311 static int
312 walk_hugedir(const char *hugedir, walk_hugedir_t *cb, void *user_data)
313 {
314         DIR *dir;
315         struct dirent *dirent;
316         int dir_fd, fd, lck_result;
317         const char filter[] = "*map_*"; /* matches hugepage files */
318
319         dir = opendir(hugedir);
320         if (!dir) {
321                 RTE_LOG(ERR, EAL, "Unable to open hugepage directory %s\n",
322                                 hugedir);
323                 goto error;
324         }
325         dir_fd = dirfd(dir);
326
327         dirent = readdir(dir);
328         if (!dirent) {
329                 RTE_LOG(ERR, EAL, "Unable to read hugepage directory %s\n",
330                                 hugedir);
331                 goto error;
332         }
333
334         while (dirent != NULL) {
335                 /* skip files that don't match the hugepage pattern */
336                 if (fnmatch(filter, dirent->d_name, 0) > 0) {
337                         dirent = readdir(dir);
338                         continue;
339                 }
340
341                 /* try and lock the file */
342                 fd = openat(dir_fd, dirent->d_name, O_RDONLY);
343
344                 /* skip to next file */
345                 if (fd == -1) {
346                         dirent = readdir(dir);
347                         continue;
348                 }
349
350                 /* non-blocking lock */
351                 lck_result = flock(fd, LOCK_EX | LOCK_NB);
352
353                 /* if lock succeeds, execute callback */
354                 if (lck_result != -1)
355                         cb(&(struct walk_hugedir_data){
356                                 .dir_fd = dir_fd,
357                                 .file_fd = fd,
358                                 .file_name = dirent->d_name,
359                                 .user_data = user_data,
360                         });
361
362                 close (fd);
363                 dirent = readdir(dir);
364         }
365
366         closedir(dir);
367         return 0;
368
369 error:
370         if (dir)
371                 closedir(dir);
372
373         RTE_LOG(ERR, EAL, "Error while walking hugepage dir: %s\n",
374                 strerror(errno));
375
376         return -1;
377 }
378
379 static void
380 clear_hugedir_cb(const struct walk_hugedir_data *whd)
381 {
382         unlinkat(whd->dir_fd, whd->file_name, 0);
383 }
384
385 /* Remove hugepage files not used by other DPDK processes from a directory. */
386 static int
387 clear_hugedir(const char *hugedir)
388 {
389         return walk_hugedir(hugedir, clear_hugedir_cb, NULL);
390 }
391
392 static void
393 inspect_hugedir_cb(const struct walk_hugedir_data *whd)
394 {
395         uint64_t *total_size = whd->user_data;
396         struct stat st;
397
398         if (fstat(whd->file_fd, &st) < 0)
399                 RTE_LOG(DEBUG, EAL, "%s(): stat(\"%s\") failed: %s",
400                                 __func__, whd->file_name, strerror(errno));
401         else
402                 (*total_size) += st.st_size;
403 }
404
405 /*
406  * Count the total size in bytes of all files in the directory
407  * not mapped by other DPDK process.
408  */
409 static int
410 inspect_hugedir(const char *hugedir, uint64_t *total_size)
411 {
412         return walk_hugedir(hugedir, inspect_hugedir_cb, total_size);
413 }
414
415 static int
416 compare_hpi(const void *a, const void *b)
417 {
418         const struct hugepage_info *hpi_a = a;
419         const struct hugepage_info *hpi_b = b;
420
421         return hpi_b->hugepage_sz - hpi_a->hugepage_sz;
422 }
423
424 static void
425 calc_num_pages(struct hugepage_info *hpi, struct dirent *dirent,
426                 unsigned int reusable_pages)
427 {
428         uint64_t total_pages = 0;
429         unsigned int i;
430         const struct internal_config *internal_conf =
431                 eal_get_internal_configuration();
432
433         /*
434          * first, try to put all hugepages into relevant sockets, but
435          * if first attempts fails, fall back to collecting all pages
436          * in one socket and sorting them later
437          */
438         total_pages = 0;
439
440         /*
441          * We also don't want to do this for legacy init.
442          * When there are hugepage files to reuse it is unknown
443          * what NUMA node the pages are on.
444          * This could be determined by mapping,
445          * but it is precisely what hugepage file reuse is trying to avoid.
446          */
447         if (!internal_conf->legacy_mem && reusable_pages == 0)
448                 for (i = 0; i < rte_socket_count(); i++) {
449                         int socket = rte_socket_id_by_idx(i);
450                         unsigned int num_pages =
451                                         get_num_hugepages_on_node(
452                                                 dirent->d_name, socket,
453                                                 hpi->hugepage_sz);
454                         hpi->num_pages[socket] = num_pages;
455                         total_pages += num_pages;
456                 }
457         /*
458          * we failed to sort memory from the get go, so fall
459          * back to old way
460          */
461         if (total_pages == 0) {
462                 hpi->num_pages[0] = get_num_hugepages(dirent->d_name,
463                                 hpi->hugepage_sz, reusable_pages);
464
465 #ifndef RTE_ARCH_64
466                 /* for 32-bit systems, limit number of hugepages to
467                  * 1GB per page size */
468                 hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0],
469                                 RTE_PGSIZE_1G / hpi->hugepage_sz);
470 #endif
471         }
472 }
473
474 static int
475 hugepage_info_init(void)
476 {       const char dirent_start_text[] = "hugepages-";
477         const size_t dirent_start_len = sizeof(dirent_start_text) - 1;
478         unsigned int i, num_sizes = 0;
479         uint64_t reusable_bytes;
480         unsigned int reusable_pages;
481         DIR *dir;
482         struct dirent *dirent;
483         struct internal_config *internal_conf =
484                 eal_get_internal_configuration();
485
486         dir = opendir(sys_dir_path);
487         if (dir == NULL) {
488                 RTE_LOG(ERR, EAL,
489                         "Cannot open directory %s to read system hugepage info\n",
490                         sys_dir_path);
491                 return -1;
492         }
493
494         for (dirent = readdir(dir); dirent != NULL; dirent = readdir(dir)) {
495                 struct hugepage_info *hpi;
496
497                 if (strncmp(dirent->d_name, dirent_start_text,
498                             dirent_start_len) != 0)
499                         continue;
500
501                 if (num_sizes >= MAX_HUGEPAGE_SIZES)
502                         break;
503
504                 hpi = &internal_conf->hugepage_info[num_sizes];
505                 hpi->hugepage_sz =
506                         rte_str_to_size(&dirent->d_name[dirent_start_len]);
507
508                 /* first, check if we have a mountpoint */
509                 if (get_hugepage_dir(hpi->hugepage_sz,
510                         hpi->hugedir, sizeof(hpi->hugedir)) < 0) {
511                         uint32_t num_pages;
512
513                         num_pages = get_num_hugepages(dirent->d_name,
514                                         hpi->hugepage_sz, 0);
515                         if (num_pages > 0)
516                                 RTE_LOG(NOTICE, EAL,
517                                         "%" PRIu32 " hugepages of size "
518                                         "%" PRIu64 " reserved, but no mounted "
519                                         "hugetlbfs found for that size\n",
520                                         num_pages, hpi->hugepage_sz);
521                         /* if we have kernel support for reserving hugepages
522                          * through mmap, and we're in in-memory mode, treat this
523                          * page size as valid. we cannot be in legacy mode at
524                          * this point because we've checked this earlier in the
525                          * init process.
526                          */
527 #ifdef MAP_HUGE_SHIFT
528                         if (internal_conf->in_memory) {
529                                 RTE_LOG(DEBUG, EAL, "In-memory mode enabled, "
530                                         "hugepages of size %" PRIu64 " bytes "
531                                         "will be allocated anonymously\n",
532                                         hpi->hugepage_sz);
533                                 calc_num_pages(hpi, dirent, 0);
534                                 num_sizes++;
535                         }
536 #endif
537                         continue;
538                 }
539
540                 /* try to obtain a writelock */
541                 hpi->lock_descriptor = open(hpi->hugedir, O_RDONLY);
542
543                 /* if blocking lock failed */
544                 if (flock(hpi->lock_descriptor, LOCK_EX) == -1) {
545                         RTE_LOG(CRIT, EAL,
546                                 "Failed to lock hugepage directory!\n");
547                         break;
548                 }
549
550                 /*
551                  * Check for existing hugepage files and either remove them
552                  * or count how many of them can be reused.
553                  */
554                 reusable_pages = 0;
555                 if (!internal_conf->hugepage_file.unlink_existing) {
556                         reusable_bytes = 0;
557                         if (inspect_hugedir(hpi->hugedir,
558                                         &reusable_bytes) < 0)
559                                 break;
560                         RTE_ASSERT(reusable_bytes % hpi->hugepage_sz == 0);
561                         reusable_pages = reusable_bytes / hpi->hugepage_sz;
562                 } else if (clear_hugedir(hpi->hugedir) < 0) {
563                         break;
564                 }
565                 calc_num_pages(hpi, dirent, reusable_pages);
566
567                 num_sizes++;
568         }
569         closedir(dir);
570
571         /* something went wrong, and we broke from the for loop above */
572         if (dirent != NULL)
573                 return -1;
574
575         internal_conf->num_hugepage_sizes = num_sizes;
576
577         /* sort the page directory entries by size, largest to smallest */
578         qsort(&internal_conf->hugepage_info[0], num_sizes,
579               sizeof(internal_conf->hugepage_info[0]), compare_hpi);
580
581         /* now we have all info, check we have at least one valid size */
582         for (i = 0; i < num_sizes; i++) {
583                 /* pages may no longer all be on socket 0, so check all */
584                 unsigned int j, num_pages = 0;
585                 struct hugepage_info *hpi = &internal_conf->hugepage_info[i];
586
587                 for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
588                         num_pages += hpi->num_pages[j];
589                 if (num_pages > 0)
590                         return 0;
591         }
592
593         /* no valid hugepage mounts available, return error */
594         return -1;
595 }
596
597 /*
598  * when we initialize the hugepage info, everything goes
599  * to socket 0 by default. it will later get sorted by memory
600  * initialization procedure.
601  */
602 int
603 eal_hugepage_info_init(void)
604 {
605         struct hugepage_info *hpi, *tmp_hpi;
606         unsigned int i;
607         struct internal_config *internal_conf =
608                 eal_get_internal_configuration();
609
610         if (hugepage_info_init() < 0)
611                 return -1;
612
613         /* for no shared files mode, we're done */
614         if (internal_conf->no_shconf)
615                 return 0;
616
617         hpi = &internal_conf->hugepage_info[0];
618
619         tmp_hpi = create_shared_memory(eal_hugepage_info_path(),
620                         sizeof(internal_conf->hugepage_info));
621         if (tmp_hpi == NULL) {
622                 RTE_LOG(ERR, EAL, "Failed to create shared memory!\n");
623                 return -1;
624         }
625
626         memcpy(tmp_hpi, hpi, sizeof(internal_conf->hugepage_info));
627
628         /* we've copied file descriptors along with everything else, but they
629          * will be invalid in secondary process, so overwrite them
630          */
631         for (i = 0; i < RTE_DIM(internal_conf->hugepage_info); i++) {
632                 struct hugepage_info *tmp = &tmp_hpi[i];
633                 tmp->lock_descriptor = -1;
634         }
635
636         if (munmap(tmp_hpi, sizeof(internal_conf->hugepage_info)) < 0) {
637                 RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n");
638                 return -1;
639         }
640         return 0;
641 }
642
643 int eal_hugepage_info_read(void)
644 {
645         struct internal_config *internal_conf =
646                 eal_get_internal_configuration();
647         struct hugepage_info *hpi = &internal_conf->hugepage_info[0];
648         struct hugepage_info *tmp_hpi;
649
650         tmp_hpi = open_shared_memory(eal_hugepage_info_path(),
651                                   sizeof(internal_conf->hugepage_info));
652         if (tmp_hpi == NULL) {
653                 RTE_LOG(ERR, EAL, "Failed to open shared memory!\n");
654                 return -1;
655         }
656
657         memcpy(hpi, tmp_hpi, sizeof(internal_conf->hugepage_info));
658
659         if (munmap(tmp_hpi, sizeof(internal_conf->hugepage_info)) < 0) {
660                 RTE_LOG(ERR, EAL, "Failed to unmap shared memory!\n");
661                 return -1;
662         }
663         return 0;
664 }