lib/eal/common/eal_common_dynmem.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation.
 * Copyright(c) 2013 6WIND S.A.
 */

#include <inttypes.h>
#include <stdlib.h>
#include <string.h>

#include <rte_log.h>
#include <rte_string_fns.h>

#include "eal_internal_cfg.h"
#include "eal_memalloc.h"
#include "eal_memcfg.h"
#include "eal_private.h"

/** @file Functions common to EALs that support dynamic memory allocation. */

int
eal_dynmem_memseg_lists_init(void)
{
        struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
        struct memtype {
                uint64_t page_sz;
                int socket_id;
        } *memtypes = NULL;
        int i, hpi_idx, msl_idx, ret = -1; /* fail unless told to succeed */
        struct rte_memseg_list *msl;
        uint64_t max_mem, max_mem_per_type;
        unsigned int max_seglists_per_type;
        unsigned int n_memtypes, cur_type;
        struct internal_config *internal_conf =
                eal_get_internal_configuration();

        /* no-huge does not need this at all */
        if (internal_conf->no_hugetlbfs)
                return 0;

        /*
         * figuring out the amount of memory we're going to have is a long and
         * very involved process. the basic element we're operating with is a
         * memory type, defined as a combination of NUMA node ID and page size
         * (so that e.g. 2 sockets with 2 page sizes yield 4 memory types in
         * total).
         *
         * deciding the amount of memory going towards each memory type is a
         * balancing act between maximum segments per type, maximum memory per
         * type, and number of detected NUMA nodes. the goal is to make sure
         * each memory type gets at least one memseg list.
         *
         * the total amount of memory is limited by the RTE_MAX_MEM_MB value.
         *
         * the total amount of memory per type is limited by either
         * RTE_MAX_MEM_MB_PER_TYPE, or by RTE_MAX_MEM_MB divided by the number
         * of detected NUMA nodes. additionally, the maximum number of segments
         * per type is also limited by RTE_MAX_MEMSEG_PER_TYPE. this is because
         * for smaller page sizes, it can take hundreds of thousands of
         * segments to reach the above specified per-type memory limits.
         *
         * additionally, each type may have multiple memseg lists associated
         * with it, each limited by either RTE_MAX_MEM_MB_PER_LIST for bigger
         * page sizes, or RTE_MAX_MEMSEG_PER_LIST segments for smaller ones.
         *
         * the number of memseg lists per type is decided based on the above
         * limits, and also takes the number of detected NUMA nodes into
         * account, to make sure that we don't run out of memseg lists before
         * we populate all NUMA nodes with memory.
         *
         * we do this in three stages. first, we collect the number of types.
         * then, we figure out memory constraints and populate the list of
         * would-be memseg lists. then, we go ahead and allocate the memseg
         * lists.
         */
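
        /*
         * For illustration only (the actual limits are build-time
         * configurable): with 2 NUMA nodes and two page sizes there are
         * 4 memory types. If RTE_MAX_MEM_MB were 524288 (512 GB) and
         * RTE_MAX_MEM_MB_PER_TYPE were 65536 (64 GB), each type would be
         * capped at min(64 GB, 512 GB / 4) = 64 GB, and with an
         * RTE_MAX_MEMSEG_LISTS of 128 each type could take at most
         * 128 / 4 = 32 memseg lists.
         */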

        /* create space for mem types */
        n_memtypes = internal_conf->num_hugepage_sizes * rte_socket_count();
        memtypes = calloc(n_memtypes, sizeof(*memtypes));
        if (memtypes == NULL) {
                RTE_LOG(ERR, EAL, "Cannot allocate space for memory types\n");
                return -1;
        }

        /* populate mem types */
        cur_type = 0;
        for (hpi_idx = 0; hpi_idx < (int) internal_conf->num_hugepage_sizes;
                        hpi_idx++) {
                struct hugepage_info *hpi;
                uint64_t hugepage_sz;

                hpi = &internal_conf->hugepage_info[hpi_idx];
                hugepage_sz = hpi->hugepage_sz;

                for (i = 0; i < (int) rte_socket_count(); i++, cur_type++) {
                        int socket_id = rte_socket_id_by_idx(i);

#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
                        /* we can still sort pages by socket in legacy mode */
                        if (!internal_conf->legacy_mem && socket_id > 0)
                                break;
#endif
                        memtypes[cur_type].page_sz = hugepage_sz;
                        memtypes[cur_type].socket_id = socket_id;

                        RTE_LOG(DEBUG, EAL, "Detected memory type: "
                                "socket_id:%u hugepage_sz:%" PRIu64 "\n",
                                socket_id, hugepage_sz);
                }
        }
        /* number of memtypes could have been lower due to no NUMA support */
        n_memtypes = cur_type;

        /* set up limits for types */
        max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
        max_mem_per_type = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20,
                        max_mem / n_memtypes);
        /*
         * limit maximum number of segment lists per type to ensure there's
         * space for memseg lists for all NUMA nodes with all page sizes
         */
        max_seglists_per_type = RTE_MAX_MEMSEG_LISTS / n_memtypes;

        if (max_seglists_per_type == 0) {
                RTE_LOG(ERR, EAL, "Cannot accommodate all memory types, please increase %s\n",
                        RTE_STR(RTE_MAX_MEMSEG_LISTS));
                goto out;
        }

        /* go through all mem types and create segment lists */
        msl_idx = 0;
        for (cur_type = 0; cur_type < n_memtypes; cur_type++) {
                unsigned int cur_seglist, n_seglists, n_segs;
                unsigned int max_segs_per_type, max_segs_per_list;
                struct memtype *type = &memtypes[cur_type];
                uint64_t max_mem_per_list, pagesz;
                int socket_id;

                pagesz = type->page_sz;
                socket_id = type->socket_id;

                /*
                 * we need to create segment lists for this type. we must take
                 * into account the following things:
                 *
                 * 1. total amount of memory we can use for this memory type
                 * 2. total amount of memory per memseg list allowed
                 * 3. number of segments needed to fit the amount of memory
                 * 4. number of segments allowed per type
                 * 5. number of segments allowed per memseg list
                 * 6. number of memseg lists we are allowed to take up
                 */
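
                /*
                 * For example (purely illustrative numbers): with 1 GB pages,
                 * a 64 GB per-type limit, an 8192-segment per-list limit and
                 * a 32 GB per-list memory limit, a list is capped by memory
                 * rather than by segment count, so each list holds
                 * 32 GB / 1 GB = 32 segments and the type ends up with
                 * 64 GB / 32 GB = 2 lists (further capped by
                 * max_seglists_per_type).
                 */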

                /* calculate how many segments we will need in total */
                max_segs_per_type = max_mem_per_type / pagesz;
                /* limit number of segments to maximum allowed per type */
                max_segs_per_type = RTE_MIN(max_segs_per_type,
                                (unsigned int)RTE_MAX_MEMSEG_PER_TYPE);
                /* limit number of segments to maximum allowed per list */
                max_segs_per_list = RTE_MIN(max_segs_per_type,
                                (unsigned int)RTE_MAX_MEMSEG_PER_LIST);

                /* calculate how much memory we can have per segment list */
                max_mem_per_list = RTE_MIN(max_segs_per_list * pagesz,
                                (uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20);

                /* calculate how many segments each segment list will have */
                n_segs = RTE_MIN(max_segs_per_list, max_mem_per_list / pagesz);

                /* calculate how many segment lists we can have */
                n_seglists = RTE_MIN(max_segs_per_type / n_segs,
                                max_mem_per_type / max_mem_per_list);

                /* limit number of segment lists according to our maximum */
                n_seglists = RTE_MIN(n_seglists, max_seglists_per_type);

                RTE_LOG(DEBUG, EAL, "Creating %i segment lists: "
                                "n_segs:%i socket_id:%i hugepage_sz:%" PRIu64 "\n",
                        n_seglists, n_segs, socket_id, pagesz);

                /* create all segment lists */
                for (cur_seglist = 0; cur_seglist < n_seglists; cur_seglist++) {
                        if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
                                RTE_LOG(ERR, EAL,
                                        "No more space in memseg lists, please increase %s\n",
                                        RTE_STR(RTE_MAX_MEMSEG_LISTS));
                                goto out;
                        }
                        msl = &mcfg->memsegs[msl_idx++];

                        if (eal_memseg_list_init(msl, pagesz, n_segs,
                                        socket_id, cur_seglist, true))
                                goto out;

                        if (eal_memseg_list_alloc(msl, 0)) {
                                RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
                                goto out;
                        }
                }
        }
        /* we're successful */
        ret = 0;
out:
        free(memtypes);
        return ret;
}

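/*
 * Memseg list walk callback: for the page size recorded in the hugepage_info
 * passed via arg, accumulate the per-socket capacity (number of segments)
 * of every matching memseg list.
 */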
static int __rte_unused
hugepage_count_walk(const struct rte_memseg_list *msl, void *arg)
{
        struct hugepage_info *hpi = arg;

        if (msl->page_sz != hpi->hugepage_sz)
                return 0;

        hpi->num_pages[msl->socket_id] += msl->memseg_arr.len;
        return 0;
}

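/*
 * Allocation validator registered below for "--socket-limit": it is invoked
 * when an allocation would take a socket past its registered limit, and
 * returning -1 makes that allocation fail.
 */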
static int
limits_callback(int socket_id, size_t cur_limit, size_t new_len)
{
        RTE_SET_USED(socket_id);
        RTE_SET_USED(cur_limit);
        RTE_SET_USED(new_len);
        return -1;
}

int
eal_dynmem_hugepage_init(void)
{
        struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
        uint64_t memory[RTE_MAX_NUMA_NODES];
        int hp_sz_idx, socket_id;
        struct internal_config *internal_conf =
                eal_get_internal_configuration();

        memset(used_hp, 0, sizeof(used_hp));

        for (hp_sz_idx = 0;
                        hp_sz_idx < (int) internal_conf->num_hugepage_sizes;
                        hp_sz_idx++) {
#ifndef RTE_ARCH_64
                struct hugepage_info dummy;
                unsigned int i;
#endif
                /* also initialize hugepage sizes in used_hp */
                struct hugepage_info *hpi;
                hpi = &internal_conf->hugepage_info[hp_sz_idx];
                used_hp[hp_sz_idx].hugepage_sz = hpi->hugepage_sz;

#ifndef RTE_ARCH_64
                /* for 32-bit, limit number of pages on socket to whatever we've
                 * preallocated, as we cannot allocate more.
                 */
                memset(&dummy, 0, sizeof(dummy));
                dummy.hugepage_sz = hpi->hugepage_sz;
                if (rte_memseg_list_walk(hugepage_count_walk, &dummy) < 0)
                        return -1;

                for (i = 0; i < RTE_DIM(dummy.num_pages); i++) {
                        hpi->num_pages[i] = RTE_MIN(hpi->num_pages[i],
                                        dummy.num_pages[i]);
                }
#endif
        }

        /* make a copy of socket_mem, needed for balanced allocation. */
        for (hp_sz_idx = 0; hp_sz_idx < RTE_MAX_NUMA_NODES; hp_sz_idx++)
                memory[hp_sz_idx] = internal_conf->socket_mem[hp_sz_idx];

        /* calculate final number of pages */
        if (eal_dynmem_calc_num_pages_per_socket(memory,
                        internal_conf->hugepage_info, used_hp,
                        internal_conf->num_hugepage_sizes) < 0)
                return -1;

        for (hp_sz_idx = 0;
                        hp_sz_idx < (int)internal_conf->num_hugepage_sizes;
                        hp_sz_idx++) {
                for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES;
                                socket_id++) {
                        struct rte_memseg **pages;
                        struct hugepage_info *hpi = &used_hp[hp_sz_idx];
                        unsigned int num_pages = hpi->num_pages[socket_id];
                        unsigned int num_pages_alloc;

                        if (num_pages == 0)
                                continue;

                        RTE_LOG(DEBUG, EAL,
                                "Allocating %u pages of size %" PRIu64 "M "
                                "on socket %i\n",
                                num_pages, hpi->hugepage_sz >> 20, socket_id);

                        /* we may not be able to allocate all pages in one go,
                         * because we break up our memory map into multiple
                         * memseg lists. therefore, try allocating multiple
                         * times and see if we can get the desired number of
                         * pages from multiple allocations.
                         */

                        num_pages_alloc = 0;
                        do {
                                int i, cur_pages, needed;

                                needed = num_pages - num_pages_alloc;

                                pages = malloc(sizeof(*pages) * needed);
                                if (pages == NULL) {
                                        RTE_LOG(ERR, EAL, "Failed to malloc pages\n");
                                        return -1;
                                }

                                /* do not request exact number of pages */
                                cur_pages = eal_memalloc_alloc_seg_bulk(pages,
                                                needed, hpi->hugepage_sz,
                                                socket_id, false);
                                if (cur_pages <= 0) {
                                        free(pages);
                                        return -1;
                                }

                                /* mark preallocated pages as unfreeable */
                                for (i = 0; i < cur_pages; i++) {
                                        struct rte_memseg *ms = pages[i];
                                        ms->flags |=
                                                RTE_MEMSEG_FLAG_DO_NOT_FREE;
                                }
                                free(pages);

                                num_pages_alloc += cur_pages;
                        } while (num_pages_alloc != num_pages);
                }
        }

        /* if socket limits were specified, set them */
        if (internal_conf->force_socket_limits) {
                unsigned int i;
                for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
                        uint64_t limit = internal_conf->socket_limit[i];
                        if (limit == 0)
                                continue;
                        if (rte_mem_alloc_validator_register("socket-limit",
                                        limits_callback, i, limit))
                                RTE_LOG(ERR, EAL, "Failed to register socket limits validator callback\n");
                }
        }
        return 0;
}

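/* Return the total amount of detected hugepage memory on a given socket. */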
__rte_unused /* function is unused on 32-bit builds */
static inline uint64_t
get_socket_mem_size(int socket)
{
        uint64_t size = 0;
        unsigned int i;
        struct internal_config *internal_conf =
                eal_get_internal_configuration();

        for (i = 0; i < internal_conf->num_hugepage_sizes; i++) {
                struct hugepage_info *hpi = &internal_conf->hugepage_info[i];
                size += hpi->hugepage_sz * hpi->num_pages[socket];
        }

        return size;
}

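/*
 * Decide how many pages of each size to reserve on each socket so that the
 * per-socket (--socket-mem) or total (-m) memory request can be satisfied.
 * The result is written into hp_used; returns the total number of pages on
 * success, or -1 if the request cannot be met.
 */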
int
eal_dynmem_calc_num_pages_per_socket(
        uint64_t *memory, struct hugepage_info *hp_info,
        struct hugepage_info *hp_used, unsigned int num_hp_info)
{
        unsigned int socket, j, i = 0;
        unsigned int requested, available;
        int total_num_pages = 0;
        uint64_t remaining_mem, cur_mem;
        const struct internal_config *internal_conf =
                eal_get_internal_configuration();
        uint64_t total_mem = internal_conf->memory;

        if (num_hp_info == 0)
                return -1;

        /* if specific memory amounts per socket weren't requested */
        if (internal_conf->force_sockets == 0) {
                size_t total_size;
#ifdef RTE_ARCH_64
                int cpu_per_socket[RTE_MAX_NUMA_NODES];
                size_t default_size;
                unsigned int lcore_id;

                /* Compute number of cores per socket */
                memset(cpu_per_socket, 0, sizeof(cpu_per_socket));
                RTE_LCORE_FOREACH(lcore_id) {
                        cpu_per_socket[rte_lcore_to_socket_id(lcore_id)]++;
                }

                /*
                 * Automatically spread requested memory amongst detected
                 * sockets according to number of cores from CPU mask present
                 * on each socket.
                 */
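                /*
                 * E.g. (illustrative): requesting 4096 MB with 6 lcores on
                 * socket 0 and 2 lcores on socket 1 initially assigns
                 * 3072 MB to socket 0 and 1024 MB to socket 1, each capped by
                 * the hugepage memory actually available on that socket; any
                 * shortfall is then taken from whichever sockets still have
                 * memory left (second loop below).
                 */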
                total_size = internal_conf->memory;
                for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0;
                                socket++) {

                        /* Set memory amount per socket */
                        default_size = internal_conf->memory *
                                cpu_per_socket[socket] / rte_lcore_count();

                        /* Limit to maximum available memory on socket */
                        default_size = RTE_MIN(
                                default_size, get_socket_mem_size(socket));

                        /* Update sizes */
                        memory[socket] = default_size;
                        total_size -= default_size;
                }

                /*
                 * If some memory is remaining, try to allocate it by getting
                 * all available memory from sockets, one after the other.
                 */
                for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0;
                                socket++) {
                        /* take whatever is available */
                        default_size = RTE_MIN(
                                get_socket_mem_size(socket) - memory[socket],
                                total_size);

                        /* Update sizes */
                        memory[socket] += default_size;
                        total_size -= default_size;
                }
#else
                /* in 32-bit mode, allocate all of the memory only on the main
                 * lcore socket
                 */
                total_size = internal_conf->memory;
                for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0;
                                socket++) {
                        struct rte_config *cfg = rte_eal_get_configuration();
                        unsigned int main_lcore_socket;

                        main_lcore_socket =
                                rte_lcore_to_socket_id(cfg->main_lcore);

                        if (main_lcore_socket != socket)
                                continue;

                        /* Update sizes */
                        memory[socket] = total_size;
                        break;
                }
#endif
        }

        for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0;
                        socket++) {
                /* skip this socket if no memory was requested for it */
                for (i = 0; i < num_hp_info && memory[socket] != 0; i++) {
                        rte_strscpy(hp_used[i].hugedir, hp_info[i].hugedir,
                                sizeof(hp_used[i].hugedir));
                        hp_used[i].num_pages[socket] = RTE_MIN(
                                        memory[socket] / hp_info[i].hugepage_sz,
                                        hp_info[i].num_pages[socket]);

                        cur_mem = hp_used[i].num_pages[socket] *
                                        hp_used[i].hugepage_sz;

                        memory[socket] -= cur_mem;
                        total_mem -= cur_mem;

                        total_num_pages += hp_used[i].num_pages[socket];

                        /* check if we have met all memory requests */
                        if (memory[socket] == 0)
                                break;

                        /* Check if we have any more pages left at this size,
                         * if so, move on to next size.
                         */
                        if (hp_used[i].num_pages[socket] ==
                                        hp_info[i].num_pages[socket])
                                continue;
                        /* At this point we know that there are more pages
                         * available that are bigger than the memory we want,
                         * so let's see if we can get enough from other page
                         * sizes.
                         */
                        remaining_mem = 0;
                        for (j = i + 1; j < num_hp_info; j++)
                                remaining_mem += hp_info[j].hugepage_sz *
                                        hp_info[j].num_pages[socket];

                        /* Is there enough other memory?
                         * If not, allocate another page and quit.
                         */
                        if (remaining_mem < memory[socket]) {
                                cur_mem = RTE_MIN(
                                        memory[socket], hp_info[i].hugepage_sz);
                                memory[socket] -= cur_mem;
                                total_mem -= cur_mem;
                                hp_used[i].num_pages[socket]++;
                                total_num_pages++;
                                break; /* we are done with this socket */
                        }
                }

                /* if we didn't satisfy all memory requirements per socket */
                if (memory[socket] > 0 &&
                                internal_conf->socket_mem[socket] != 0) {
                        /* to prevent icc errors */
                        requested = (unsigned int)(
                                internal_conf->socket_mem[socket] / 0x100000);
                        available = requested -
                                ((unsigned int)(memory[socket] / 0x100000));
                        RTE_LOG(ERR, EAL, "Not enough memory available on "
                                "socket %u! Requested: %uMB, available: %uMB\n",
                                socket, requested, available);
                        return -1;
                }
        }

        /* if we didn't satisfy total memory requirements */
        if (total_mem > 0) {
                requested = (unsigned int)(internal_conf->memory / 0x100000);
                available = requested - (unsigned int)(total_mem / 0x100000);
                RTE_LOG(ERR, EAL, "Not enough memory available! "
                        "Requested: %uMB, available: %uMB\n",
                        requested, available);
                return -1;
        }
        return total_num_pages;
}