mem: extract common dynamic memory allocation
[dpdk.git] lib/librte_eal/common/eal_common_dynmem.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2014 Intel Corporation.
 * Copyright(c) 2013 6WIND S.A.
 */

#include <inttypes.h>
#include <string.h>

#include <rte_log.h>
#include <rte_string_fns.h>

#include "eal_internal_cfg.h"
#include "eal_memalloc.h"
#include "eal_memcfg.h"
#include "eal_private.h"

/** @file Functions common to EALs that support dynamic memory allocation. */

int
eal_dynmem_memseg_lists_init(void)
{
	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
	struct memtype {
		uint64_t page_sz;
		int socket_id;
	} *memtypes = NULL;
	int i, hpi_idx, msl_idx, ret = -1; /* fail unless told to succeed */
	struct rte_memseg_list *msl;
	uint64_t max_mem, max_mem_per_type;
	unsigned int max_seglists_per_type;
	unsigned int n_memtypes, cur_type;

	/* no-huge mode does not need this at all */
	if (internal_config.no_hugetlbfs)
		return 0;

	/*
	 * figuring out the amount of memory we're going to have is a long and
	 * very involved process. the basic element we're operating with is a
	 * memory type, defined as a combination of NUMA node ID and page size
	 * (so that e.g. 2 sockets with 2 page sizes yield 4 memory types in
	 * total).
	 *
	 * deciding the amount of memory going towards each memory type is a
	 * balancing act between maximum segments per type, maximum memory per
	 * type, and number of detected NUMA nodes. the goal is to make sure
	 * each memory type gets at least one memseg list.
	 *
	 * the total amount of memory is limited by the RTE_MAX_MEM_MB value.
	 *
	 * the total amount of memory per type is limited by either
	 * RTE_MAX_MEM_MB_PER_TYPE, or by RTE_MAX_MEM_MB divided by the number
	 * of detected NUMA nodes. additionally, the maximum number of segments
	 * per type is also limited by RTE_MAX_MEMSEG_PER_TYPE. this is because
	 * for smaller page sizes, it can take hundreds of thousands of
	 * segments to reach the above specified per-type memory limits.
	 *
	 * additionally, each type may have multiple memseg lists associated
	 * with it, each limited by either RTE_MAX_MEM_MB_PER_LIST for bigger
	 * page sizes, or RTE_MAX_MEMSEG_PER_LIST segments for smaller ones.
	 *
	 * the number of memseg lists per type is decided based on the above
	 * limits, and also takes the number of detected NUMA nodes into
	 * account, to make sure that we don't run out of memseg lists before
	 * we populate all NUMA nodes with memory.
	 *
	 * we do this in three stages. first, we collect the number of types.
	 * then, we figure out memory constraints and populate the list of
	 * would-be memseg lists. then, we go ahead and allocate the memseg
	 * lists.
	 */
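	/*
	 * illustrative example (the constants here are assumed purely for
	 * the walk-through, not taken from any particular build config):
	 * with 2 NUMA nodes and two page sizes (2M and 1G) there are
	 * 2 * 2 = 4 memory types. assuming RTE_MAX_MEM_MB corresponds to
	 * 512G and RTE_MAX_MEM_MB_PER_TYPE to 64G, each type is capped at
	 * min(64G, 512G / 4) = 64G, and with RTE_MAX_MEMSEG_LISTS assumed
	 * to be 128, each type may use at most 128 / 4 = 32 memseg lists.
	 */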

	/* create space for mem types */
	n_memtypes = internal_config.num_hugepage_sizes * rte_socket_count();
	memtypes = calloc(n_memtypes, sizeof(*memtypes));
	if (memtypes == NULL) {
		RTE_LOG(ERR, EAL, "Cannot allocate space for memory types\n");
		return -1;
	}

	/* populate mem types */
	cur_type = 0;
	for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes;
			hpi_idx++) {
		struct hugepage_info *hpi;
		uint64_t hugepage_sz;

		hpi = &internal_config.hugepage_info[hpi_idx];
		hugepage_sz = hpi->hugepage_sz;

		for (i = 0; i < (int) rte_socket_count(); i++, cur_type++) {
			int socket_id = rte_socket_id_by_idx(i);

#ifndef RTE_EAL_NUMA_AWARE_HUGEPAGES
			/* we can still sort pages by socket in legacy mode */
			if (!internal_config.legacy_mem && socket_id > 0)
				break;
#endif
			memtypes[cur_type].page_sz = hugepage_sz;
			memtypes[cur_type].socket_id = socket_id;

			RTE_LOG(DEBUG, EAL, "Detected memory type: "
				"socket_id:%u hugepage_sz:%" PRIu64 "\n",
				socket_id, hugepage_sz);
		}
	}
	/* number of memtypes could have been lower due to no NUMA support */
	n_memtypes = cur_type;

	/* set up limits for types */
	max_mem = (uint64_t)RTE_MAX_MEM_MB << 20;
	max_mem_per_type = RTE_MIN((uint64_t)RTE_MAX_MEM_MB_PER_TYPE << 20,
			max_mem / n_memtypes);
	/*
	 * limit maximum number of segment lists per type to ensure there's
	 * space for memseg lists for all NUMA nodes with all page sizes
	 */
	max_seglists_per_type = RTE_MAX_MEMSEG_LISTS / n_memtypes;

	if (max_seglists_per_type == 0) {
		RTE_LOG(ERR, EAL, "Cannot accommodate all memory types, please increase %s\n",
			RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
		goto out;
	}

	/* go through all mem types and create segment lists */
	msl_idx = 0;
	for (cur_type = 0; cur_type < n_memtypes; cur_type++) {
		unsigned int cur_seglist, n_seglists, n_segs;
		unsigned int max_segs_per_type, max_segs_per_list;
		struct memtype *type = &memtypes[cur_type];
		uint64_t max_mem_per_list, pagesz;
		int socket_id;

		pagesz = type->page_sz;
		socket_id = type->socket_id;

		/*
		 * we need to create segment lists for this type. we must take
		 * into account the following things:
		 *
		 * 1. total amount of memory we can use for this memory type
		 * 2. total amount of memory per memseg list allowed
		 * 3. number of segments needed to fit the amount of memory
		 * 4. number of segments allowed per type
		 * 5. number of segments allowed per memseg list
		 * 6. number of memseg lists we are allowed to take up
		 */
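		/*
		 * illustrative walk-through with assumed limits (not actual
		 * config values): for a 2M page type capped at 64G, we need
		 * 64G / 2M = 32768 segments. assuming RTE_MAX_MEMSEG_PER_TYPE
		 * does not bind and RTE_MAX_MEMSEG_PER_LIST is 8192, a list
		 * holds at most 8192 segments; with RTE_MAX_MEM_MB_PER_LIST
		 * assumed to be 32G, max_mem_per_list = min(8192 * 2M, 32G)
		 * = 16G, so n_segs = 8192 and n_seglists =
		 * min(32768 / 8192, 64G / 16G) = 4 lists for this type.
		 */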

		/* calculate how many segments we will need in total */
		max_segs_per_type = max_mem_per_type / pagesz;
		/* limit number of segments to maximum allowed per type */
		max_segs_per_type = RTE_MIN(max_segs_per_type,
				(unsigned int)RTE_MAX_MEMSEG_PER_TYPE);
		/* limit number of segments to maximum allowed per list */
		max_segs_per_list = RTE_MIN(max_segs_per_type,
				(unsigned int)RTE_MAX_MEMSEG_PER_LIST);

		/* calculate how much memory we can have per segment list */
		max_mem_per_list = RTE_MIN(max_segs_per_list * pagesz,
				(uint64_t)RTE_MAX_MEM_MB_PER_LIST << 20);

		/* calculate how many segments each segment list will have */
		n_segs = RTE_MIN(max_segs_per_list, max_mem_per_list / pagesz);

		/* calculate how many segment lists we can have */
		n_seglists = RTE_MIN(max_segs_per_type / n_segs,
				max_mem_per_type / max_mem_per_list);

		/* limit number of segment lists according to our maximum */
		n_seglists = RTE_MIN(n_seglists, max_seglists_per_type);

		RTE_LOG(DEBUG, EAL, "Creating %i segment lists: "
				"n_segs:%i socket_id:%i hugepage_sz:%" PRIu64 "\n",
			n_seglists, n_segs, socket_id, pagesz);

		/* create all segment lists */
		for (cur_seglist = 0; cur_seglist < n_seglists; cur_seglist++) {
			if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
				RTE_LOG(ERR, EAL,
					"No more space in memseg lists, please increase %s\n",
					RTE_STR(CONFIG_RTE_MAX_MEMSEG_LISTS));
				goto out;
			}
			msl = &mcfg->memsegs[msl_idx++];

			if (eal_memseg_list_init(msl, pagesz, n_segs,
					socket_id, cur_seglist, true))
				goto out;

			if (eal_memseg_list_alloc(msl, 0)) {
				RTE_LOG(ERR, EAL, "Cannot allocate VA space for memseg list\n");
				goto out;
			}
		}
	}
	/* we're successful */
	ret = 0;
out:
	free(memtypes);
	return ret;
}

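/*
 * Per-memseg-list walk callback: for lists matching the given page size,
 * add the number of segments in the list to the per-socket page count.
 * Used on 32-bit builds to cap allocations to what was preallocated.
 */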
static int __rte_unused
hugepage_count_walk(const struct rte_memseg_list *msl, void *arg)
{
	struct hugepage_info *hpi = arg;

	if (msl->page_sz != hpi->hugepage_sz)
		return 0;

	hpi->num_pages[msl->socket_id] += msl->memseg_arr.len;
	return 0;
}

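/*
 * Validator callback registered when per-socket memory limits are requested:
 * it is invoked for allocations that would take a socket past its configured
 * limit and always rejects them (hence the unused arguments).
 */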
static int
limits_callback(int socket_id, size_t cur_limit, size_t new_len)
{
	RTE_SET_USED(socket_id);
	RTE_SET_USED(cur_limit);
	RTE_SET_USED(new_len);
	return -1;
}

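/*
 * Preallocate hugepages in dynamic memory mode: work out how many pages of
 * each size are needed on each socket, allocate them up front and mark them
 * as not freeable at runtime, then register any per-socket limits.
 */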
int
eal_dynmem_hugepage_init(void)
{
	struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
	uint64_t memory[RTE_MAX_NUMA_NODES];
	int hp_sz_idx, socket_id;

	memset(used_hp, 0, sizeof(used_hp));

	for (hp_sz_idx = 0;
			hp_sz_idx < (int) internal_config.num_hugepage_sizes;
			hp_sz_idx++) {
#ifndef RTE_ARCH_64
		struct hugepage_info dummy;
		unsigned int i;
#endif
		/* also initialize hugepage sizes in used_hp */
		struct hugepage_info *hpi;
		hpi = &internal_config.hugepage_info[hp_sz_idx];
		used_hp[hp_sz_idx].hugepage_sz = hpi->hugepage_sz;

#ifndef RTE_ARCH_64
		/* for 32-bit, limit number of pages on socket to whatever we've
		 * preallocated, as we cannot allocate more.
		 */
		memset(&dummy, 0, sizeof(dummy));
		dummy.hugepage_sz = hpi->hugepage_sz;
		if (rte_memseg_list_walk(hugepage_count_walk, &dummy) < 0)
			return -1;

		for (i = 0; i < RTE_DIM(dummy.num_pages); i++) {
			hpi->num_pages[i] = RTE_MIN(hpi->num_pages[i],
					dummy.num_pages[i]);
		}
#endif
	}

	/* make a copy of socket_mem, needed for balanced allocation. */
	for (hp_sz_idx = 0; hp_sz_idx < RTE_MAX_NUMA_NODES; hp_sz_idx++)
		memory[hp_sz_idx] = internal_config.socket_mem[hp_sz_idx];

	/* calculate final number of pages */
	if (eal_dynmem_calc_num_pages_per_socket(memory,
			internal_config.hugepage_info, used_hp,
			internal_config.num_hugepage_sizes) < 0)
		return -1;

	for (hp_sz_idx = 0;
			hp_sz_idx < (int)internal_config.num_hugepage_sizes;
			hp_sz_idx++) {
		for (socket_id = 0; socket_id < RTE_MAX_NUMA_NODES;
				socket_id++) {
			struct rte_memseg **pages;
			struct hugepage_info *hpi = &used_hp[hp_sz_idx];
			unsigned int num_pages = hpi->num_pages[socket_id];
			unsigned int num_pages_alloc;

			if (num_pages == 0)
				continue;

			RTE_LOG(DEBUG, EAL,
				"Allocating %u pages of size %" PRIu64 "M "
				"on socket %i\n",
				num_pages, hpi->hugepage_sz >> 20, socket_id);

			/* we may not be able to allocate all pages in one go,
			 * because we break up our memory map into multiple
			 * memseg lists. therefore, try allocating multiple
			 * times and see if we can get the desired number of
			 * pages from multiple allocations.
			 */
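			/*
			 * e.g. a request for 10000 pages might come back as
			 * 8192 pages on the first attempt and 1808 on the
			 * second (numbers purely illustrative).
			 */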

			num_pages_alloc = 0;
			do {
				int i, cur_pages, needed;

				needed = num_pages - num_pages_alloc;

				pages = malloc(sizeof(*pages) * needed);
				if (pages == NULL)
					return -1;

				/* do not request exact number of pages */
				cur_pages = eal_memalloc_alloc_seg_bulk(pages,
						needed, hpi->hugepage_sz,
						socket_id, false);
				if (cur_pages <= 0) {
					free(pages);
					return -1;
				}

				/* mark preallocated pages as unfreeable */
				for (i = 0; i < cur_pages; i++) {
					struct rte_memseg *ms = pages[i];
					ms->flags |=
						RTE_MEMSEG_FLAG_DO_NOT_FREE;
				}
				free(pages);

				num_pages_alloc += cur_pages;
			} while (num_pages_alloc != num_pages);
		}
	}

	/* if socket limits were specified, set them */
	if (internal_config.force_socket_limits) {
		unsigned int i;
		for (i = 0; i < RTE_MAX_NUMA_NODES; i++) {
			uint64_t limit = internal_config.socket_limit[i];
			if (limit == 0)
				continue;
			if (rte_mem_alloc_validator_register("socket-limit",
					limits_callback, i, limit))
				RTE_LOG(ERR, EAL, "Failed to register socket limits validator callback\n");
		}
	}
	return 0;
}

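/* total preallocated hugepage memory (in bytes) available on a socket */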
__rte_unused /* function is unused on 32-bit builds */
static inline uint64_t
get_socket_mem_size(int socket)
{
	uint64_t size = 0;
	unsigned int i;

	for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
		struct hugepage_info *hpi = &internal_config.hugepage_info[i];
		size += hpi->hugepage_sz * hpi->num_pages[socket];
	}

	return size;
}

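/*
 * Distribute the requested memory over sockets and hugepage sizes: fill
 * hp_used with the number of pages of each size to use on each socket,
 * based on what was requested and what hp_info says is available.
 * Returns the total number of pages on success, or -1 on failure.
 */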
int
eal_dynmem_calc_num_pages_per_socket(
	uint64_t *memory, struct hugepage_info *hp_info,
	struct hugepage_info *hp_used, unsigned int num_hp_info)
{
	unsigned int socket, j, i = 0;
	unsigned int requested, available;
	int total_num_pages = 0;
	uint64_t remaining_mem, cur_mem;
	uint64_t total_mem = internal_config.memory;

	if (num_hp_info == 0)
		return -1;

	/* if specific memory amounts per socket weren't requested */
	if (internal_config.force_sockets == 0) {
		size_t total_size;
#ifdef RTE_ARCH_64
		int cpu_per_socket[RTE_MAX_NUMA_NODES];
		size_t default_size;
		unsigned int lcore_id;

		/* Compute number of cores per socket */
		memset(cpu_per_socket, 0, sizeof(cpu_per_socket));
		RTE_LCORE_FOREACH(lcore_id) {
			cpu_per_socket[rte_lcore_to_socket_id(lcore_id)]++;
		}

		/*
		 * Automatically spread requested memory amongst detected
		 * sockets according to number of cores from CPU mask present
		 * on each socket.
		 */
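		/*
		 * e.g. (illustrative numbers) with 3072M requested and a
		 * core mask that has 6 lcores on socket 0 and 2 lcores on
		 * socket 1, socket 0 initially gets 3072M * 6 / 8 = 2304M
		 * and socket 1 gets 768M, each capped by the hugepage
		 * memory actually available on that socket.
		 */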
		total_size = internal_config.memory;
		for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0;
				socket++) {

			/* Set memory amount per socket */
			default_size = internal_config.memory *
				cpu_per_socket[socket] / rte_lcore_count();

			/* Limit to maximum available memory on socket */
			default_size = RTE_MIN(
				default_size, get_socket_mem_size(socket));

			/* Update sizes */
			memory[socket] = default_size;
			total_size -= default_size;
		}

		/*
		 * If some memory is remaining, try to allocate it by getting
		 * all available memory from sockets, one after the other.
		 */
		for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0;
				socket++) {
			/* take whatever is available */
			default_size = RTE_MIN(
				get_socket_mem_size(socket) - memory[socket],
				total_size);

			/* Update sizes */
			memory[socket] += default_size;
			total_size -= default_size;
		}
#else
		/* in 32-bit mode, allocate all of the memory only on the
		 * master lcore socket
		 */
		total_size = internal_config.memory;
		for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_size != 0;
				socket++) {
			struct rte_config *cfg = rte_eal_get_configuration();
			unsigned int master_lcore_socket;

			master_lcore_socket =
				rte_lcore_to_socket_id(cfg->master_lcore);

			if (master_lcore_socket != socket)
				continue;

			/* Update sizes */
			memory[socket] = total_size;
			break;
		}
#endif
	}

	for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0;
			socket++) {
		/* skip this socket if no memory was requested for it */
		for (i = 0; i < num_hp_info && memory[socket] != 0; i++) {
			rte_strscpy(hp_used[i].hugedir, hp_info[i].hugedir,
				sizeof(hp_used[i].hugedir));
			hp_used[i].num_pages[socket] = RTE_MIN(
					memory[socket] / hp_info[i].hugepage_sz,
					hp_info[i].num_pages[socket]);

			cur_mem = hp_used[i].num_pages[socket] *
					hp_used[i].hugepage_sz;

			memory[socket] -= cur_mem;
			total_mem -= cur_mem;

			total_num_pages += hp_used[i].num_pages[socket];

			/* check if we have met all memory requests */
			if (memory[socket] == 0)
				break;

			/* If we have used up all pages of this size,
			 * move on to the next size.
			 */
			if (hp_used[i].num_pages[socket] ==
					hp_info[i].num_pages[socket])
				continue;
			/* At this point we know that there are more pages
			 * available that are bigger than the memory we want,
			 * so let's see if we can get enough from other page
			 * sizes.
			 */
			remaining_mem = 0;
			for (j = i+1; j < num_hp_info; j++)
				remaining_mem += hp_info[j].hugepage_sz *
				hp_info[j].num_pages[socket];

			/* Is there enough other memory?
			 * If not, allocate another page and quit.
			 */
			if (remaining_mem < memory[socket]) {
				cur_mem = RTE_MIN(
					memory[socket], hp_info[i].hugepage_sz);
				memory[socket] -= cur_mem;
				total_mem -= cur_mem;
				hp_used[i].num_pages[socket]++;
				total_num_pages++;
				break; /* we are done with this socket */
			}
		}

		/* if we didn't satisfy all memory requirements per socket */
		if (memory[socket] > 0 &&
				internal_config.socket_mem[socket] != 0) {
			/* to prevent icc errors */
			requested = (unsigned int)(
				internal_config.socket_mem[socket] / 0x100000);
			available = requested -
				((unsigned int)(memory[socket] / 0x100000));
			RTE_LOG(ERR, EAL, "Not enough memory available on "
				"socket %u! Requested: %uMB, available: %uMB\n",
				socket, requested, available);
			return -1;
		}
	}

	/* if we didn't satisfy total memory requirements */
	if (total_mem > 0) {
		requested = (unsigned int)(internal_config.memory / 0x100000);
		available = requested - (unsigned int)(total_mem / 0x100000);
		RTE_LOG(ERR, EAL, "Not enough memory available! "
			"Requested: %uMB, available: %uMB\n",
			requested, available);
		return -1;
	}
	return total_num_pages;
}