From: Intel
Date: Wed, 19 Dec 2012 23:00:00 +0000 (+0100)
Subject: memory: add --socket-mem option
X-Git-Tag: spdx-start~11385
X-Git-Url: http://git.droids-corp.org/?a=commitdiff_plain;h=b6a468ad41d5;p=dpdk.git

memory: add --socket-mem option

On NUMA systems, --socket-mem makes it possible to select the node
where allocations will go.

Signed-off-by: Intel
---

diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 9096a5d6ca..61dc70a93e 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -80,11 +80,14 @@
 #define OPT_NO_PCI "no-pci"
 #define OPT_NO_HUGE "no-huge"
 #define OPT_FILE_PREFIX "file-prefix"
+#define OPT_SOCKET_MEM "socket-mem"
 
 #define RTE_EAL_BLACKLIST_SIZE 0x100
 
 #define MEMSIZE_IF_NO_HUGE_PAGE (64ULL * 1024ULL * 1024ULL)
 
+#define SOCKET_MEM_STRLEN (RTE_MAX_NUMA_NODES * 10)
+
 #define GET_BLACKLIST_FIELD(in, fd, lim, dlm) \
 { \
 	unsigned long val; \
@@ -293,6 +296,8 @@ eal_usage(const char *prgname)
 	       "                 (multiple -b options are allowed)\n"
 	       "  -m MB        : memory to allocate (see also --"OPT_SOCKET_MEM")\n"
 	       "  -r NUM       : force number of memory ranks (don't detect)\n"
+	       "  --"OPT_SOCKET_MEM" : memory to allocate on specific \n"
+	       "                 sockets (use comma separated values)\n"
 	       "  --"OPT_HUGE_DIR"   : directory where hugetlbfs is mounted\n"
 	       "  --"OPT_PROC_TYPE"  : type of this process\n"
 	       "  --"OPT_FILE_PREFIX": prefix for hugepage filenames\n"
@@ -339,16 +344,69 @@ eal_parse_coremask(const char *coremask)
 	return 0;
 }
 
+static int
+eal_parse_socket_mem(char *socket_mem)
+{
+	char * arg[RTE_MAX_NUMA_NODES];
+	char *end;
+	int arg_num, i, len;
+	uint64_t total_mem = 0;
+
+	len = strnlen(socket_mem, SOCKET_MEM_STRLEN);
+	if (len == SOCKET_MEM_STRLEN) {
+		RTE_LOG(ERR, EAL, "--socket-mem is too long\n");
+		return -1;
+	}
+
+	/* all other error cases will be caught later */
+	if (!isdigit(socket_mem[len-1]))
+		return -1;
+
+	/* split the optarg into separate socket values */
+	arg_num = rte_strsplit(socket_mem, len,
+			arg, RTE_MAX_NUMA_NODES, ',');
+
+	/* if split failed, or 0 arguments */
+	if (arg_num <= 0)
+		return -1;
+
+	internal_config.force_sockets = 1;
+
+	/* parse each defined socket option */
+	errno = 0;
+	for (i = 0; i < arg_num; i++) {
+		end = NULL;
+		internal_config.socket_mem[i] = strtoull(arg[i], &end, 10);
+
+		/* check for invalid input */
+		if ((errno != 0) ||
+				(arg[i][0] == '\0') || (end == NULL) || (*end != '\0'))
+			return -1;
+		internal_config.socket_mem[i] *= 1024ULL;
+		internal_config.socket_mem[i] *= 1024ULL;
+		total_mem += internal_config.socket_mem[i];
+	}
+
+	/* check if we have a positive amount of total memory */
+	if (total_mem == 0)
+		return -1;
+
+	return 0;
+}
+
 static inline uint64_t
 eal_get_hugepage_mem_size(void)
 {
 	uint64_t size = 0;
-	unsigned i;
+	unsigned i, j;
 
-	for (i = 0; i < internal_config.num_hugepage_sizes; i++){
+	for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
 		struct hugepage_info *hpi = &internal_config.hugepage_info[i];
-		if (hpi->hugedir != NULL)
-			size += hpi->hugepage_sz * hpi->num_pages;
+		if (hpi->hugedir != NULL) {
+			for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
+				size += hpi->hugepage_sz * hpi->num_pages[j];
+			}
+		}
 	}
 
 	return (size);
@@ -401,7 +459,7 @@ eal_parse_blacklist_opt(const char *optarg, size_t idx)
 static int
 eal_parse_args(int argc, char **argv)
 {
-	int opt, ret;
+	int opt, ret, i;
 	char **argvopt;
 	int option_index;
 	int coremask_ok = 0;
@@ -415,6 +473,7 @@ eal_parse_args(int argc, char **argv)
 		{OPT_NO_SHCONF, 0, 0, 0},
 		{OPT_PROC_TYPE, 1, 0, 0},
 		{OPT_FILE_PREFIX, 1, 0, 0},
+		{OPT_SOCKET_MEM, 1, 0, 0},
 		{0, 0, 0, 0}
 	};
 
@@ -425,11 +484,15 @@ eal_parse_args(int argc, char **argv)
 	internal_config.force_nchannel = 0;
 	internal_config.hugefile_prefix = HUGEFILE_PREFIX_DEFAULT;
 	internal_config.hugepage_dir = NULL;
+	internal_config.force_sockets = 0;
 #ifdef RTE_LIBEAL_USE_HPET
 	internal_config.no_hpet = 0;
 #else
 	internal_config.no_hpet = 1;
 #endif
+	/* zero out the NUMA config */
+	for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
+		internal_config.socket_mem[i] = 0;
 
 	while ((opt = getopt_long(argc, argvopt, "b:c:m:n:r:v",
 				  lgopts, &option_index)) != EOF) {
@@ -508,6 +571,14 @@ eal_parse_args(int argc, char **argv)
 			else if (!strcmp(lgopts[option_index].name, OPT_FILE_PREFIX)) {
 				internal_config.hugefile_prefix = optarg;
 			}
+			else if (!strcmp(lgopts[option_index].name, OPT_SOCKET_MEM)) {
+				if (eal_parse_socket_mem(optarg) < 0) {
+					RTE_LOG(ERR, EAL, "invalid parameters for --"
+							OPT_SOCKET_MEM "\n");
+					eal_usage(prgname);
+					return -1;
+				}
+			}
 			break;
 
 		default:
@@ -541,6 +612,21 @@ eal_parse_args(int argc, char **argv)
 		eal_usage(prgname);
 		return -1;
 	}
+	if (internal_config.memory > 0 && internal_config.force_sockets == 1) {
+		RTE_LOG(ERR, EAL, "Options -m and --socket-mem cannot be specified "
+				"at the same time\n");
+		eal_usage(prgname);
+		return -1;
+	}
+	/* --no-huge doesn't make sense with either -m or --socket-mem */
+	if (internal_config.no_hugetlbfs &&
+			(internal_config.memory > 0 ||
+			internal_config.force_sockets == 1)) {
+		RTE_LOG(ERR, EAL, "Options -m or --socket-mem cannot be specified "
+				"together with --no-huge!\n");
+		eal_usage(prgname);
+		return -1;
+	}
 
 	if (blacklist_index > 0)
 		rte_eal_pci_set_blacklist(eal_dev_blacklist, blacklist_index);
@@ -548,11 +634,35 @@ eal_parse_args(int argc, char **argv)
 	if (optind >= 0)
 		argv[optind-1] = prgname;
 
+	/* if no memory amounts were requested, this will result in 0 and
+	 * will be overridden later, right after eal_hugepage_info_init() */
+	for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
+		internal_config.memory += internal_config.socket_mem[i];
+
 	ret = optind-1;
 	optind = 0; /* reset getopt lib */
 	return ret;
 }
 
+static void
+eal_check_mem_on_local_socket(void)
+{
+	const struct rte_memseg *ms;
+	int i, socket_id;
+
+	socket_id = rte_lcore_to_socket_id(rte_config.master_lcore);
+
+	ms = rte_eal_get_physmem_layout();
+
+	for (i = 0; i < RTE_MAX_MEMSEG; i++)
+		if (ms[i].socket_id == socket_id &&
+				ms[i].len > 0)
+			return;
+
+	RTE_LOG(WARNING, EAL, "WARNING: Master core has no "
+			"memory on local socket!\n");
+}
+
 /* Launch threads, called at application init(). */
 int
 rte_eal_init(int argc, char **argv)
@@ -572,7 +682,7 @@ rte_eal_init(int argc, char **argv)
 	if (eal_hugepage_info_init() < 0)
 		rte_panic("Cannot get hugepage information\n");
 
-	if (internal_config.memory == 0) {
+	if (internal_config.memory == 0 && internal_config.force_sockets == 0) {
 		if (internal_config.no_hugetlbfs)
 			internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE;
 		else
@@ -612,6 +722,8 @@ rte_eal_init(int argc, char **argv)
 	RTE_LOG(DEBUG, EAL, "Master core %u is ready (tid=%x)\n",
 		rte_config.master_lcore, (int)thread_id);
 
+	eal_check_mem_on_local_socket();
+
 	RTE_LCORE_FOREACH_SLAVE(i) {
 
 		/*
diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
index 5632ecee9c..5a0e51a7ff 100644
--- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
+++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
@@ -174,6 +174,11 @@ swap_hpi(struct hugepage_info *a, struct hugepage_info *b)
 	memcpy(b, buf, sizeof(*a));
 }
 
+/*
+ * when we initialize the hugepage info, everything goes
+ * to socket 0 by default. it will later get sorted by the memory
+ * initialization procedure.
+ */
 int
 eal_hugepage_info_init(void)
 {
@@ -192,16 +197,27 @@ eal_hugepage_info_init(void)
 			struct hugepage_info *hpi = \
 					&internal_config.hugepage_info[num_sizes];
 			hpi->hugepage_sz = rte_str_to_size(&dirent->d_name[dirent_start_len]);
-			hpi->num_pages = get_num_hugepages(dirent->d_name);
 			hpi->hugedir = get_hugepage_dir(hpi->hugepage_sz);
+
+			/* first, check if we have a mountpoint */
 			if (hpi->hugedir == NULL){
 				RTE_LOG(INFO, EAL, "%u hugepages of size %llu reserved, "\
 						"but no mounted hugetlbfs found for that size\n",
-						hpi->num_pages,
+						(unsigned) get_num_hugepages(dirent->d_name),
 						(unsigned long long)hpi->hugepage_sz);
-				hpi->num_pages = 0;
-			} else
+			} else {
+				/* for now, put all pages into socket 0,
+				 * later they will be sorted */
+				hpi->num_pages[0] = get_num_hugepages(dirent->d_name);
+
+#ifndef RTE_ARCH_X86_64
+				/* for 32-bit systems, limit number of hugepages to 1GB per page size */
+				hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0],
+						RTE_PGSIZE_1G / hpi->hugepage_sz);
+#endif
+
 				num_sizes++;
+			}
 		}
 		dirent = readdir(dir);
 	}
@@ -221,8 +237,9 @@ eal_hugepage_info_init(void)
 	/* now we have all info, check we have at least one valid size */
 	for (i = 0; i < num_sizes; i++)
 		if (internal_config.hugepage_info[i].hugedir != NULL &&
-				internal_config.hugepage_info[i].num_pages > 0)
+				internal_config.hugepage_info[i].num_pages[0] > 0)
 			return 0;
 
+	/* no valid hugepage mounts available, return error */
 	return -1;
 }
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 511c0a7258..7357a128fb 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -176,7 +176,7 @@ map_all_hugepages(struct hugepage *hugepg_tbl,
 	void *vma_addr = NULL;
 	uint64_t vma_len = 0;
 
-	for (i = 0; i < hpi->num_pages; i++) {
+	for (i = 0; i < hpi->num_pages[0]; i++) {
 		uint64_t hugepage_sz = hpi->hugepage_sz;
 
 		if (orig) {
@@ -203,7 +203,7 @@ map_all_hugepages(struct hugepage *hugepg_tbl,
 			/* reserve a virtual area for next contiguous
 			 * physical block: count the number of
 			 * contiguous physical pages. */
-			for (j = i+1; j < hpi->num_pages ; j++) {
+			for (j = i+1; j < hpi->num_pages[0] ; j++) {
 				if (hugepg_tbl[j].physaddr !=
 				    hugepg_tbl[j-1].physaddr + hugepage_sz)
 					break;
@@ -255,7 +255,7 @@ static int
 unmap_all_hugepages_orig(struct hugepage *hugepg_tbl, struct hugepage_info *hpi)
 {
 	unsigned i;
-	for (i = 0; i < hpi->num_pages; i++) {
+	for (i = 0; i < hpi->num_pages[0]; i++) {
 		if (hugepg_tbl[i].orig_va) {
 			munmap(hugepg_tbl[i].orig_va, hpi->hugepage_sz);
 			hugepg_tbl[i].orig_va = NULL;
@@ -287,7 +287,7 @@ find_physaddr(struct hugepage *hugepg_tbl, struct hugepage_info *hpi)
 		return -1;
 	}
 
-	for (i = 0; i < hpi->num_pages; i++) {
+	for (i = 0; i < hpi->num_pages[0]; i++) {
 		off_t offset;
 		virt_pfn = (unsigned long)hugepg_tbl[i].orig_va /
 			page_size;
@@ -377,7 +377,7 @@ find_numasocket(struct hugepage *hugepg_tbl, struct hugepage_info *hpi)
 		}
 
 		/* if we find this page in our mappings, set socket_id */
-		for (i = 0; i < hpi->num_pages; i++) {
+		for (i = 0; i < hpi->num_pages[0]; i++) {
 			void *va = (void *)(unsigned long)virt_addr;
 			if (hugepg_tbl[i].orig_va == va) {
 				hugepg_tbl[i].socket_id = socket_id;
@@ -385,8 +385,10 @@ find_numasocket(struct hugepage *hugepg_tbl, struct hugepage_info *hpi)
 			}
 		}
 	}
-	if (hp_count < hpi->num_pages)
+
+	if (hp_count < hpi->num_pages[0])
 		goto error;
+
 	fclose(f);
 	return 0;
@@ -408,7 +410,7 @@ sort_by_physaddr(struct hugepage *hugepg_tbl, struct hugepage_info *hpi)
 	uint64_t smallest_addr;
 	struct hugepage tmp;
 
-	for (i = 0; i < hpi->num_pages; i++) {
+	for (i = 0; i < hpi->num_pages[0]; i++) {
 		smallest_addr = 0;
 		smallest_idx = -1;
 
@@ -416,7 +418,7 @@ sort_by_physaddr(struct hugepage *hugepg_tbl, struct hugepage_info *hpi)
 		 * browse all entries starting at 'i', and find the
 		 * entry with the smallest addr
 		 */
-		for (j=i; j<hpi->num_pages; j++) {
+		for (j=i; j< hpi->num_pages[0]; j++) {
 			if (smallest_addr == 0 ||
 			    hugepg_tbl[j].physaddr < smallest_addr) {
@@ -461,54 +463,180 @@ create_shared_memory(const char *filename, const size_t mem_size)
 }
 
 /*
- * This function takes in the list of hugepage sizes and the
+ * this copies *active* hugepages from one hugepage table to another.
+ * destination is typically the shared memory.
+ */
+static int
+copy_hugepages_to_shared_mem(struct hugepage * dst, int dest_size,
+		const struct hugepage * src, int src_size)
+{
+	int src_pos, dst_pos = 0;
+
+	for (src_pos = 0; src_pos < src_size; src_pos++) {
+		if (src[src_pos].final_va != NULL) {
+			/* error on overflow attempt */
+			if (dst_pos == dest_size)
+				return -1;
+			memcpy(&dst[dst_pos], &src[src_pos], sizeof(struct hugepage));
+			dst_pos++;
+		}
+	}
+	return 0;
+}
+
+/*
+ * unmaps hugepages that are not going to be used. since we originally allocate
+ * ALL hugepages (not just those we need), additional unmapping needs to be done.
+ */
+static int
+unmap_unneeded_hugepages(struct hugepage *hugepg_tbl,
+		struct hugepage_info *hpi,
+		unsigned num_hp_info)
+{
+	unsigned socket, size;
+	int page, nrpages = 0;
+	int fd;
+
+	/* get total number of hugepages */
+	for (size = 0; size < num_hp_info; size++)
+		for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++)
+			nrpages += internal_config.hugepage_info[size].num_pages[socket];
+
+	for (size = 0; size < num_hp_info; size++) {
+		for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) {
+			unsigned pages_found = 0;
+			/* traverse until we have unmapped all the unused pages */
+			for (page = 0; page < nrpages; page++) {
+				struct hugepage *hp = &hugepg_tbl[page];
+
+				/* find a page that matches the criteria */
+				if ((hp->size == hpi[size].hugepage_sz) &&
+						(hp->socket_id == (int) socket)) {
+
+					/* if we skipped enough pages, unmap the rest */
+					if (pages_found == hpi[size].num_pages[socket]) {
+						munmap(hp->final_va, hp->size);
+						hp->final_va = NULL;
+					}
+					else {
+						pages_found++;
+					}
+				} /* match page */
+			} /* foreach page */
+		} /* foreach socket */
+	} /* foreach pagesize */
+
+	return 0;
+}
+
+static inline uint64_t
+get_socket_mem_size(int socket)
+{
+	uint64_t size = 0;
+	unsigned i;
+
+	for (i = 0; i < internal_config.num_hugepage_sizes; i++){
+		struct hugepage_info *hpi = &internal_config.hugepage_info[i];
+		if (hpi->hugedir != NULL)
+			size += hpi->hugepage_sz * hpi->num_pages[socket];
+	}
+
+	return (size);
+}
+
+/*
+ * This function is a NUMA-aware equivalent of calc_num_pages.
+ * It takes in the list of hugepage sizes and the
  * number of pages thereof, and calculates the best number of
  * pages of each size to fulfill the request for ram
  */
 static int
-calc_num_pages(uint64_t memory,
+calc_num_pages_per_socket(uint64_t * memory,
 		struct hugepage_info *hp_info,
 		struct hugepage_info *hp_used,
 		unsigned num_hp_info)
 {
-	unsigned i = 0;
+	unsigned socket, j, i = 0;
+	unsigned requested, available;
 	int total_num_pages = 0;
+	uint64_t remaining_mem, cur_mem;
+	uint64_t total_mem = internal_config.memory;
+
 	if (num_hp_info == 0)
 		return -1;
 
-	for (i = 0; i < num_hp_info; i++){
-		hp_used[i].hugepage_sz = hp_info[i].hugepage_sz;
-		hp_used[i].hugedir = hp_info[i].hugedir;
-		hp_used[i].num_pages = RTE_MIN(memory / hp_info[i].hugepage_sz,
-				hp_info[i].num_pages);
-
-		memory -= hp_used[i].num_pages * hp_used[i].hugepage_sz;
-		total_num_pages += hp_used[i].num_pages;
-
-		/* check if we have met all memory requests */
-		if (memory == 0)
-			break;
-		/* check if we have any more pages left at this size, if so
-		 * move on to next size */
-		if (hp_used[i].num_pages == hp_info[i].num_pages)
-			continue;
-		/* At this point we know that there are more pages available that are
-		 * bigger than the memory we want, so lets see if we can get enough
-		 * from other page sizes.
-		 */
-		unsigned j;
-		uint64_t remaining_mem = 0;
-		for (j = i+1; j < num_hp_info; j++)
-			remaining_mem += hp_info[j].hugepage_sz * hp_info[j].num_pages;
-
-		/* is there enough other memory, if not allocate another page and quit*/
-		if (remaining_mem < memory){
-			memory -= hp_info[i].hugepage_sz;
-			hp_used[i].num_pages++;
-			total_num_pages++;
-			break; /* we are done */
+	for (socket = 0; socket < RTE_MAX_NUMA_NODES && total_mem != 0; socket++) {
+		/* if specific memory amounts per socket weren't requested */
+		if (internal_config.force_sockets == 0) {
+			/* take whatever is available */
+			memory[socket] = RTE_MIN(get_socket_mem_size(socket),
+					total_mem);
+		}
+		/* skips if the memory on specific socket wasn't requested */
+		for (i = 0; i < num_hp_info && memory[socket] != 0; i++){
+			hp_used[i].hugedir = hp_info[i].hugedir;
+			hp_used[i].num_pages[socket] = RTE_MIN(
+					memory[socket] / hp_info[i].hugepage_sz,
+					hp_info[i].num_pages[socket]);
+
+			cur_mem = hp_used[i].num_pages[socket] *
+					hp_used[i].hugepage_sz;
+
+			memory[socket] -= cur_mem;
+			total_mem -= cur_mem;
+
+			total_num_pages += hp_used[i].num_pages[socket];
+
+			/* check if we have met all memory requests */
+			if (memory[socket] == 0)
+				break;
+
+			/* check if we have any more pages left at this size, if so
+			 * move on to next size */
+			if (hp_used[i].num_pages[socket] == hp_info[i].num_pages[socket])
+				continue;
+			/* At this point we know that there are more pages available that are
+			 * bigger than the memory we want, so let's see if we can get enough
+			 * from other page sizes.
+			 */
+			remaining_mem = 0;
+			for (j = i+1; j < num_hp_info; j++)
+				remaining_mem += hp_info[j].hugepage_sz *
+						hp_info[j].num_pages[socket];
+
+			/* is there enough other memory, if not allocate another page and quit */
+			if (remaining_mem < memory[socket]){
+				cur_mem = RTE_MIN(memory[socket],
+						hp_info[i].hugepage_sz);
+				memory[socket] -= cur_mem;
+				total_mem -= cur_mem;
+				hp_used[i].num_pages[socket]++;
+				total_num_pages++;
+				break; /* we are done with this socket */
+			}
+		}
+		/* if we didn't satisfy all memory requirements per socket */
+		if (memory[socket] > 0) {
+			/* to prevent icc errors */
+			requested = (unsigned) (internal_config.socket_mem[socket] /
+					0x100000);
+			available = requested -
+					((unsigned) (memory[socket] / 0x100000));
+			RTE_LOG(INFO, EAL, "Not enough memory available on socket %u! "
+					"Requested: %uMB, available: %uMB\n", socket,
+					requested, available);
+			return -1;
 		}
 	}
+
+	/* if we didn't satisfy total memory requirements */
+	if (total_mem > 0) {
+		requested = (unsigned) (internal_config.memory / 0x100000);
+		available = requested - (unsigned) (total_mem / 0x100000);
+		RTE_LOG(INFO, EAL, "Not enough memory available! Requested: %uMB,"
Requested: %uMB," + " available: %uMB\n", requested, available); + return -1; + } return total_num_pages; } @@ -527,10 +655,14 @@ static int rte_eal_hugepage_init(void) { struct rte_mem_config *mcfg; - struct hugepage *hugepage; + struct hugepage *hugepage, *tmp_hp = NULL; struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES]; + + uint64_t memory[RTE_MAX_NUMA_NODES]; + + unsigned hp_offset; int i, j, new_memseg; - int nrpages; + int nrpages, total_pages = 0; void *addr; memset(used_hp, 0, sizeof(used_hp)); @@ -541,66 +673,169 @@ rte_eal_hugepage_init(void) /* for debug purposes, hugetlbfs can be disabled */ if (internal_config.no_hugetlbfs) { addr = malloc(internal_config.memory); - mcfg->memseg[0].phys_addr = (unsigned long)addr; + mcfg->memseg[0].phys_addr = (phys_addr_t)(uintptr_t)addr; mcfg->memseg[0].addr = addr; mcfg->memseg[0].len = internal_config.memory; mcfg->memseg[0].socket_id = 0; return 0; } - nrpages = calc_num_pages(internal_config.memory, - &internal_config.hugepage_info[0], &used_hp[0], - internal_config.num_hugepage_sizes); - for (i = 0; i < (int)internal_config.num_hugepage_sizes; i++) - RTE_LOG(INFO, EAL, "Requesting %u pages of size %"PRIu64"\n", - used_hp[i].num_pages, used_hp[i].hugepage_sz); - hugepage = create_shared_memory(eal_hugepage_info_path(), - nrpages * sizeof(struct hugepage)); - if (hugepage == NULL) - return -1; - memset(hugepage, 0, nrpages * sizeof(struct hugepage)); + /* calculate total number of hugepages available. at this point we haven't + * yet started sorting them so they all are on socket 0 */ + for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) { + /* meanwhile, also initialize used_hp hugepage sizes in used_hp */ + used_hp[i].hugepage_sz = internal_config.hugepage_info[i].hugepage_sz; + + total_pages += internal_config.hugepage_info[i].num_pages[0]; + } + + /* + * allocate a memory area for hugepage table. + * this isn't shared memory yet. due to the fact that we need some + * processing done on these pages, shared memory will be created + * at a later stage. 
+	 */
+	tmp_hp = malloc(total_pages * sizeof(struct hugepage));
+	if (tmp_hp == NULL)
+		goto fail;
+
+	memset(tmp_hp, 0, total_pages * sizeof(struct hugepage));
 
-	unsigned hp_offset = 0; /* where we start the current page size entries */
+	hp_offset = 0; /* where we start the current page size entries */
+
+	/* map all hugepages and sort them */
 	for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
-		struct hugepage_info *hpi = &used_hp[i];
+		struct hugepage_info *hpi;
+
+		/*
+		 * we don't yet mark hugepages as used at this stage, so
+		 * we just map all hugepages available to the system
+		 * all hugepages are still located on socket 0
+		 */
+		hpi = &internal_config.hugepage_info[i];
+
 		if (hpi->num_pages == 0)
 			continue;
 
-		if (map_all_hugepages(&hugepage[hp_offset], hpi, 1) < 0){
+		/* map all hugepages available */
+		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
 			RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
 					(unsigned)(hpi->hugepage_sz / 0x100000));
 			goto fail;
 		}
 
-		if (find_physaddr(&hugepage[hp_offset], hpi) < 0){
+		/* find physical addresses and sockets for each hugepage */
+		if (find_physaddr(&tmp_hp[hp_offset], hpi) < 0){
 			RTE_LOG(DEBUG, EAL, "Failed to find phys addr for %u MB pages\n",
 					(unsigned)(hpi->hugepage_sz / 0x100000));
 			goto fail;
 		}
 
-		if (find_numasocket(&hugepage[hp_offset], hpi) < 0){
+		if (find_numasocket(&tmp_hp[hp_offset], hpi) < 0){
 			RTE_LOG(DEBUG, EAL, "Failed to find NUMA socket for %u MB pages\n",
 					(unsigned)(hpi->hugepage_sz / 0x100000));
 			goto fail;
 		}
 
-		if (sort_by_physaddr(&hugepage[hp_offset], hpi) < 0)
+		if (sort_by_physaddr(&tmp_hp[hp_offset], hpi) < 0)
 			goto fail;
 
-		if (map_all_hugepages(&hugepage[hp_offset], hpi, 0) < 0){
+		/* remap all hugepages */
+		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) < 0){
 			RTE_LOG(DEBUG, EAL, "Failed to remap %u MB pages\n",
 					(unsigned)(hpi->hugepage_sz / 0x100000));
 			goto fail;
 		}
 
-		if (unmap_all_hugepages_orig(&hugepage[hp_offset], hpi) < 0)
+		/* unmap original mappings */
+		if (unmap_all_hugepages_orig(&tmp_hp[hp_offset], hpi) < 0)
 			goto fail;
 
 		/* we have processed a num of hugepages of this size, so inc offset */
-		hp_offset += hpi->num_pages;
+		hp_offset += hpi->num_pages[0];
 	}
 
+	/* clean out the numbers of pages */
+	for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++)
+		for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
+			internal_config.hugepage_info[i].num_pages[j] = 0;
+
+	/* get hugepages for each socket */
+	for (i = 0; i < total_pages; i++) {
+		int socket = tmp_hp[i].socket_id;
+
+		/* find a hugepage info with right size and increment num_pages */
+		for (j = 0; j < (int) internal_config.num_hugepage_sizes; j++) {
+			if (tmp_hp[i].size ==
+					internal_config.hugepage_info[j].hugepage_sz) {
+				internal_config.hugepage_info[j].num_pages[socket]++;
+			}
+		}
+	}
+
+	/* make a copy of socket_mem, needed for number of pages calculation */
+	for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
+		memory[i] = internal_config.socket_mem[i];
+
+	/* calculate final number of pages */
+	nrpages = calc_num_pages_per_socket(memory,
+			internal_config.hugepage_info, used_hp,
+			internal_config.num_hugepage_sizes);
+
+	/* error if not enough memory available */
+	if (nrpages < 0)
+		goto fail;
+
+	/* reporting in! */
+	for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
+		for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
+			if (used_hp[i].num_pages[j] > 0) {
+				RTE_LOG(INFO, EAL,
+						"Requesting %u pages of size %uMB"
+						" from socket %i\n",
+						used_hp[i].num_pages[j],
+						(unsigned)
+						(used_hp[i].hugepage_sz / 0x100000),
+						j);
+			}
+		}
+	}
+
+	/* create shared memory */
+	hugepage = create_shared_memory(eal_hugepage_info_path(),
+			nrpages * sizeof(struct hugepage));
+
+	if (hugepage == NULL) {
+		RTE_LOG(ERR, EAL, "Failed to create shared memory!\n");
+		goto fail;
+	}
+
+	/*
+	 * unmap pages that we won't need (looks at used_hp).
+	 * also, sets final_va to NULL on pages that were unmapped.
+	 */
+	if (unmap_unneeded_hugepages(tmp_hp, used_hp,
+			internal_config.num_hugepage_sizes) < 0) {
+		RTE_LOG(ERR, EAL, "Unmapping and locking hugepages failed!\n");
+		goto fail;
+	}
+
+	/*
+	 * copy stuff from malloc'd hugepage* to the actual shared memory.
+	 * this procedure only copies those hugepages that have final_va
+	 * not NULL. has overflow protection.
+	 */
+	if (copy_hugepages_to_shared_mem(hugepage, nrpages,
+			tmp_hp, total_pages) < 0) {
+		RTE_LOG(ERR, EAL, "Copying tables to shared memory failed!\n");
+		goto fail;
+	}
+
+	/* free the temporary hugepage table */
+	free(tmp_hp);
+	tmp_hp = NULL;
+
 	memset(mcfg->memseg, 0, sizeof(mcfg->memseg));
 	j = -1;
 	for (i = 0; i < nrpages; i++) {
@@ -614,10 +849,10 @@ rte_eal_hugepage_init(void)
 		else if (hugepage[i].size != hugepage[i-1].size)
 			new_memseg = 1;
 		else if ((hugepage[i].physaddr - hugepage[i-1].physaddr) !=
-		    hugepage[i].size)
+				hugepage[i].size)
 			new_memseg = 1;
 		else if (((unsigned long)hugepage[i].final_va -
-		    (unsigned long)hugepage[i-1].final_va) != hugepage[i].size)
+				(unsigned long)hugepage[i-1].final_va) != hugepage[i].size)
 			new_memseg = 1;
 
 		if (new_memseg) {
@@ -641,7 +876,9 @@ rte_eal_hugepage_init(void)
 
 	return 0;
 
- fail:
+fail:
+	if (tmp_hp)
+		free(tmp_hp);
 	return -1;
 }
@@ -783,6 +1020,7 @@ rte_eal_memdevice_init(void)
 int
 rte_eal_memory_init(void)
 {
+	RTE_LOG(INFO, EAL, "Setting up hugepage memory...\n");
 	const int retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
 			rte_eal_hugepage_init() :
 			rte_eal_hugepage_attach();
diff --git a/lib/librte_eal/linuxapp/eal/include/eal_internal_cfg.h b/lib/librte_eal/linuxapp/eal/include/eal_internal_cfg.h
index fc7da3edb5..d25a47b720 100644
--- a/lib/librte_eal/linuxapp/eal/include/eal_internal_cfg.h
+++ b/lib/librte_eal/linuxapp/eal/include/eal_internal_cfg.h
@@ -40,6 +40,8 @@
 #ifndef _EAL_LINUXAPP_INTERNAL_CFG
 #define _EAL_LINUXAPP_INTERNAL_CFG
 
+#include <rte_memory.h>
+
 #define MAX_HUGEPAGE_SIZES 3  /**< support up to 3 page sizes */
 
 /*
@@ -49,7 +51,8 @@
 struct hugepage_info {
 	uint64_t hugepage_sz;   /**< size of a huge page */
 	const char *hugedir;    /**< dir where hugetlbfs is mounted */
-	uint32_t num_pages;     /**< number of hugepages of that size */
+	uint32_t num_pages[RTE_MAX_NUMA_NODES];
+				/**< number of hugepages of that size on each socket */
 };
 
 /**
@@ -64,6 +67,9 @@ struct internal_config {
 	volatile unsigned no_hpet;        /**< true to disable HPET */
 	volatile unsigned no_shconf;      /**< true if there is no shared config */
 	volatile enum rte_proc_type_t process_type; /* multi-process proc type */
+	/* true to try allocating memory on specific sockets */
+	volatile unsigned force_sockets;
+	volatile uint64_t socket_mem[RTE_MAX_NUMA_NODES]; /**< amount of memory per socket */
 	const char *hugefile_prefix;      /**< the base filename of hugetlbfs files */
 	const char *hugepage_dir;         /**< specific hugetlbfs directory to use */
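
---

For illustration, a minimal sketch of how an application might request per-socket
memory through this option. This is not part of the patch: the application name,
coremask and memory-channel count are hypothetical values, and a two-socket NUMA
host is assumed. As the parsing code above shows, --socket-mem takes one
comma-separated value per socket, in megabytes, and is mutually exclusive with
both -m and --no-huge.

	/* A minimal sketch, assuming a two-socket NUMA host and the DPDK EAL
	 * headers of this era on the include path. The application name "example",
	 * coremask 0x1 and 4 memory channels are hypothetical. */
	#include <rte_eal.h>
	#include <rte_debug.h>

	int
	main(void)
	{
		/* request 512 MB on socket 0 and 256 MB on socket 1; each value
		 * is multiplied by 1024*1024 in eal_parse_socket_mem() */
		char *eal_args[] = {
			"example", "-c", "0x1", "-n", "4",
			"--socket-mem", "512,256",
		};
		int eal_argc = sizeof(eal_args) / sizeof(eal_args[0]);

		if (rte_eal_init(eal_argc, eal_args) < 0)
			rte_panic("Cannot init EAL\n");

		return 0;
	}

With force_sockets set by this option, calc_num_pages_per_socket() reserves pages
strictly on the requested nodes and fails with "Not enough memory available on
socket ..." rather than silently falling back to another node.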