X-Git-Url: http://git.droids-corp.org/?a=blobdiff_plain;f=lib%2Flibrte_eal%2Flinuxapp%2Feal%2Feal_memory.c;h=79d1d2dd8033b0892f7415736ec4cfa1f6d193c2;hb=bd6aa172cf35046e197b3a42a79069d4da15813a;hp=8d1edd911b80c098d51ccd3379ad6028ceaeea8f;hpb=0748be2cf9a216fb8cd529b17fa9128839d6c025;p=dpdk.git diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c index 8d1edd911b..79d1d2dd80 100644 --- a/lib/librte_eal/linuxapp/eal/eal_memory.c +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c @@ -1,13 +1,13 @@ /*- * BSD LICENSE - * + * * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. * All rights reserved. - * + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: - * + * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright @@ -17,7 +17,7 @@ * * Neither the name of Intel Corporation nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. - * + * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR @@ -85,7 +85,6 @@ #include #include #include -#include #include #include #include @@ -98,6 +97,13 @@ #include "eal_filesystem.h" #include "eal_hugepages.h" +#ifdef RTE_LIBRTE_XEN_DOM0 +int rte_xen_dom0_supported(void) +{ + return internal_config.xen_dom0_support; +} +#endif + /** * @file * Huge page mapping under linux @@ -112,8 +118,28 @@ static uint64_t baseaddr_offset; +static unsigned proc_pagemap_readable; + #define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space" +static void +test_proc_pagemap_readable(void) +{ + int fd = open("/proc/self/pagemap", O_RDONLY); + + if (fd < 0) { + RTE_LOG(ERR, EAL, + "Cannot open /proc/self/pagemap: %s. " + "virt2phys address translation will not work\n", + strerror(errno)); + return; + } + + /* Is readable */ + close(fd); + proc_pagemap_readable = 1; +} + /* Lock page in physical memory and prevent from swapping. */ int rte_mem_lock_page(const void *virt) @@ -136,6 +162,10 @@ rte_mem_virt2phy(const void *virtaddr) int page_size; off_t offset; + /* Cannot parse /proc/self/pagemap, no need to log errors everywhere */ + if (!proc_pagemap_readable) + return RTE_BAD_PHYS_ADDR; + /* standard page size */ page_size = getpagesize(); @@ -240,7 +270,7 @@ get_virtual_area(size_t *size, size_t hugepage_sz) } else addr = NULL; - RTE_LOG(INFO, EAL, "Ask a virtual area of 0x%zx bytes\n", *size); + RTE_LOG(DEBUG, EAL, "Ask a virtual area of 0x%zx bytes\n", *size); fd = open("/dev/zero", O_RDONLY); if (fd < 0){ @@ -256,7 +286,8 @@ get_virtual_area(size_t *size, size_t hugepage_sz) if (addr == MAP_FAILED) { close(fd); - RTE_LOG(INFO, EAL, "Cannot get a virtual area\n"); + RTE_LOG(ERR, EAL, "Cannot get a virtual area: %s\n", + strerror(errno)); return NULL; } @@ -269,7 +300,7 @@ get_virtual_area(size_t *size, size_t hugepage_sz) aligned_addr &= (~(hugepage_sz - 1)); addr = (void *)(aligned_addr); - RTE_LOG(INFO, EAL, "Virtual area found at %p (size = 0x%zx)\n", + RTE_LOG(DEBUG, EAL, "Virtual area found at %p (size = 0x%zx)\n", addr, *size); /* increment offset */ @@ -300,7 +331,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, #endif for (i = 0; i < hpi->num_pages[0]; i++) { - size_t hugepage_sz = hpi->hugepage_sz; + uint64_t hugepage_sz = hpi->hugepage_sz; if (orig) { hugepg_tbl[i].file_id = i; @@ -316,11 +347,12 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, #endif hugepg_tbl[i].filepath[sizeof(hugepg_tbl[i].filepath) - 1] = '\0'; } -#ifndef RTE_ARCH_X86_64 - /* for 32-bit systems, don't remap 1G pages, just reuse original - * map address as final map address. +#ifndef RTE_ARCH_64 + /* for 32-bit systems, don't remap 1G and 16G pages, just reuse + * original map address as final map address. */ - else if (hugepage_sz == RTE_PGSIZE_1G){ + else if ((hugepage_sz == RTE_PGSIZE_1G) + || (hugepage_sz == RTE_PGSIZE_16G)) { hugepg_tbl[i].final_va = hugepg_tbl[i].orig_va; hugepg_tbl[i].orig_va = NULL; continue; @@ -335,9 +367,17 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, * physical block: count the number of * contiguous physical pages. */ for (j = i+1; j < hpi->num_pages[0] ; j++) { +#ifdef RTE_ARCH_PPC_64 + /* The physical addresses are sorted in + * descending order on PPC64 */ + if (hugepg_tbl[j].physaddr != + hugepg_tbl[j-1].physaddr - hugepage_sz) + break; +#else if (hugepg_tbl[j].physaddr != hugepg_tbl[j-1].physaddr + hugepage_sz) break; +#endif } num_pages = j - i; vma_len = num_pages * hugepage_sz; @@ -359,8 +399,10 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, return -1; } + /* map the segment, and populate page tables, + * the kernel fills this segment with zeros */ virtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE, - MAP_SHARED, fd, 0); + MAP_SHARED | MAP_POPULATE, fd, 0); if (virtaddr == MAP_FAILED) { RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__, strerror(errno)); @@ -370,7 +412,6 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, if (orig) { hugepg_tbl[i].orig_va = virtaddr; - memset(virtaddr, 0, hugepage_sz); } else { hugepg_tbl[i].final_va = virtaddr; @@ -412,11 +453,12 @@ remap_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) while (i < hpi->num_pages[0]) { -#ifndef RTE_ARCH_X86_64 - /* for 32-bit systems, don't remap 1G pages, just reuse original - * map address as final map address. +#ifndef RTE_ARCH_64 + /* for 32-bit systems, don't remap 1G pages and 16G pages, + * just reuse original map address as final map address. */ - if (hugepage_sz == RTE_PGSIZE_1G){ + if ((hugepage_sz == RTE_PGSIZE_1G) + || (hugepage_sz == RTE_PGSIZE_16G)) { hugepg_tbl[i].final_va = hugepg_tbl[i].orig_va; hugepg_tbl[i].orig_va = NULL; i++; @@ -428,8 +470,17 @@ remap_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) * physical block: count the number of * contiguous physical pages. */ for (j = i+1; j < hpi->num_pages[0] ; j++) { - if (hugepg_tbl[j].physaddr != hugepg_tbl[j-1].physaddr + hugepage_sz) +#ifdef RTE_ARCH_PPC_64 + /* The physical addresses are sorted in descending + * order on PPC64 */ + if (hugepg_tbl[j].physaddr != + hugepg_tbl[j-1].physaddr - hugepage_sz) break; +#else + if (hugepg_tbl[j].physaddr != + hugepg_tbl[j-1].physaddr + hugepage_sz) + break; +#endif } num_pages = j - i; vma_len = num_pages * hugepage_sz; @@ -479,22 +530,16 @@ remap_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) old_addr = vma_addr; - /* map new, bigger segment */ + /* map new, bigger segment, and populate page tables, + * the kernel fills this segment with zeros */ vma_addr = mmap(vma_addr, total_size, - PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, 0); if (vma_addr == MAP_FAILED || vma_addr != old_addr) { RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__, strerror(errno)); close(fd); return -1; } - - /* touch the page. this is needed because kernel postpones mapping - * creation until the first page fault. with this, we pin down - * the page and it is marked as used and gets into process' pagemap. - */ - for (offset = 0; offset < total_size; offset += hugepage_sz) - *((volatile uint8_t*) RTE_PTR_ADD(vma_addr, offset)); } /* set shared flock on the file. */ @@ -505,7 +550,7 @@ remap_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) return -1; } - rte_snprintf(hugepg_tbl[page_idx].filepath, MAX_HUGEPAGE_PATH, "%s", + snprintf(hugepg_tbl[page_idx].filepath, MAX_HUGEPAGE_PATH, "%s", filepath); physaddr = rte_mem_virt2phy(vma_addr); @@ -542,9 +587,6 @@ remap_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) } } - /* zero out the whole segment */ - memset(hugepg_tbl[page_idx].final_va, 0, total_size); - page_idx++; } @@ -586,13 +628,13 @@ find_numasocket(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) f = fopen("/proc/self/numa_maps", "r"); if (f == NULL) { - RTE_LOG(INFO, EAL, "cannot open /proc/self/numa_maps," + RTE_LOG(NOTICE, EAL, "cannot open /proc/self/numa_maps," " consider that all memory is in socket_id 0\n"); return 0; } - rte_snprintf(hugedir_str, sizeof(hugedir_str), - "%s/", hpi->hugedir); + snprintf(hugedir_str, sizeof(hugedir_str), + "%s/%s", hpi->hugedir, internal_config.hugefile_prefix); /* parse numa map */ while (fgets(buf, sizeof(buf), f) != NULL) { @@ -651,49 +693,23 @@ error: return -1; } -/* - * Sort the hugepg_tbl by physical address (lower addresses first). We - * use a slow algorithm, but we won't have millions of pages, and this - * is only done at init time. - */ static int -sort_by_physaddr(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi) +cmp_physaddr(const void *a, const void *b) { - unsigned i, j; - int smallest_idx; - uint64_t smallest_addr; - struct hugepage_file tmp; - - for (i = 0; i < hpi->num_pages[0]; i++) { - smallest_addr = 0; - smallest_idx = -1; - - /* - * browse all entries starting at 'i', and find the - * entry with the smallest addr - */ - for (j=i; j< hpi->num_pages[0]; j++) { - - if (smallest_addr == 0 || - hugepg_tbl[j].physaddr < smallest_addr) { - smallest_addr = hugepg_tbl[j].physaddr; - smallest_idx = j; - } - } - - /* should not happen */ - if (smallest_idx == -1) { - RTE_LOG(ERR, EAL, "%s(): error in physaddr sorting\n", __func__); - return -1; - } - - /* swap the 2 entries in the table */ - memcpy(&tmp, &hugepg_tbl[smallest_idx], sizeof(struct hugepage_file)); - memcpy(&hugepg_tbl[smallest_idx], &hugepg_tbl[i], - sizeof(struct hugepage_file)); - memcpy(&hugepg_tbl[i], &tmp, sizeof(struct hugepage_file)); - } - return 0; +#ifndef RTE_ARCH_PPC_64 + const struct hugepage_file *p1 = (const struct hugepage_file *)a; + const struct hugepage_file *p2 = (const struct hugepage_file *)b; +#else + /* PowerPC needs memory sorted in reverse order from x86 */ + const struct hugepage_file *p1 = (const struct hugepage_file *)b; + const struct hugepage_file *p2 = (const struct hugepage_file *)a; +#endif + if (p1->physaddr < p2->physaddr) + return -1; + else if (p1->physaddr > p2->physaddr) + return 1; + else + return 0; } /* @@ -738,6 +754,30 @@ copy_hugepages_to_shared_mem(struct hugepage_file * dst, int dest_size, return 0; } +static int +unlink_hugepage_files(struct hugepage_file *hugepg_tbl, + unsigned num_hp_info) +{ + unsigned socket, size; + int page, nrpages = 0; + + /* get total number of hugepages */ + for (size = 0; size < num_hp_info; size++) + for (socket = 0; socket < RTE_MAX_NUMA_NODES; socket++) + nrpages += + internal_config.hugepage_info[size].num_pages[socket]; + + for (page = 0; page < nrpages; page++) { + struct hugepage_file *hp = &hugepg_tbl[page]; + + if (hp->final_va != NULL && unlink(hp->filepath)) { + RTE_LOG(WARNING, EAL, "%s(): Removing %s failed: %s\n", + __func__, hp->filepath, strerror(errno)); + } + } + return 0; +} + /* * unmaps hugepages that are not going to be used. since we originally allocate * ALL hugepages (not just those we need), additional unmapping needs to be done. @@ -857,7 +897,7 @@ get_socket_mem_size(int socket) size += hpi->hugepage_sz * hpi->num_pages[socket]; } - return (size); + return size; } /* @@ -978,7 +1018,7 @@ calc_num_pages_per_socket(uint64_t * memory, 0x100000); available = requested - ((unsigned) (memory[socket] / 0x100000)); - RTE_LOG(INFO, EAL, "Not enough memory available on socket %u! " + RTE_LOG(ERR, EAL, "Not enough memory available on socket %u! " "Requested: %uMB, available: %uMB\n", socket, requested, available); return -1; @@ -989,7 +1029,7 @@ calc_num_pages_per_socket(uint64_t * memory, if (total_mem > 0) { requested = (unsigned) (internal_config.memory / 0x100000); available = requested - (unsigned) (total_mem / 0x100000); - RTE_LOG(INFO, EAL, "Not enough memory available! Requested: %uMB," + RTE_LOG(ERR, EAL, "Not enough memory available! Requested: %uMB," " available: %uMB\n", requested, available); return -1; } @@ -1007,7 +1047,7 @@ calc_num_pages_per_socket(uint64_t * memory, * 6. unmap the first mapping * 7. fill memsegs in configuration with contiguous zones */ -static int +int rte_eal_hugepage_init(void) { struct rte_mem_config *mcfg; @@ -1024,6 +1064,8 @@ rte_eal_hugepage_init(void) int new_pages_count[MAX_HUGEPAGE_SIZES]; #endif + test_proc_pagemap_readable(); + memset(used_hp, 0, sizeof(used_hp)); /* get pointer to global configuration */ @@ -1031,11 +1073,18 @@ rte_eal_hugepage_init(void) /* hugetlbfs can be disabled */ if (internal_config.no_hugetlbfs) { - addr = malloc(internal_config.memory); + addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE, + MAP_LOCKED | MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + if (addr == MAP_FAILED) { + RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__, + strerror(errno)); + return -1; + } mcfg->memseg[0].phys_addr = (phys_addr_t)(uintptr_t)addr; mcfg->memseg[0].addr = addr; + mcfg->memseg[0].hugepage_sz = RTE_PGSIZE_4K; mcfg->memseg[0].len = internal_config.memory; - mcfg->memseg[0].socket_id = SOCKET_ID_ANY; + mcfg->memseg[0].socket_id = 0; return 0; } @@ -1050,7 +1099,6 @@ rte_eal_hugepage_init(void) #endif } - /* calculate total number of hugepages available. at this point we haven't * yet started sorting them so they all are on socket 0 */ for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) { @@ -1108,8 +1156,8 @@ rte_eal_hugepage_init(void) goto fail; } - if (sort_by_physaddr(&tmp_hp[hp_offset], hpi) < 0) - goto fail; + qsort(&tmp_hp[hp_offset], hpi->num_pages[0], + sizeof(struct hugepage_file), cmp_physaddr); #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS /* remap all hugepages into single file segments */ @@ -1159,7 +1207,9 @@ rte_eal_hugepage_init(void) int socket = tmp_hp[i].socket_id; /* find a hugepage info with right size and increment num_pages */ - for (j = 0; j < (int) internal_config.num_hugepage_sizes; j++) { + const int nb_hpsizes = RTE_MIN(MAX_HUGEPAGE_SIZES, + (int)internal_config.num_hugepage_sizes); + for (j = 0; j < nb_hpsizes; j++) { if (tmp_hp[i].size == internal_config.hugepage_info[j].hugepage_sz) { #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS @@ -1189,13 +1239,13 @@ rte_eal_hugepage_init(void) for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) { for (j = 0; j < RTE_MAX_NUMA_NODES; j++) { if (used_hp[i].num_pages[j] > 0) { - RTE_LOG(INFO, EAL, - "Requesting %u pages of size %uMB" - " from socket %i\n", - used_hp[i].num_pages[j], - (unsigned) - (used_hp[i].hugepage_sz / 0x100000), - j); + RTE_LOG(DEBUG, EAL, + "Requesting %u pages of size %uMB" + " from socket %i\n", + used_hp[i].num_pages[j], + (unsigned) + (used_hp[i].hugepage_sz / 0x100000), + j); } } } @@ -1231,6 +1281,13 @@ rte_eal_hugepage_init(void) goto fail; } + /* free the hugepage backing files */ + if (internal_config.hugepage_unlink && + unlink_hugepage_files(tmp_hp, internal_config.num_hugepage_sizes) < 0) { + RTE_LOG(ERR, EAL, "Unlinking hugepage files failed!\n"); + goto fail; + } + /* free the temporary hugepage table */ free(tmp_hp); tmp_hp = NULL; @@ -1254,12 +1311,25 @@ rte_eal_hugepage_init(void) new_memseg = 1; else if (hugepage[i].size != hugepage[i-1].size) new_memseg = 1; + +#ifdef RTE_ARCH_PPC_64 + /* On PPC64 architecture, the mmap always start from higher + * virtual address to lower address. Here, both the physical + * address and virtual address are in descending order */ + else if ((hugepage[i-1].physaddr - hugepage[i].physaddr) != + hugepage[i].size) + new_memseg = 1; + else if (((unsigned long)hugepage[i-1].final_va - + (unsigned long)hugepage[i].final_va) != hugepage[i].size) + new_memseg = 1; +#else else if ((hugepage[i].physaddr - hugepage[i-1].physaddr) != hugepage[i].size) new_memseg = 1; else if (((unsigned long)hugepage[i].final_va - (unsigned long)hugepage[i-1].final_va) != hugepage[i].size) new_memseg = 1; +#endif if (new_memseg) { j += 1; @@ -1278,6 +1348,12 @@ rte_eal_hugepage_init(void) } /* continuation of previous memseg */ else { +#ifdef RTE_ARCH_PPC_64 + /* Use the phy and virt address of the last page as segment + * address for IBM Power architecture */ + mcfg->memseg[j].phys_addr = hugepage[i].physaddr; + mcfg->memseg[j].addr = hugepage[i].final_va; +#endif mcfg->memseg[j].len += mcfg->memseg[j].hugepage_sz; } hugepage[i].memseg_id = j; @@ -1291,14 +1367,13 @@ rte_eal_hugepage_init(void) "of memory.\n", i, nr_hugefiles, RTE_STR(CONFIG_RTE_MAX_MEMSEG), RTE_MAX_MEMSEG); - return (-ENOMEM); + return -ENOMEM; } return 0; fail: - if (tmp_hp) - free(tmp_hp); + free(tmp_hp); return -1; } @@ -1320,7 +1395,7 @@ getFileSize(int fd) * configuration and finds the hugepages which form that segment, mapping them * in order to form a contiguous block in the virtual memory space */ -static int +int rte_eal_hugepage_attach(void) { const struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config; @@ -1337,6 +1412,8 @@ rte_eal_hugepage_attach(void) "into secondary processes\n"); } + test_proc_pagemap_readable(); + if (internal_config.xen_dom0_support) { #ifdef RTE_LIBRTE_XEN_DOM0 if (rte_xen_dom0_memory_attach() < 0) { @@ -1480,36 +1557,3 @@ error: close(fd_hugepage); return -1; } - -static int -rte_eal_memdevice_init(void) -{ - struct rte_config *config; - - if (rte_eal_process_type() == RTE_PROC_SECONDARY) - return 0; - - config = rte_eal_get_configuration(); - config->mem_config->nchannel = internal_config.force_nchannel; - config->mem_config->nrank = internal_config.force_nrank; - - return 0; -} - - -/* init memory subsystem */ -int -rte_eal_memory_init(void) -{ - RTE_LOG(INFO, EAL, "Setting up memory...\n"); - const int retval = rte_eal_process_type() == RTE_PROC_PRIMARY ? - rte_eal_hugepage_init() : - rte_eal_hugepage_attach(); - if (retval < 0) - return -1; - - if (internal_config.no_shconf == 0 && rte_eal_memdevice_init() < 0) - return -1; - - return 0; -}