[dpdk-dev] eal/ppc: fix secondary process to map hugepages in correct order

Message ID 1457360003-30055-1-git-send-email-gowrishankar.m@linux.vnet.ibm.com (mailing list archive)
State Rejected, archived
Delegated to: Thomas Monjalon
Headers

Commit Message

Gowrishankar March 7, 2016, 2:13 p.m. UTC
  From: Gowri Shankar <gowrishankar.m@linux.vnet.ibm.com>

For a secondary process address space to map hugepages from every segment of
the primary process, hugepage_file entries have to be mapped in the reverse
order of the list that the primary process updated for every segment. This is
because, on ppc64, hugepages are sorted in descending address order.

Signed-off-by: Gowrishankar <gowrishankar.m@linux.vnet.ibm.com>
---
 lib/librte_eal/linuxapp/eal/eal_memory.c |   26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)
  

Comments

Gowrishankar March 17, 2016, 5:05 a.m. UTC | #1
Could this patch be reviewed, please?

Thanks,
Gowrishankar

On Monday 07 March 2016 07:43 PM, Gowrishankar wrote:
> From: Gowri Shankar <gowrishankar.m@linux.vnet.ibm.com>
>
> For a secondary process address space to map hugepages from every segment of
> primary process, hugepage_file entries has to be mapped reversely from the
> list that primary process updated for every segment. This is for a reason that,
> in ppc64, hugepages are sorted for decrementing addresses.
>
> Signed-off-by: Gowrishankar <gowrishankar.m@linux.vnet.ibm.com>
> ---
>   lib/librte_eal/linuxapp/eal/eal_memory.c |   26 ++++++++++++++++----------
>   1 file changed, 16 insertions(+), 10 deletions(-)
>
> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
> index 5b9132c..6aea5d0 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
> @@ -1400,7 +1400,7 @@ rte_eal_hugepage_attach(void)
>   {
>   	const struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
>   	const struct hugepage_file *hp = NULL;
> -	unsigned num_hp = 0;
> +	unsigned num_hp = 0, mapped_hp = 0;
>   	unsigned i, s = 0; /* s used to track the segment number */
>   	off_t size;
>   	int fd, fd_zero = -1, fd_hugepage = -1;
> @@ -1486,14 +1486,12 @@ rte_eal_hugepage_attach(void)
>   		goto error;
>   	}
>
> -	num_hp = size / sizeof(struct hugepage_file);
> -	RTE_LOG(DEBUG, EAL, "Analysing %u files\n", num_hp);
> -
>   	s = 0;
>   	while (s < RTE_MAX_MEMSEG && mcfg->memseg[s].len > 0){
>   		void *addr, *base_addr;
>   		uintptr_t offset = 0;
>   		size_t mapping_size;
> +		unsigned int index;
>   #ifdef RTE_LIBRTE_IVSHMEM
>   		/*
>   		 * if segment has ioremap address set, it's an IVSHMEM segment and
> @@ -1504,6 +1502,8 @@ rte_eal_hugepage_attach(void)
>   			continue;
>   		}
>   #endif
> +		num_hp = mcfg->memseg[s].len / mcfg->memseg[s].hugepage_sz;
> +		RTE_LOG(DEBUG, EAL, "Analysing %u files in segment %u\n", num_hp, s);
>   		/*
>   		 * free previously mapped memory so we can map the
>   		 * hugepages into the space
> @@ -1514,18 +1514,23 @@ rte_eal_hugepage_attach(void)
>   		/* find the hugepages for this segment and map them
>   		 * we don't need to worry about order, as the server sorted the
>   		 * entries before it did the second mmap of them */
> +#ifdef RTE_ARCH_PPC_64
> +		for (i = num_hp-1; i < num_hp && offset < mcfg->memseg[s].len; i--){
> +#else
>   		for (i = 0; i < num_hp && offset < mcfg->memseg[s].len; i++){
> -			if (hp[i].memseg_id == (int)s){
> -				fd = open(hp[i].filepath, O_RDWR);
> +#endif
> +			index = i + mapped_hp;
> +			if (hp[index].memseg_id == (int)s){
> +				fd = open(hp[index].filepath, O_RDWR);
>   				if (fd < 0) {
>   					RTE_LOG(ERR, EAL, "Could not open %s\n",
> -						hp[i].filepath);
> +						hp[index].filepath);
>   					goto error;
>   				}
>   #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
> -				mapping_size = hp[i].size * hp[i].repeated;
> +				mapping_size = hp[index].size * hp[index].repeated;
>   #else
> -				mapping_size = hp[i].size;
> +				mapping_size = hp[index].size;
>   #endif
>   				addr = mmap(RTE_PTR_ADD(base_addr, offset),
>   						mapping_size, PROT_READ | PROT_WRITE,
> @@ -1534,7 +1539,7 @@ rte_eal_hugepage_attach(void)
>   				if (addr == MAP_FAILED ||
>   						addr != RTE_PTR_ADD(base_addr, offset)) {
>   					RTE_LOG(ERR, EAL, "Could not mmap %s\n",
> -						hp[i].filepath);
> +						hp[index].filepath);
>   					goto error;
>   				}
>   				offset+=mapping_size;
> @@ -1543,6 +1548,7 @@ rte_eal_hugepage_attach(void)
>   		RTE_LOG(DEBUG, EAL, "Mapped segment %u of size 0x%llx\n", s,
>   				(unsigned long long)mcfg->memseg[s].len);
>   		s++;
> +		mapped_hp += num_hp;
>   	}
>   	/* unmap the hugepage config file, since we are done using it */
>   	munmap((void *)(uintptr_t)hp, size);
  
Thomas Monjalon March 22, 2016, 11:36 a.m. UTC | #2
Sergio, your help is required here.
Thanks

2016-03-17 10:35, gowrishankar:
> Could this patch be reviewed please.
> 
> Thanks,
> Gowrishankar
> 
> On Monday 07 March 2016 07:43 PM, Gowrishankar wrote:
> > From: Gowri Shankar <gowrishankar.m@linux.vnet.ibm.com>
> >
> > For a secondary process address space to map hugepages from every segment of
> > primary process, hugepage_file entries has to be mapped reversely from the
> > list that primary process updated for every segment. This is for a reason that,
> > in ppc64, hugepages are sorted for decrementing addresses.
> >
> > Signed-off-by: Gowrishankar <gowrishankar.m@linux.vnet.ibm.com>
> > ---
> >   lib/librte_eal/linuxapp/eal/eal_memory.c |   26 ++++++++++++++++----------
> >   1 file changed, 16 insertions(+), 10 deletions(-)
  
Sergio Gonzalez Monroy March 22, 2016, 12:11 p.m. UTC | #3
On 22/03/2016 11:36, Thomas Monjalon wrote:
> Sergio, your help is required here.

I missed it with the /ppc tag.
I'll get to it.

Sergio

> Thanks
>
> 2016-03-17 10:35, gowrishankar:
>> Could this patch be reviewed please.
>>
>> Thanks,
>> Gowrishankar
>>
>> On Monday 07 March 2016 07:43 PM, Gowrishankar wrote:
>>> From: Gowri Shankar <gowrishankar.m@linux.vnet.ibm.com>
>>>
>>> For a secondary process address space to map hugepages from every segment of
>>> primary process, hugepage_file entries has to be mapped reversely from the
>>> list that primary process updated for every segment. This is for a reason that,
>>> in ppc64, hugepages are sorted for decrementing addresses.
>>>
>>> Signed-off-by: Gowrishankar <gowrishankar.m@linux.vnet.ibm.com>
>>> ---
>>>    lib/librte_eal/linuxapp/eal/eal_memory.c |   26 ++++++++++++++++----------
>>>    1 file changed, 16 insertions(+), 10 deletions(-)
>
  
Sergio Gonzalez Monroy March 22, 2016, 4:35 p.m. UTC | #4
First of all, forgive my ignorance regarding ppc64, and forgive me if the
questions are naive, but after having a look at the already existing code
for ppc64 and now at this patch: why are we doing this reverse mapping at all?

I guess the question revolves around the comment in eal_memory.c:
1316                 /* On PPC64 architecture, the mmap always start from higher
1317                  * virtual address to lower address. Here, both the physical
1318                  * address and virtual address are in descending order */

From looking at the code, for ppc64 we do the qsort in reverse order, and
thereafter everything looks to be done to account for that reverse sorting.

CC: Chao Zhu and David Marchand as original author and reviewer of the code.

Sergio


On 07/03/2016 14:13, Gowrishankar wrote:
> From: Gowri Shankar <gowrishankar.m@linux.vnet.ibm.com>
>
> For a secondary process address space to map hugepages from every segment of
> primary process, hugepage_file entries has to be mapped reversely from the
> list that primary process updated for every segment. This is for a reason that,
> in ppc64, hugepages are sorted for decrementing addresses.
>
> Signed-off-by: Gowrishankar <gowrishankar.m@linux.vnet.ibm.com>
> ---
>   lib/librte_eal/linuxapp/eal/eal_memory.c |   26 ++++++++++++++++----------
>   1 file changed, 16 insertions(+), 10 deletions(-)
>
> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
> index 5b9132c..6aea5d0 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
> @@ -1400,7 +1400,7 @@ rte_eal_hugepage_attach(void)
>   {
>   	const struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
>   	const struct hugepage_file *hp = NULL;
> -	unsigned num_hp = 0;
> +	unsigned num_hp = 0, mapped_hp = 0;
>   	unsigned i, s = 0; /* s used to track the segment number */
>   	off_t size;
>   	int fd, fd_zero = -1, fd_hugepage = -1;
> @@ -1486,14 +1486,12 @@ rte_eal_hugepage_attach(void)
>   		goto error;
>   	}
>   
> -	num_hp = size / sizeof(struct hugepage_file);
> -	RTE_LOG(DEBUG, EAL, "Analysing %u files\n", num_hp);
> -
>   	s = 0;
>   	while (s < RTE_MAX_MEMSEG && mcfg->memseg[s].len > 0){
>   		void *addr, *base_addr;
>   		uintptr_t offset = 0;
>   		size_t mapping_size;
> +		unsigned int index;
>   #ifdef RTE_LIBRTE_IVSHMEM
>   		/*
>   		 * if segment has ioremap address set, it's an IVSHMEM segment and
> @@ -1504,6 +1502,8 @@ rte_eal_hugepage_attach(void)
>   			continue;
>   		}
>   #endif
> +		num_hp = mcfg->memseg[s].len / mcfg->memseg[s].hugepage_sz;
> +		RTE_LOG(DEBUG, EAL, "Analysing %u files in segment %u\n", num_hp, s);
>   		/*
>   		 * free previously mapped memory so we can map the
>   		 * hugepages into the space
> @@ -1514,18 +1514,23 @@ rte_eal_hugepage_attach(void)
>   		/* find the hugepages for this segment and map them
>   		 * we don't need to worry about order, as the server sorted the
>   		 * entries before it did the second mmap of them */
> +#ifdef RTE_ARCH_PPC_64
> +		for (i = num_hp-1; i < num_hp && offset < mcfg->memseg[s].len; i--){
> +#else
>   		for (i = 0; i < num_hp && offset < mcfg->memseg[s].len; i++){
> -			if (hp[i].memseg_id == (int)s){
> -				fd = open(hp[i].filepath, O_RDWR);
> +#endif
> +			index = i + mapped_hp;
> +			if (hp[index].memseg_id == (int)s){
> +				fd = open(hp[index].filepath, O_RDWR);
>   				if (fd < 0) {
>   					RTE_LOG(ERR, EAL, "Could not open %s\n",
> -						hp[i].filepath);
> +						hp[index].filepath);
>   					goto error;
>   				}
>   #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
> -				mapping_size = hp[i].size * hp[i].repeated;
> +				mapping_size = hp[index].size * hp[index].repeated;
>   #else
> -				mapping_size = hp[i].size;
> +				mapping_size = hp[index].size;
>   #endif
>   				addr = mmap(RTE_PTR_ADD(base_addr, offset),
>   						mapping_size, PROT_READ | PROT_WRITE,
> @@ -1534,7 +1539,7 @@ rte_eal_hugepage_attach(void)
>   				if (addr == MAP_FAILED ||
>   						addr != RTE_PTR_ADD(base_addr, offset)) {
>   					RTE_LOG(ERR, EAL, "Could not mmap %s\n",
> -						hp[i].filepath);
> +						hp[index].filepath);
>   					goto error;
>   				}
>   				offset+=mapping_size;
> @@ -1543,6 +1548,7 @@ rte_eal_hugepage_attach(void)
>   		RTE_LOG(DEBUG, EAL, "Mapped segment %u of size 0x%llx\n", s,
>   				(unsigned long long)mcfg->memseg[s].len);
>   		s++;
> +		mapped_hp += num_hp;
>   	}
>   	/* unmap the hugepage config file, since we are done using it */
>   	munmap((void *)(uintptr_t)hp, size);
  
Bruce Richardson March 22, 2016, 5:10 p.m. UTC | #5
On Tue, Mar 22, 2016 at 04:35:32PM +0000, Sergio Gonzalez Monroy wrote:
> First of all, forgive my ignorance regarding ppc64 and if the questions are
> naive but after having a
> look to the already existing code for ppc64 and this patch now, why are we
> doing this reverse mapping at all?
> 
> I guess the question revolves around the comment in eal_memory.c:
> 1316                 /* On PPC64 architecture, the mmap always start from
> higher
> 1317                  * virtual address to lower address. Here, both the
> physical
> 1318                  * address and virtual address are in descending order
> */
> 
> From looking at the code, for ppc64 we do qsort in reverse order and
> thereafter everything looks to be is
> done to account for that reverse sorting.
> 
> CC: Chao Zhu and David Marchand as original author and reviewer of the code.
> 
> Sergio
>

Just to add my 2c here. At one point, with (I believe) some i686 installs - I don't
remember the specific OS/kernel - we found that the mmap calls were returning
the highest free address first and then working downwards - much like what seems
to be described here. To fix this we changed the mmap code from assuming that
addresses are mapped upwards, to instead explicitly requesting a large free
block of memory (mmap of /dev/zero) to find a free address space
range of the correct size, and then explicitly mmapping each individual page to
the appropriate place in that free range. With this scheme it didn't matter whether
the OS tried to mmap the pages from the highest or lowest address because we
always told the OS where to put the page (and we knew the slot was free from
the earlier block mmap).
Would this scheme not also work for PPC in a similar way? (Again, forgive
unfamiliarity with PPC! :-) )

/Bruce

> 
> On 07/03/2016 14:13, Gowrishankar wrote:
> >From: Gowri Shankar <gowrishankar.m@linux.vnet.ibm.com>
> >
> >For a secondary process address space to map hugepages from every segment of
> >primary process, hugepage_file entries has to be mapped reversely from the
> >list that primary process updated for every segment. This is for a reason that,
> >in ppc64, hugepages are sorted for decrementing addresses.
> >
> >Signed-off-by: Gowrishankar <gowrishankar.m@linux.vnet.ibm.com>
> >---
> >  lib/librte_eal/linuxapp/eal/eal_memory.c |   26 ++++++++++++++++----------
> >  1 file changed, 16 insertions(+), 10 deletions(-)
> >
> >diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
> >index 5b9132c..6aea5d0 100644
> >--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
> >+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
> >@@ -1400,7 +1400,7 @@ rte_eal_hugepage_attach(void)
> >  {
> >  	const struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
> >  	const struct hugepage_file *hp = NULL;
> >-	unsigned num_hp = 0;
> >+	unsigned num_hp = 0, mapped_hp = 0;
> >  	unsigned i, s = 0; /* s used to track the segment number */
> >  	off_t size;
> >  	int fd, fd_zero = -1, fd_hugepage = -1;
> >@@ -1486,14 +1486,12 @@ rte_eal_hugepage_attach(void)
> >  		goto error;
> >  	}
> >-	num_hp = size / sizeof(struct hugepage_file);
> >-	RTE_LOG(DEBUG, EAL, "Analysing %u files\n", num_hp);
> >-
> >  	s = 0;
> >  	while (s < RTE_MAX_MEMSEG && mcfg->memseg[s].len > 0){
> >  		void *addr, *base_addr;
> >  		uintptr_t offset = 0;
> >  		size_t mapping_size;
> >+		unsigned int index;
> >  #ifdef RTE_LIBRTE_IVSHMEM
> >  		/*
> >  		 * if segment has ioremap address set, it's an IVSHMEM segment and
> >@@ -1504,6 +1502,8 @@ rte_eal_hugepage_attach(void)
> >  			continue;
> >  		}
> >  #endif
> >+		num_hp = mcfg->memseg[s].len / mcfg->memseg[s].hugepage_sz;
> >+		RTE_LOG(DEBUG, EAL, "Analysing %u files in segment %u\n", num_hp, s);
> >  		/*
> >  		 * free previously mapped memory so we can map the
> >  		 * hugepages into the space
> >@@ -1514,18 +1514,23 @@ rte_eal_hugepage_attach(void)
> >  		/* find the hugepages for this segment and map them
> >  		 * we don't need to worry about order, as the server sorted the
> >  		 * entries before it did the second mmap of them */
> >+#ifdef RTE_ARCH_PPC_64
> >+		for (i = num_hp-1; i < num_hp && offset < mcfg->memseg[s].len; i--){
> >+#else
> >  		for (i = 0; i < num_hp && offset < mcfg->memseg[s].len; i++){
> >-			if (hp[i].memseg_id == (int)s){
> >-				fd = open(hp[i].filepath, O_RDWR);
> >+#endif
> >+			index = i + mapped_hp;
> >+			if (hp[index].memseg_id == (int)s){
> >+				fd = open(hp[index].filepath, O_RDWR);
> >  				if (fd < 0) {
> >  					RTE_LOG(ERR, EAL, "Could not open %s\n",
> >-						hp[i].filepath);
> >+						hp[index].filepath);
> >  					goto error;
> >  				}
> >  #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
> >-				mapping_size = hp[i].size * hp[i].repeated;
> >+				mapping_size = hp[index].size * hp[index].repeated;
> >  #else
> >-				mapping_size = hp[i].size;
> >+				mapping_size = hp[index].size;
> >  #endif
> >  				addr = mmap(RTE_PTR_ADD(base_addr, offset),
> >  						mapping_size, PROT_READ | PROT_WRITE,
> >@@ -1534,7 +1539,7 @@ rte_eal_hugepage_attach(void)
> >  				if (addr == MAP_FAILED ||
> >  						addr != RTE_PTR_ADD(base_addr, offset)) {
> >  					RTE_LOG(ERR, EAL, "Could not mmap %s\n",
> >-						hp[i].filepath);
> >+						hp[index].filepath);
> >  					goto error;
> >  				}
> >  				offset+=mapping_size;
> >@@ -1543,6 +1548,7 @@ rte_eal_hugepage_attach(void)
> >  		RTE_LOG(DEBUG, EAL, "Mapped segment %u of size 0x%llx\n", s,
> >  				(unsigned long long)mcfg->memseg[s].len);
> >  		s++;
> >+		mapped_hp += num_hp;
> >  	}
> >  	/* unmap the hugepage config file, since we are done using it */
> >  	munmap((void *)(uintptr_t)hp, size);
>
  
Chao Zhu May 20, 2016, 3:03 a.m. UTC | #6
Bruce,

Recently, we found some bugs with mmap on PowerLinux. The mmap call doesn't
respect address hints. In the function get_virtual_area() in eal_memory.c,
mmap gets a free virtual address range to use as the address hint. However, when
mapping the real memory in rte_eal_hugepage_init(), mmap doesn't return the
same address as the requested address. Looking at /proc/<pid>/maps, the
requested address range is free for use. Because of this bug,
pre-allocating some free space doesn't work.

We're trying to create a test case and report it as a bug to the kernel
community.

Here are some logs:
===============================
EAL: Ask a virtual area of 0x10000000 bytes
EAL: Virtual area found at 0x3fffa7000000 (size = 0x10000000)
EAL: map_all_hugepages, /mnt/huge/rtemap_52,paddr 0x3ca6000000  requested
addr: 0x3fffa7000000  mmaped addr: 0x3efff0000000
EAL: map_all_hugepages, /mnt/huge/rtemap_53,paddr 0x3ca5000000  requested
addr: 0x3fffa8000000  mmaped addr: 0x3effef000000
EAL: map_all_hugepages, /mnt/huge/rtemap_54,paddr 0x3ca4000000  requested
addr: 0x3fffa9000000  mmaped addr: 0x3effee000000
EAL: map_all_hugepages, /mnt/huge/rtemap_55,paddr 0x3ca3000000  requested
addr: 0x3fffaa000000  mmaped addr: 0x3effed000000
EAL: map_all_hugepages, /mnt/huge/rtemap_56,paddr 0x3ca2000000  requested
addr: 0x3fffab000000  mmaped addr: 0x3effec000000
EAL: map_all_hugepages, /mnt/huge/rtemap_57,paddr 0x3ca1000000  requested
addr: 0x3fffac000000  mmaped addr: 0x3effeb000000
EAL: map_all_hugepages, /mnt/huge/rtemap_58,paddr 0x3ca0000000  requested
addr: 0x3fffad000000  mmaped addr: 0x3effea000000
EAL: map_all_hugepages, /mnt/huge/rtemap_59,paddr 0x3c9f000000  requested
addr: 0x3fffae000000  mmaped addr: 0x3effe9000000
EAL: map_all_hugepages, /mnt/huge/rtemap_60,paddr 0x3c9e000000  requested
addr: 0x3fffaf000000  mmaped addr: 0x3effe8000000
EAL: map_all_hugepages, /mnt/huge/rtemap_61,paddr 0x3c9d000000  requested
addr: 0x3fffb0000000  mmaped addr: 0x3effe7000000
EAL: map_all_hugepages, /mnt/huge/rtemap_62, paddr 0x3c9c000000 requested
addr:  0x3fffb1000000 mmaped addr:  0x3effe6000000
EAL: map_all_hugepages, /mnt/huge/rtemap_63, paddr 0x3c9b000000 requested
addr:  0x3fffb2000000 mmaped addr:  0x3effe5000000
EAL: map_all_hugepages, /mnt/huge/rtemap_51, paddr 0x3c9a000000 requested
addr:  0x3fffb3000000 mmaped addr:  0x3effe4000000
EAL: map_all_hugepages, /mnt/huge/rtemap_50, paddr 0x3c99000000 requested
addr:  0x3fffb4000000 mmaped addr:  0x3effe3000000
EAL: map_all_hugepages, /mnt/huge/rtemap_49, paddr 0x3c98000000 requested
addr:  0x3fffb5000000 mmaped addr:  0x3effe2000000
EAL: map_all_hugepages, /mnt/huge/rtemap_48, paddr 0x3c97000000 requested
addr:  0x3fffb6000000 mmaped addr:  0x3effe1000000

# cat /proc/143765/maps
01000000-02000000 rw-s 00000000 00:27 61162550
/mnt/huge/rtemap_14
02000000-03000000 rw-s 00000000 00:27 61162536
/mnt/huge/rtemap_0
03000000-04000000 rw-s 00000000 00:27 61162537
/mnt/huge/rtemap_1
04000000-05000000 rw-s 00000000 00:27 61162538
/mnt/huge/rtemap_2
05000000-06000000 rw-s 00000000 00:27 61162539
/mnt/huge/rtemap_3
06000000-07000000 rw-s 00000000 00:27 61162540
/mnt/huge/rtemap_4
07000000-08000000 rw-s 00000000 00:27 61162541
/mnt/huge/rtemap_5
08000000-09000000 rw-s 00000000 00:27 61162542
/mnt/huge/rtemap_6
09000000-0a000000 rw-s 00000000 00:27 61162543
/mnt/huge/rtemap_7
0a000000-0b000000 rw-s 00000000 00:27 61162544
/mnt/huge/rtemap_8
0b000000-0c000000 rw-s 00000000 00:27 61162545
/mnt/huge/rtemap_9
0c000000-0d000000 rw-s 00000000 00:27 61162546
/mnt/huge/rtemap_10
0d000000-0e000000 rw-s 00000000 00:27 61162547
/mnt/huge/rtemap_11
0e000000-0f000000 rw-s 00000000 00:27 61162548
/mnt/huge/rtemap_12
0f000000-10000000 rw-s 00000000 00:27 61162549
/mnt/huge/rtemap_13
10000000-101f0000 r-xp 00000000 08:32 6040458
/home/dpdk/build/app/test
101f0000-10220000 rw-p 001f0000 08:32 6040458
/home/dpdk/build/app/test
10220000-15c20000 rw-p 00000000 00:00 0
[heap]
20000000-21000000 rw-s 00000000 00:27 61162566
/mnt/huge/rtemap_30
21000000-22000000 rw-s 00000000 00:27 61162567
/mnt/huge/rtemap_31
22000000-23000000 rw-s 00000000 00:27 61162568
/mnt/huge/rtemap_32
23000000-24000000 rw-s 00000000 00:27 61162569
/mnt/huge/rtemap_33
24000000-25000000 rw-s 00000000 00:27 61162570
/mnt/huge/rtemap_34
25000000-26000000 rw-s 00000000 00:27 61162571
/mnt/huge/rtemap_35
26000000-27000000 rw-s 00000000 00:27 61162572
/mnt/huge/rtemap_36
27000000-28000000 rw-s 00000000 00:27 61162573
/mnt/huge/rtemap_37
28000000-29000000 rw-s 00000000 00:27 61162574
/mnt/huge/rtemap_38
29000000-2a000000 rw-s 00000000 00:27 61162575
/mnt/huge/rtemap_39
2a000000-2b000000 rw-s 00000000 00:27 61162576
/mnt/huge/rtemap_40
2b000000-2c000000 rw-s 00000000 00:27 61162577
/mnt/huge/rtemap_41
2c000000-2d000000 rw-s 00000000 00:27 61162578
/mnt/huge/rtemap_42
2d000000-2e000000 rw-s 00000000 00:27 61162579
/mnt/huge/rtemap_43
2e000000-2f000000 rw-s 00000000 00:27 61162580
/mnt/huge/rtemap_44
2f000000-30000000 rw-s 00000000 00:27 61162581
/mnt/huge/rtemap_45
30000000-31000000 rw-s 00000000 00:27 61162582
/mnt/huge/rtemap_46
31000000-32000000 rw-s 00000000 00:27 61162583
/mnt/huge/rtemap_47
32000000-33000000 rw-s 00000000 00:27 61162584
/mnt/huge/rtemap_48
33000000-34000000 rw-s 00000000 00:27 61162585
/mnt/huge/rtemap_49
34000000-35000000 rw-s 00000000 00:27 61162586
/mnt/huge/rtemap_50
35000000-36000000 rw-s 00000000 00:27 61162587
/mnt/huge/rtemap_51
36000000-37000000 rw-s 00000000 00:27 61162588
/mnt/huge/rtemap_52
37000000-38000000 rw-s 00000000 00:27 61162589
/mnt/huge/rtemap_53
38000000-39000000 rw-s 00000000 00:27 61162590
/mnt/huge/rtemap_54
39000000-3a000000 rw-s 00000000 00:27 61162591
/mnt/huge/rtemap_55
3a000000-3b000000 rw-s 00000000 00:27 61162592
/mnt/huge/rtemap_56
3b000000-3c000000 rw-s 00000000 00:27 61162593
/mnt/huge/rtemap_57
3c000000-3d000000 rw-s 00000000 00:27 61162594
/mnt/huge/rtemap_58
3d000000-3e000000 rw-s 00000000 00:27 61162595
/mnt/huge/rtemap_59
3e000000-3f000000 rw-s 00000000 00:27 61162596
/mnt/huge/rtemap_60
3f000000-40000000 rw-s 00000000 00:27 61162597
/mnt/huge/rtemap_61
40000000-41000000 rw-s 00000000 00:27 61162598
/mnt/huge/rtemap_62
41000000-42000000 rw-s 00000000 00:27 61162599
/mnt/huge/rtemap_63
3effb1000000-3effb2000000 rw-s 00000000 00:27 61162541
/mnt/huge/rtemap_5
3effb2000000-3effb3000000 rw-s 00000000 00:27 61162540
/mnt/huge/rtemap_4
3effb3000000-3effb4000000 rw-s 00000000 00:27 61162551
/mnt/huge/rtemap_15
3effb4000000-3effb5000000 rw-s 00000000 00:27 61162538
/mnt/huge/rtemap_2
3effb5000000-3effb6000000 rw-s 00000000 00:27 61162549
/mnt/huge/rtemap_13
3effb6000000-3effb7000000 rw-s 00000000 00:27 61162544
/mnt/huge/rtemap_8
3effb7000000-3effb8000000 rw-s 00000000 00:27 61162543
/mnt/huge/rtemap_7
3effb8000000-3effb9000000 rw-s 00000000 00:27 61162548
/mnt/huge/rtemap_12
3effb9000000-3effba000000 rw-s 00000000 00:27 61162537
/mnt/huge/rtemap_1
3effba000000-3effbb000000 rw-s 00000000 00:27 61162550
/mnt/huge/rtemap_14
3effbb000000-3effbc000000 rw-s 00000000 00:27 61162545
/mnt/huge/rtemap_9
3effbc000000-3effbd000000 rw-s 00000000 00:27 61162546
/mnt/huge/rtemap_10
3effbd000000-3effbe000000 rw-s 00000000 00:27 61162547
/mnt/huge/rtemap_11
3effbe000000-3effbf000000 rw-s 00000000 00:27 61162539
/mnt/huge/rtemap_3
3effbf000000-3effc0000000 rw-s 00000000 00:27 61162542
/mnt/huge/rtemap_6
3effc0000000-3effc1000000 rw-s 00000000 00:27 61162536
/mnt/huge/rtemap_0
3effc1000000-3effc2000000 rw-s 00000000 00:27 61162556
/mnt/huge/rtemap_20
3effc2000000-3effc3000000 rw-s 00000000 00:27 61162552
/mnt/huge/rtemap_16
3effc3000000-3effc4000000 rw-s 00000000 00:27 61162553
/mnt/huge/rtemap_17
3effc4000000-3effc5000000 rw-s 00000000 00:27 61162554
/mnt/huge/rtemap_18
3effc5000000-3effc6000000 rw-s 00000000 00:27 61162555
/mnt/huge/rtemap_19
3effc6000000-3effc7000000 rw-s 00000000 00:27 61162567
/mnt/huge/rtemap_31
3effc7000000-3effc8000000 rw-s 00000000 00:27 61162566
/mnt/huge/rtemap_30
3effc8000000-3effc9000000 rw-s 00000000 00:27 61162558
/mnt/huge/rtemap_22
3effc9000000-3effca000000 rw-s 00000000 00:27 61162557
/mnt/huge/rtemap_21
3effca000000-3effcb000000 rw-s 00000000 00:27 61162560
/mnt/huge/rtemap_24
3effcb000000-3effcc000000 rw-s 00000000 00:27 61162561
/mnt/huge/rtemap_25
3effcc000000-3effcd000000 rw-s 00000000 00:27 61162564
/mnt/huge/rtemap_28
3effcd000000-3effce000000 rw-s 00000000 00:27 61162559
/mnt/huge/rtemap_23
3effce000000-3effcf000000 rw-s 00000000 00:27 61162563
/mnt/huge/rtemap_27
3effcf000000-3effd0000000 rw-s 00000000 00:27 61162562
/mnt/huge/rtemap_26
3effd0000000-3effd1000000 rw-s 00000000 00:27 61162565
/mnt/huge/rtemap_29
3effd1000000-3effd2000000 rw-s 00000000 00:27 61162572
/mnt/huge/rtemap_36
3effd2000000-3effd3000000 rw-s 00000000 00:27 61162568
/mnt/huge/rtemap_32
3effd3000000-3effd4000000 rw-s 00000000 00:27 61162569
/mnt/huge/rtemap_33
3effd4000000-3effd5000000 rw-s 00000000 00:27 61162570
/mnt/huge/rtemap_34
3effd5000000-3effd6000000 rw-s 00000000 00:27 61162571
/mnt/huge/rtemap_35
3effd6000000-3effd7000000 rw-s 00000000 00:27 61162583
/mnt/huge/rtemap_47
3effd7000000-3effd8000000 rw-s 00000000 00:27 61162582
/mnt/huge/rtemap_46
3effd8000000-3effd9000000 rw-s 00000000 00:27 61162581
/mnt/huge/rtemap_45
3effd9000000-3effda000000 rw-s 00000000 00:27 61162580
/mnt/huge/rtemap_44
3effda000000-3effdb000000 rw-s 00000000 00:27 61162579
/mnt/huge/rtemap_43
3effdb000000-3effdc000000 rw-s 00000000 00:27 61162578
/mnt/huge/rtemap_42
3effdc000000-3effdd000000 rw-s 00000000 00:27 61162577
/mnt/huge/rtemap_41
3effdd000000-3effde000000 rw-s 00000000 00:27 61162574
/mnt/huge/rtemap_38
3effde000000-3effdf000000 rw-s 00000000 00:27 61162573
/mnt/huge/rtemap_37
3effdf000000-3effe0000000 rw-s 00000000 00:27 61162575
/mnt/huge/rtemap_39
3effe0000000-3effe1000000 rw-s 00000000 00:27 61162576
/mnt/huge/rtemap_40
3effe1000000-3effe2000000 rw-s 00000000 00:27 61162584
/mnt/huge/rtemap_48
3effe2000000-3effe3000000 rw-s 00000000 00:27 61162585
/mnt/huge/rtemap_49
3effe3000000-3effe4000000 rw-s 00000000 00:27 61162586
/mnt/huge/rtemap_50
3effe4000000-3effe5000000 rw-s 00000000 00:27 61162587
/mnt/huge/rtemap_51
3effe5000000-3effe6000000 rw-s 00000000 00:27 61162599
/mnt/huge/rtemap_63
3effe6000000-3effe7000000 rw-s 00000000 00:27 61162598
/mnt/huge/rtemap_62
3effe7000000-3effe8000000 rw-s 00000000 00:27 61162597
/mnt/huge/rtemap_61
3effe8000000-3effe9000000 rw-s 00000000 00:27 61162596
/mnt/huge/rtemap_60
3effe9000000-3effea000000 rw-s 00000000 00:27 61162595
/mnt/huge/rtemap_59
3effea000000-3effeb000000 rw-s 00000000 00:27 61162594
/mnt/huge/rtemap_58
3effeb000000-3effec000000 rw-s 00000000 00:27 61162593
/mnt/huge/rtemap_57
3effec000000-3effed000000 rw-s 00000000 00:27 61162592
/mnt/huge/rtemap_56
3effed000000-3effee000000 rw-s 00000000 00:27 61162591
/mnt/huge/rtemap_55
3effee000000-3effef000000 rw-s 00000000 00:27 61162590
/mnt/huge/rtemap_54
3effef000000-3efff0000000 rw-s 00000000 00:27 61162589
/mnt/huge/rtemap_53
3efff0000000-3efff1000000 rw-s 00000000 00:27 61162588
/mnt/huge/rtemap_52
3efff1000000-3efff2000000 rw-s 00000000 00:27 61162565
/mnt/huge/rtemap_29
3efff2000000-3efff3000000 rw-s 00000000 00:27 61162564
/mnt/huge/rtemap_28
3efff3000000-3efff4000000 rw-s 00000000 00:27 61162563
/mnt/huge/rtemap_27
3efff4000000-3efff5000000 rw-s 00000000 00:27 61162562
/mnt/huge/rtemap_26
3efff5000000-3efff6000000 rw-s 00000000 00:27 61162561
/mnt/huge/rtemap_25
3efff6000000-3efff7000000 rw-s 00000000 00:27 61162560
/mnt/huge/rtemap_24
3efff7000000-3efff8000000 rw-s 00000000 00:27 61162559
/mnt/huge/rtemap_23
3efff8000000-3efff9000000 rw-s 00000000 00:27 61162558
/mnt/huge/rtemap_22
3efff9000000-3efffa000000 rw-s 00000000 00:27 61162557
/mnt/huge/rtemap_21
3efffa000000-3efffb000000 rw-s 00000000 00:27 61162556
/mnt/huge/rtemap_20
3efffb000000-3efffc000000 rw-s 00000000 00:27 61162555
/mnt/huge/rtemap_19
3efffc000000-3efffd000000 rw-s 00000000 00:27 61162554
/mnt/huge/rtemap_18
3efffd000000-3efffe000000 rw-s 00000000 00:27 61162553
/mnt/huge/rtemap_17
3efffe000000-3effff000000 rw-s 00000000 00:27 61162552
/mnt/huge/rtemap_16
3effff000000-3f0000000000 rw-s 00000000 00:27 61162551
/mnt/huge/rtemap_15
3fffb7bc0000-3fffb7c10000 rw-p 00000000 00:00 0 
3fffb7c10000-3fffb7c50000 rw-s 00000000 00:12 3926240
/run/.rte_config
3fffb7c50000-3fffb7c70000 rw-p 00000000 00:00 0 
3fffb7c70000-3fffb7e20000 r-xp 00000000 08:32 7090531
/opt/at7.1/lib64/power8/libc-2.19.so
3fffb7e20000-3fffb7e30000 rw-p 001a0000 08:32 7090531
/opt/at7.1/lib64/power8/libc-2.19.so
3fffb7e30000-3fffb7e50000 rw-p 00000000 00:00 0 
3fffb7e50000-3fffb7e70000 r-xp 00000000 08:32 7090563
/opt/at7.1/lib64/power8/libpthread-2.19.so
3fffb7e70000-3fffb7e80000 rw-p 00010000 08:32 7090563
/opt/at7.1/lib64/power8/libpthread-2.19.so
3fffb7e80000-3fffb7e90000 r-xp 00000000 08:32 7090210
/opt/at7.1/lib64/libdl-2.19.so
3fffb7e90000-3fffb7ea0000 rw-p 00000000 08:32 7090210
/opt/at7.1/lib64/libdl-2.19.so
3fffb7ea0000-3fffb7ec0000 r-xp 00000000 08:32 7090533
/opt/at7.1/lib64/power8/libz.so.1.2.6
3fffb7ec0000-3fffb7ed0000 rw-p 00010000 08:32 7090533
/opt/at7.1/lib64/power8/libz.so.1.2.6
3fffb7ed0000-3fffb7f90000 r-xp 00000000 08:32 7090568
/opt/at7.1/lib64/power8/libm-2.19.so
3fffb7f90000-3fffb7fa0000 rw-p 000b0000 08:32 7090568
/opt/at7.1/lib64/power8/libm-2.19.so
3fffb7fa0000-3fffb7fc0000 r-xp 00000000 00:00 0
[vdso]
3fffb7fc0000-3fffb7ff0000 r-xp 00000000 08:32 7090048
/opt/at7.1/lib64/ld-2.19.so
3fffb7ff0000-3fffb8000000 rw-p 00020000 08:32 7090048
/opt/at7.1/lib64/ld-2.19.so
3ffffffd0000-400000000000 rw-p 00000000 00:00 0
[stack]


-----Original Message-----
From: Bruce Richardson [mailto:bruce.richardson@intel.com] 
Sent: 2016年3月23日 1:11
To: Sergio Gonzalez Monroy <sergio.gonzalez.monroy@intel.com>
Cc: Gowrishankar <gowrishankar.m@linux.vnet.ibm.com>; dev@dpdk.org;
chaozhu@linux.vnet.ibm.com; David Marchand <david.marchand@6wind.com>
Subject: Re: [dpdk-dev] [PATCH] eal/ppc: fix secondary process to map
hugepages in correct order

On Tue, Mar 22, 2016 at 04:35:32PM +0000, Sergio Gonzalez Monroy wrote:
> First of all, forgive my ignorance regarding ppc64 and if the 
> questions are naive but after having a look to the already existing 
> code for ppc64 and this patch now, why are we doing this reverse 
> mapping at all?
> 
> I guess the question revolves around the comment in eal_memory.c:
> 1316                 /* On PPC64 architecture, the mmap always start from
> higher
> 1317                  * virtual address to lower address. Here, both the
> physical
> 1318                  * address and virtual address are in descending
order
> */
> 
> From looking at the code, for ppc64 we do qsort in reverse order and 
> thereafter everything looks to be is done to account for that reverse 
> sorting.
> 
> CC: Chao Zhu and David Marchand as original author and reviewer of the
code.
> 
> Sergio
>

Just to add my 2c here. At one point, with I believe some i686 installs -
don't remember the specific OS/kernel, we found that the mmap calls were
returning the highest free address first and then working downwards - must
like seems to be described here. To fix this we changed the mmap code from
assuming that addresses are mapped upwards, to instead explicitly requesting
a large free block of memory (mmap of /dev/zero) to find a free address
space range of the correct size, and then explicitly mmapping each
individual page to the appropriate place in that free range. With this
scheme it didn't matter whether the OS tried to mmap the pages from the
highest or lowest address because we always told the OS where to put the
page (and we knew the slot was free from the earlier block mmap).
Would this scheme not also work for PPC in a similar way? (Again, forgive
unfamiliarity with PPC! :-) )

/Bruce

> 
> On 07/03/2016 14:13, Gowrishankar wrote:
> >From: Gowri Shankar <gowrishankar.m@linux.vnet.ibm.com>
> >
> >For a secondary process address space to map hugepages from every 
> >segment of primary process, hugepage_file entries has to be mapped 
> >reversely from the list that primary process updated for every 
> >segment. This is for a reason that, in ppc64, hugepages are sorted for
> >decrementing addresses.
> >
> >Signed-off-by: Gowrishankar <gowrishankar.m@linux.vnet.ibm.com>
> >---
> >  lib/librte_eal/linuxapp/eal/eal_memory.c |   26
++++++++++++++++----------
> >  1 file changed, 16 insertions(+), 10 deletions(-)
> >
> >diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c 
> >b/lib/librte_eal/linuxapp/eal/eal_memory.c
> >index 5b9132c..6aea5d0 100644
> >--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
> >+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
> >@@ -1400,7 +1400,7 @@ rte_eal_hugepage_attach(void)
> >  {
> >  	const struct rte_mem_config *mcfg =
rte_eal_get_configuration()->mem_config;
> >  	const struct hugepage_file *hp = NULL;
> >-	unsigned num_hp = 0;
> >+	unsigned num_hp = 0, mapped_hp = 0;
> >  	unsigned i, s = 0; /* s used to track the segment number */
> >  	off_t size;
> >  	int fd, fd_zero = -1, fd_hugepage = -1; @@ -1486,14 +1486,12 @@ 
> >rte_eal_hugepage_attach(void)
> >  		goto error;
> >  	}
> >-	num_hp = size / sizeof(struct hugepage_file);
> >-	RTE_LOG(DEBUG, EAL, "Analysing %u files\n", num_hp);
> >-
> >  	s = 0;
> >  	while (s < RTE_MAX_MEMSEG && mcfg->memseg[s].len > 0){
> >  		void *addr, *base_addr;
> >  		uintptr_t offset = 0;
> >  		size_t mapping_size;
> >+		unsigned int index;
> >  #ifdef RTE_LIBRTE_IVSHMEM
> >  		/*
> >  		 * if segment has ioremap address set, it's an IVSHMEM
segment 
> >and @@ -1504,6 +1502,8 @@ rte_eal_hugepage_attach(void)
> >  			continue;
> >  		}
> >  #endif
> >+		num_hp = mcfg->memseg[s].len / mcfg->memseg[s].hugepage_sz;
> >+		RTE_LOG(DEBUG, EAL, "Analysing %u files in segment %u\n",
num_hp, 
> >+s);
> >  		/*
> >  		 * free previously mapped memory so we can map the
> >  		 * hugepages into the space
> >@@ -1514,18 +1514,23 @@ rte_eal_hugepage_attach(void)
> >  		/* find the hugepages for this segment and map them
> >  		 * we don't need to worry about order, as the server sorted
the
> >  		 * entries before it did the second mmap of them */
> >+#ifdef RTE_ARCH_PPC_64
> >+		for (i = num_hp-1; i < num_hp && offset <
mcfg->memseg[s].len; 
> >+i--){ #else
> >  		for (i = 0; i < num_hp && offset < mcfg->memseg[s].len;
i++){
> >-			if (hp[i].memseg_id == (int)s){
> >-				fd = open(hp[i].filepath, O_RDWR);
> >+#endif
> >+			index = i + mapped_hp;
> >+			if (hp[index].memseg_id == (int)s){
> >+				fd = open(hp[index].filepath, O_RDWR);
> >  				if (fd < 0) {
> >  					RTE_LOG(ERR, EAL, "Could not open
%s\n",
> >-						hp[i].filepath);
> >+						hp[index].filepath);
> >  					goto error;
> >  				}
> >  #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
> >-				mapping_size = hp[i].size * hp[i].repeated;
> >+				mapping_size = hp[index].size *
hp[index].repeated;
> >  #else
> >-				mapping_size = hp[i].size;
> >+				mapping_size = hp[index].size;
> >  #endif
> >  				addr = mmap(RTE_PTR_ADD(base_addr, offset),
> >  						mapping_size, PROT_READ |
PROT_WRITE, @@ -1534,7 +1539,7 @@ 
> >rte_eal_hugepage_attach(void)
> >  				if (addr == MAP_FAILED ||
> >  						addr !=
RTE_PTR_ADD(base_addr, offset)) {
> >  					RTE_LOG(ERR, EAL, "Could not mmap
%s\n",
> >-						hp[i].filepath);
> >+						hp[index].filepath);
> >  					goto error;
> >  				}
> >  				offset+=mapping_size;
> >@@ -1543,6 +1548,7 @@ rte_eal_hugepage_attach(void)
> >  		RTE_LOG(DEBUG, EAL, "Mapped segment %u of size 0x%llx\n", s,
> >  				(unsigned long long)mcfg->memseg[s].len);
> >  		s++;
> >+		mapped_hp += num_hp;
> >  	}
> >  	/* unmap the hugepage config file, since we are done using it */
> >  	munmap((void *)(uintptr_t)hp, size);
>
  
Sergio Gonzalez Monroy May 20, 2016, 8:01 a.m. UTC | #7
On 20/05/2016 04:03, Chao Zhu wrote:
> Bruce,
>
> Recently, we found some bugs with mmap in PowerLinux. The mmap doesn't
> respect the address hints. In function get_virtual_area() in eal_memory.c,
> mmap gets the free virtual address range as the address hint. However, when
> mapping the real memory in rte_eal_hugepage_init(), mmap doesn't return the
> same address as the requested address. When taking a look at the
> /proc/<pid>/maps, the requested address range is free for use. With this
> bug, pre-allocating some free space doesn't work.

Hi Chao,

If I understand you correctly, the issue you are describing would cause DPDK to
fail initialization even with the reverse reordering that you are doing for PPC.

Basically (just showing relevant initialization steps):
1. map_all_hugepages(..., orig=1)
     - map all hugepages
2. find physical address for each hugepage
3. sort by physical address
4. map_all_hugepages(..., orig=0)
     - Now we try to get a big chunk of virtual address for a block of contig
        hugepages so we know we have that virtual address chunk available.
     - Then we try to remap each page of that block of contig pages into that
        virtual address chunk.

So the issue you are describing would make step 4 fail regardless of the
different ordering that PPC does.
I'm probably missing something, would you care to elaborate?

Sergio


> We're trying to create some test case and report it as a bug to kernel
> community.
>
> Here's some logs:
> ===============================
> EAL: Ask a virtual area of 0x10000000 bytes
> EAL: Virtual area found at 0x3fffa7000000 (size = 0x10000000)
> EAL: map_all_hugepages, /mnt/huge/rtemap_52,paddr 0x3ca6000000  requested
> addr: 0x3fffa7000000  mmaped addr: 0x3efff0000000
> EAL: map_all_hugepages, /mnt/huge/rtemap_53,paddr 0x3ca5000000  requested
> addr: 0x3fffa8000000  mmaped addr: 0x3effef000000
> EAL: map_all_hugepages, /mnt/huge/rtemap_54,paddr 0x3ca4000000  requested
> addr: 0x3fffa9000000  mmaped addr: 0x3effee000000
> EAL: map_all_hugepages, /mnt/huge/rtemap_55,paddr 0x3ca3000000  requested
> addr: 0x3fffaa000000  mmaped addr: 0x3effed000000
> EAL: map_all_hugepages, /mnt/huge/rtemap_56,paddr 0x3ca2000000  requested
> addr: 0x3fffab000000  mmaped addr: 0x3effec000000
> EAL: map_all_hugepages, /mnt/huge/rtemap_57,paddr 0x3ca1000000  requested
> addr: 0x3fffac000000  mmaped addr: 0x3effeb000000
> EAL: map_all_hugepages, /mnt/huge/rtemap_58,paddr 0x3ca0000000  requested
> addr: 0x3fffad000000  mmaped addr: 0x3effea000000
> EAL: map_all_hugepages, /mnt/huge/rtemap_59,paddr 0x3c9f000000  requested
> addr: 0x3fffae000000  mmaped addr: 0x3effe9000000
> EAL: map_all_hugepages, /mnt/huge/rtemap_60,paddr 0x3c9e000000  requested
> addr: 0x3fffaf000000  mmaped addr: 0x3effe8000000
> EAL: map_all_hugepages, /mnt/huge/rtemap_61,paddr 0x3c9d000000  requested
> addr: 0x3fffb0000000  mmaped addr: 0x3effe7000000
> EAL: map_all_hugepages, /mnt/huge/rtemap_62, paddr 0x3c9c000000 requested
> addr:  0x3fffb1000000 mmaped addr:  0x3effe6000000
> EAL: map_all_hugepages, /mnt/huge/rtemap_63, paddr 0x3c9b000000 requested
> addr:  0x3fffb2000000 mmaped addr:  0x3effe5000000
> EAL: map_all_hugepages, /mnt/huge/rtemap_51, paddr 0x3c9a000000 requested
> addr:  0x3fffb3000000 mmaped addr:  0x3effe4000000
> EAL: map_all_hugepages, /mnt/huge/rtemap_50, paddr 0x3c99000000 requested
> addr:  0x3fffb4000000 mmaped addr:  0x3effe3000000
> EAL: map_all_hugepages, /mnt/huge/rtemap_49, paddr 0x3c98000000 requested
> addr:  0x3fffb5000000 mmaped addr:  0x3effe2000000
> EAL: map_all_hugepages, /mnt/huge/rtemap_48, paddr 0x3c97000000 requested
> addr:  0x3fffb6000000 mmaped addr:  0x3effe1000000
>
> # cat /proc/143765/maps
> 01000000-02000000 rw-s 00000000 00:27 61162550
> /mnt/huge/rtemap_14
> 02000000-03000000 rw-s 00000000 00:27 61162536
> /mnt/huge/rtemap_0
> 03000000-04000000 rw-s 00000000 00:27 61162537
> /mnt/huge/rtemap_1
> 04000000-05000000 rw-s 00000000 00:27 61162538
> /mnt/huge/rtemap_2
> 05000000-06000000 rw-s 00000000 00:27 61162539
> /mnt/huge/rtemap_3
> 06000000-07000000 rw-s 00000000 00:27 61162540
> /mnt/huge/rtemap_4
> 07000000-08000000 rw-s 00000000 00:27 61162541
> /mnt/huge/rtemap_5
> 08000000-09000000 rw-s 00000000 00:27 61162542
> /mnt/huge/rtemap_6
> 09000000-0a000000 rw-s 00000000 00:27 61162543
> /mnt/huge/rtemap_7
> 0a000000-0b000000 rw-s 00000000 00:27 61162544
> /mnt/huge/rtemap_8
> 0b000000-0c000000 rw-s 00000000 00:27 61162545
> /mnt/huge/rtemap_9
> 0c000000-0d000000 rw-s 00000000 00:27 61162546
> /mnt/huge/rtemap_10
> 0d000000-0e000000 rw-s 00000000 00:27 61162547
> /mnt/huge/rtemap_11
> 0e000000-0f000000 rw-s 00000000 00:27 61162548
> /mnt/huge/rtemap_12
> 0f000000-10000000 rw-s 00000000 00:27 61162549
> /mnt/huge/rtemap_13
> 10000000-101f0000 r-xp 00000000 08:32 6040458
> /home/dpdk/build/app/test
> 101f0000-10220000 rw-p 001f0000 08:32 6040458
> /home/dpdk/build/app/test
> 10220000-15c20000 rw-p 00000000 00:00 0
> [heap]
> 20000000-21000000 rw-s 00000000 00:27 61162566
> /mnt/huge/rtemap_30
> 21000000-22000000 rw-s 00000000 00:27 61162567
> /mnt/huge/rtemap_31
> 22000000-23000000 rw-s 00000000 00:27 61162568
> /mnt/huge/rtemap_32
> 23000000-24000000 rw-s 00000000 00:27 61162569
> /mnt/huge/rtemap_33
> 24000000-25000000 rw-s 00000000 00:27 61162570
> /mnt/huge/rtemap_34
> 25000000-26000000 rw-s 00000000 00:27 61162571
> /mnt/huge/rtemap_35
> 26000000-27000000 rw-s 00000000 00:27 61162572
> /mnt/huge/rtemap_36
> 27000000-28000000 rw-s 00000000 00:27 61162573
> /mnt/huge/rtemap_37
> 28000000-29000000 rw-s 00000000 00:27 61162574
> /mnt/huge/rtemap_38
> 29000000-2a000000 rw-s 00000000 00:27 61162575
> /mnt/huge/rtemap_39
> 2a000000-2b000000 rw-s 00000000 00:27 61162576
> /mnt/huge/rtemap_40
> 2b000000-2c000000 rw-s 00000000 00:27 61162577
> /mnt/huge/rtemap_41
> 2c000000-2d000000 rw-s 00000000 00:27 61162578
> /mnt/huge/rtemap_42
> 2d000000-2e000000 rw-s 00000000 00:27 61162579
> /mnt/huge/rtemap_43
> 2e000000-2f000000 rw-s 00000000 00:27 61162580
> /mnt/huge/rtemap_44
> 2f000000-30000000 rw-s 00000000 00:27 61162581
> /mnt/huge/rtemap_45
> 30000000-31000000 rw-s 00000000 00:27 61162582
> /mnt/huge/rtemap_46
> 31000000-32000000 rw-s 00000000 00:27 61162583
> /mnt/huge/rtemap_47
> 32000000-33000000 rw-s 00000000 00:27 61162584
> /mnt/huge/rtemap_48
> 33000000-34000000 rw-s 00000000 00:27 61162585
> /mnt/huge/rtemap_49
> 34000000-35000000 rw-s 00000000 00:27 61162586
> /mnt/huge/rtemap_50
> 35000000-36000000 rw-s 00000000 00:27 61162587
> /mnt/huge/rtemap_51
> 36000000-37000000 rw-s 00000000 00:27 61162588
> /mnt/huge/rtemap_52
> 37000000-38000000 rw-s 00000000 00:27 61162589
> /mnt/huge/rtemap_53
> 38000000-39000000 rw-s 00000000 00:27 61162590
> /mnt/huge/rtemap_54
> 39000000-3a000000 rw-s 00000000 00:27 61162591
> /mnt/huge/rtemap_55
> 3a000000-3b000000 rw-s 00000000 00:27 61162592
> /mnt/huge/rtemap_56
> 3b000000-3c000000 rw-s 00000000 00:27 61162593
> /mnt/huge/rtemap_57
> 3c000000-3d000000 rw-s 00000000 00:27 61162594
> /mnt/huge/rtemap_58
> 3d000000-3e000000 rw-s 00000000 00:27 61162595
> /mnt/huge/rtemap_59
> 3e000000-3f000000 rw-s 00000000 00:27 61162596
> /mnt/huge/rtemap_60
> 3f000000-40000000 rw-s 00000000 00:27 61162597
> /mnt/huge/rtemap_61
> 40000000-41000000 rw-s 00000000 00:27 61162598
> /mnt/huge/rtemap_62
> 41000000-42000000 rw-s 00000000 00:27 61162599
> /mnt/huge/rtemap_63
> 3effb1000000-3effb2000000 rw-s 00000000 00:27 61162541
> /mnt/huge/rtemap_5
> 3effb2000000-3effb3000000 rw-s 00000000 00:27 61162540
> /mnt/huge/rtemap_4
> 3effb3000000-3effb4000000 rw-s 00000000 00:27 61162551
> /mnt/huge/rtemap_15
> 3effb4000000-3effb5000000 rw-s 00000000 00:27 61162538
> /mnt/huge/rtemap_2
> 3effb5000000-3effb6000000 rw-s 00000000 00:27 61162549
> /mnt/huge/rtemap_13
> 3effb6000000-3effb7000000 rw-s 00000000 00:27 61162544
> /mnt/huge/rtemap_8
> 3effb7000000-3effb8000000 rw-s 00000000 00:27 61162543
> /mnt/huge/rtemap_7
> 3effb8000000-3effb9000000 rw-s 00000000 00:27 61162548
> /mnt/huge/rtemap_12
> 3effb9000000-3effba000000 rw-s 00000000 00:27 61162537
> /mnt/huge/rtemap_1
> 3effba000000-3effbb000000 rw-s 00000000 00:27 61162550
> /mnt/huge/rtemap_14
> 3effbb000000-3effbc000000 rw-s 00000000 00:27 61162545
> /mnt/huge/rtemap_9
> 3effbc000000-3effbd000000 rw-s 00000000 00:27 61162546
> /mnt/huge/rtemap_10
> 3effbd000000-3effbe000000 rw-s 00000000 00:27 61162547
> /mnt/huge/rtemap_11
> 3effbe000000-3effbf000000 rw-s 00000000 00:27 61162539
> /mnt/huge/rtemap_3
> 3effbf000000-3effc0000000 rw-s 00000000 00:27 61162542
> /mnt/huge/rtemap_6
> 3effc0000000-3effc1000000 rw-s 00000000 00:27 61162536
> /mnt/huge/rtemap_0
> 3effc1000000-3effc2000000 rw-s 00000000 00:27 61162556
> /mnt/huge/rtemap_20
> 3effc2000000-3effc3000000 rw-s 00000000 00:27 61162552
> /mnt/huge/rtemap_16
> 3effc3000000-3effc4000000 rw-s 00000000 00:27 61162553
> /mnt/huge/rtemap_17
> 3effc4000000-3effc5000000 rw-s 00000000 00:27 61162554
> /mnt/huge/rtemap_18
> 3effc5000000-3effc6000000 rw-s 00000000 00:27 61162555
> /mnt/huge/rtemap_19
> 3effc6000000-3effc7000000 rw-s 00000000 00:27 61162567
> /mnt/huge/rtemap_31
> 3effc7000000-3effc8000000 rw-s 00000000 00:27 61162566
> /mnt/huge/rtemap_30
> 3effc8000000-3effc9000000 rw-s 00000000 00:27 61162558
> /mnt/huge/rtemap_22
> 3effc9000000-3effca000000 rw-s 00000000 00:27 61162557
> /mnt/huge/rtemap_21
> 3effca000000-3effcb000000 rw-s 00000000 00:27 61162560
> /mnt/huge/rtemap_24
> 3effcb000000-3effcc000000 rw-s 00000000 00:27 61162561
> /mnt/huge/rtemap_25
> 3effcc000000-3effcd000000 rw-s 00000000 00:27 61162564
> /mnt/huge/rtemap_28
> 3effcd000000-3effce000000 rw-s 00000000 00:27 61162559
> /mnt/huge/rtemap_23
> 3effce000000-3effcf000000 rw-s 00000000 00:27 61162563
> /mnt/huge/rtemap_27
> 3effcf000000-3effd0000000 rw-s 00000000 00:27 61162562
> /mnt/huge/rtemap_26
> 3effd0000000-3effd1000000 rw-s 00000000 00:27 61162565
> /mnt/huge/rtemap_29
> 3effd1000000-3effd2000000 rw-s 00000000 00:27 61162572
> /mnt/huge/rtemap_36
> 3effd2000000-3effd3000000 rw-s 00000000 00:27 61162568
> /mnt/huge/rtemap_32
> 3effd3000000-3effd4000000 rw-s 00000000 00:27 61162569
> /mnt/huge/rtemap_33
> 3effd4000000-3effd5000000 rw-s 00000000 00:27 61162570
> /mnt/huge/rtemap_34
> 3effd5000000-3effd6000000 rw-s 00000000 00:27 61162571
> /mnt/huge/rtemap_35
> 3effd6000000-3effd7000000 rw-s 00000000 00:27 61162583
> /mnt/huge/rtemap_47
> 3effd7000000-3effd8000000 rw-s 00000000 00:27 61162582
> /mnt/huge/rtemap_46
> 3effd8000000-3effd9000000 rw-s 00000000 00:27 61162581
> /mnt/huge/rtemap_45
> 3effd9000000-3effda000000 rw-s 00000000 00:27 61162580
> /mnt/huge/rtemap_44
> 3effda000000-3effdb000000 rw-s 00000000 00:27 61162579
> /mnt/huge/rtemap_43
> 3effdb000000-3effdc000000 rw-s 00000000 00:27 61162578
> /mnt/huge/rtemap_42
> 3effdc000000-3effdd000000 rw-s 00000000 00:27 61162577
> /mnt/huge/rtemap_41
> 3effdd000000-3effde000000 rw-s 00000000 00:27 61162574
> /mnt/huge/rtemap_38
> 3effde000000-3effdf000000 rw-s 00000000 00:27 61162573
> /mnt/huge/rtemap_37
> 3effdf000000-3effe0000000 rw-s 00000000 00:27 61162575
> /mnt/huge/rtemap_39
> 3effe0000000-3effe1000000 rw-s 00000000 00:27 61162576
> /mnt/huge/rtemap_40
> 3effe1000000-3effe2000000 rw-s 00000000 00:27 61162584
> /mnt/huge/rtemap_48
> 3effe2000000-3effe3000000 rw-s 00000000 00:27 61162585
> /mnt/huge/rtemap_49
> 3effe3000000-3effe4000000 rw-s 00000000 00:27 61162586
> /mnt/huge/rtemap_50
> 3effe4000000-3effe5000000 rw-s 00000000 00:27 61162587
> /mnt/huge/rtemap_51
> 3effe5000000-3effe6000000 rw-s 00000000 00:27 61162599
> /mnt/huge/rtemap_63
> 3effe6000000-3effe7000000 rw-s 00000000 00:27 61162598
> /mnt/huge/rtemap_62
> 3effe7000000-3effe8000000 rw-s 00000000 00:27 61162597
> /mnt/huge/rtemap_61
> 3effe8000000-3effe9000000 rw-s 00000000 00:27 61162596
> /mnt/huge/rtemap_60
> 3effe9000000-3effea000000 rw-s 00000000 00:27 61162595
> /mnt/huge/rtemap_59
> 3effea000000-3effeb000000 rw-s 00000000 00:27 61162594
> /mnt/huge/rtemap_58
> 3effeb000000-3effec000000 rw-s 00000000 00:27 61162593
> /mnt/huge/rtemap_57
> 3effec000000-3effed000000 rw-s 00000000 00:27 61162592
> /mnt/huge/rtemap_56
> 3effed000000-3effee000000 rw-s 00000000 00:27 61162591
> /mnt/huge/rtemap_55
> 3effee000000-3effef000000 rw-s 00000000 00:27 61162590
> /mnt/huge/rtemap_54
> 3effef000000-3efff0000000 rw-s 00000000 00:27 61162589
> /mnt/huge/rtemap_53
> 3efff0000000-3efff1000000 rw-s 00000000 00:27 61162588
> /mnt/huge/rtemap_52
> 3efff1000000-3efff2000000 rw-s 00000000 00:27 61162565
> /mnt/huge/rtemap_29
> 3efff2000000-3efff3000000 rw-s 00000000 00:27 61162564
> /mnt/huge/rtemap_28
> 3efff3000000-3efff4000000 rw-s 00000000 00:27 61162563
> /mnt/huge/rtemap_27
> 3efff4000000-3efff5000000 rw-s 00000000 00:27 61162562
> /mnt/huge/rtemap_26
> 3efff5000000-3efff6000000 rw-s 00000000 00:27 61162561
> /mnt/huge/rtemap_25
> 3efff6000000-3efff7000000 rw-s 00000000 00:27 61162560
> /mnt/huge/rtemap_24
> 3efff7000000-3efff8000000 rw-s 00000000 00:27 61162559
> /mnt/huge/rtemap_23
> 3efff8000000-3efff9000000 rw-s 00000000 00:27 61162558
> /mnt/huge/rtemap_22
> 3efff9000000-3efffa000000 rw-s 00000000 00:27 61162557
> /mnt/huge/rtemap_21
> 3efffa000000-3efffb000000 rw-s 00000000 00:27 61162556
> /mnt/huge/rtemap_20
> 3efffb000000-3efffc000000 rw-s 00000000 00:27 61162555
> /mnt/huge/rtemap_19
> 3efffc000000-3efffd000000 rw-s 00000000 00:27 61162554
> /mnt/huge/rtemap_18
> 3efffd000000-3efffe000000 rw-s 00000000 00:27 61162553
> /mnt/huge/rtemap_17
> 3efffe000000-3effff000000 rw-s 00000000 00:27 61162552
> /mnt/huge/rtemap_16
> 3effff000000-3f0000000000 rw-s 00000000 00:27 61162551
> /mnt/huge/rtemap_15
> 3fffb7bc0000-3fffb7c10000 rw-p 00000000 00:00 0
> 3fffb7c10000-3fffb7c50000 rw-s 00000000 00:12 3926240
> /run/.rte_config
> 3fffb7c50000-3fffb7c70000 rw-p 00000000 00:00 0
> 3fffb7c70000-3fffb7e20000 r-xp 00000000 08:32 7090531
> /opt/at7.1/lib64/power8/libc-2.19.so
> 3fffb7e20000-3fffb7e30000 rw-p 001a0000 08:32 7090531
> /opt/at7.1/lib64/power8/libc-2.19.so
> 3fffb7e30000-3fffb7e50000 rw-p 00000000 00:00 0
> 3fffb7e50000-3fffb7e70000 r-xp 00000000 08:32 7090563
> /opt/at7.1/lib64/power8/libpthread-2.19.so
> 3fffb7e70000-3fffb7e80000 rw-p 00010000 08:32 7090563
> /opt/at7.1/lib64/power8/libpthread-2.19.so
> 3fffb7e80000-3fffb7e90000 r-xp 00000000 08:32 7090210
> /opt/at7.1/lib64/libdl-2.19.so
> 3fffb7e90000-3fffb7ea0000 rw-p 00000000 08:32 7090210
> /opt/at7.1/lib64/libdl-2.19.so
> 3fffb7ea0000-3fffb7ec0000 r-xp 00000000 08:32 7090533
> /opt/at7.1/lib64/power8/libz.so.1.2.6
> 3fffb7ec0000-3fffb7ed0000 rw-p 00010000 08:32 7090533
> /opt/at7.1/lib64/power8/libz.so.1.2.6
> 3fffb7ed0000-3fffb7f90000 r-xp 00000000 08:32 7090568
> /opt/at7.1/lib64/power8/libm-2.19.so
> 3fffb7f90000-3fffb7fa0000 rw-p 000b0000 08:32 7090568
> /opt/at7.1/lib64/power8/libm-2.19.so
> 3fffb7fa0000-3fffb7fc0000 r-xp 00000000 00:00 0
> [vdso]
> 3fffb7fc0000-3fffb7ff0000 r-xp 00000000 08:32 7090048
> /opt/at7.1/lib64/ld-2.19.so
> 3fffb7ff0000-3fffb8000000 rw-p 00020000 08:32 7090048
> /opt/at7.1/lib64/ld-2.19.so
> 3ffffffd0000-400000000000 rw-p 00000000 00:00 0
> [stack]
>
>
> -----Original Message-----
> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> Sent: 2016年3月23日 1:11
> To: Sergio Gonzalez Monroy <sergio.gonzalez.monroy@intel.com>
> Cc: Gowrishankar <gowrishankar.m@linux.vnet.ibm.com>; dev@dpdk.org;
> chaozhu@linux.vnet.ibm.com; David Marchand <david.marchand@6wind.com>
> Subject: Re: [dpdk-dev] [PATCH] eal/ppc: fix secondary process to map
> hugepages in correct order
>
> On Tue, Mar 22, 2016 at 04:35:32PM +0000, Sergio Gonzalez Monroy wrote:
>> First of all, forgive my ignorance regarding ppc64 and if the
>> questions are naive but after having a look to the already existing
>> code for ppc64 and this patch now, why are we doing this reverse
>> mapping at all?
>>
>> I guess the question revolves around the comment in eal_memory.c:
>> 1316                 /* On PPC64 architecture, the mmap always start from higher
>> 1317                  * virtual address to lower address. Here, both the physical
>> 1318                  * address and virtual address are in descending order */
>>
>>  From looking at the code, for ppc64 we do qsort in reverse order and
>> thereafter everything looks to be done to account for that reverse
>> sorting.
>>
>> CC: Chao Zhu and David Marchand as original author and reviewer of the code.
>> Sergio
>>
> Just to add my 2c here. At one point, with I believe some i686 installs -
> don't remember the specific OS/kernel, we found that the mmap calls were
> returning the highest free address first and then working downwards - much
> like seems to be described here. To fix this we changed the mmap code from
> assuming that addresses are mapped upwards, to instead explicitly requesting
> a large free block of memory (mmap of /dev/zero) to find a free address
> space range of the correct size, and then explicitly mmapping each
> individual page to the appropriate place in that free range. With this
> scheme it didn't matter whether the OS tried to mmap the pages from the
> highest or lowest address because we always told the OS where to put the
> page (and we knew the slot was free from the earlier block mmap).
> Would this scheme not also work for PPC in a similar way? (Again, forgive
> unfamiliarity with PPC! :-) )
>
> /Bruce
>
>> On 07/03/2016 14:13, Gowrishankar wrote:
>>> From: Gowri Shankar <gowrishankar.m@linux.vnet.ibm.com>
>>>
>>> For a secondary process address space to map hugepages from every
>>> segment of primary process, hugepage_file entries has to be mapped
>>> reversely from the list that primary process updated for every
>>> segment. This is for a reason that, in ppc64, hugepages are sorted for
>>> decrementing addresses.
>>> Signed-off-by: Gowrishankar <gowrishankar.m@linux.vnet.ibm.com>
>>> ---
  
Chao Zhu May 20, 2016, 8:41 a.m. UTC | #8
Sergio,

Step 4 will not fail because each huge page will eventually get a virtual address, though it's a different address. If you take a look at the function rte_eal_hugepage_init(), in the last loop, it uses both physical address and virtual address to determine a new memory segment. This step can make sure that the initialization is correct. What I want to say is, this bug also influences the secondary process in function rte_eal_hugepage_attach(). It can make the secondary process fail to init. I'm trying to figure out how to make it work.

-----Original Message-----
From: Sergio Gonzalez Monroy [mailto:sergio.gonzalez.monroy@intel.com] 
Sent: 2016年5月20日 16:01
To: Chao Zhu <chaozhu@linux.vnet.ibm.com>; 'Bruce Richardson' <bruce.richardson@intel.com>
Cc: 'Gowrishankar' <gowrishankar.m@linux.vnet.ibm.com>; dev@dpdk.org; 'David Marchand' <david.marchand@6wind.com>
Subject: Re: [dpdk-dev] [PATCH] eal/ppc: fix secondary process to map hugepages in correct order

On 20/05/2016 04:03, Chao Zhu wrote:
> Bruce,
>
> Recently, we found some bugs with mmap in PowerLinux. The mmap doesn't
> respect the address hints. In function get_virtual_area() in
> eal_memory.c, mmap gets the free virtual address range as the address
> hint. However, when mapping the real memory in
> rte_eal_hugepage_init(), mmap doesn't return the same address as the
> requested address. When taking a look at the /proc/<pid>/maps, the
> requested address range is free for use. With this bug, pre-allocating some free space doesn't work.

Hi Chao,

If I understand you correctly, the issue you are describing would cause DPDK to fail initialization even with the reverse reordering that you are doing for PPC.

Basically (just showing relevant initialization steps):
1. map_all_hugepages(..., orig=1)
     - map all hugepages
2. find physical address for each hugepage
3. sort by physical address
4. map_all_hugepages(..., orig=0)
     - Now we try to get big chunk of virtual address for a block of contig hugepages
        so we know we have that virtual address chunk available.
     - Then we try to remap each page of that block of contig pages into that
        virtual address chunk.

So the issue you are describing would make step 4 fail regardless of the different ordering that PPC does.
I'm probably missing something, would you care to elaborate?

Sergio


> We're trying to create some test case and report it as a bug to kernel 
> community.
>
> Here's some logs:
> ===============================
> EAL: Ask a virtual area of 0x10000000 bytes
> EAL: Virtual area found at 0x3fffa7000000 (size = 0x10000000)
> EAL: map_all_hugepages, /mnt/huge/rtemap_52,paddr 0x3ca6000000  
> requested
> addr: 0x3fffa7000000  mmaped addr: 0x3efff0000000
> EAL: map_all_hugepages, /mnt/huge/rtemap_53,paddr 0x3ca5000000  
> requested
> addr: 0x3fffa8000000  mmaped addr: 0x3effef000000
> EAL: map_all_hugepages, /mnt/huge/rtemap_54,paddr 0x3ca4000000  
> requested
> addr: 0x3fffa9000000  mmaped addr: 0x3effee000000
> EAL: map_all_hugepages, /mnt/huge/rtemap_55,paddr 0x3ca3000000  
> requested
> addr: 0x3fffaa000000  mmaped addr: 0x3effed000000
> EAL: map_all_hugepages, /mnt/huge/rtemap_56,paddr 0x3ca2000000  
> requested
> addr: 0x3fffab000000  mmaped addr: 0x3effec000000
> EAL: map_all_hugepages, /mnt/huge/rtemap_57,paddr 0x3ca1000000  
> requested
> addr: 0x3fffac000000  mmaped addr: 0x3effeb000000
> EAL: map_all_hugepages, /mnt/huge/rtemap_58,paddr 0x3ca0000000  
> requested
> addr: 0x3fffad000000  mmaped addr: 0x3effea000000
> EAL: map_all_hugepages, /mnt/huge/rtemap_59,paddr 0x3c9f000000  
> requested
> addr: 0x3fffae000000  mmaped addr: 0x3effe9000000
> EAL: map_all_hugepages, /mnt/huge/rtemap_60,paddr 0x3c9e000000  
> requested
> addr: 0x3fffaf000000  mmaped addr: 0x3effe8000000
> EAL: map_all_hugepages, /mnt/huge/rtemap_61,paddr 0x3c9d000000  
> requested
> addr: 0x3fffb0000000  mmaped addr: 0x3effe7000000
> EAL: map_all_hugepages, /mnt/huge/rtemap_62, paddr 0x3c9c000000 
> requested
> addr:  0x3fffb1000000 mmaped addr:  0x3effe6000000
> EAL: map_all_hugepages, /mnt/huge/rtemap_63, paddr 0x3c9b000000 
> requested
> addr:  0x3fffb2000000 mmaped addr:  0x3effe5000000
> EAL: map_all_hugepages, /mnt/huge/rtemap_51, paddr 0x3c9a000000 
> requested
> addr:  0x3fffb3000000 mmaped addr:  0x3effe4000000
> EAL: map_all_hugepages, /mnt/huge/rtemap_50, paddr 0x3c99000000 
> requested
> addr:  0x3fffb4000000 mmaped addr:  0x3effe3000000
> EAL: map_all_hugepages, /mnt/huge/rtemap_49, paddr 0x3c98000000 
> requested
> addr:  0x3fffb5000000 mmaped addr:  0x3effe2000000
> EAL: map_all_hugepages, /mnt/huge/rtemap_48, paddr 0x3c97000000 
> requested
> addr:  0x3fffb6000000 mmaped addr:  0x3effe1000000
>
> # cat /proc/143765/maps
> 01000000-02000000 rw-s 00000000 00:27 61162550
> /mnt/huge/rtemap_14
> 02000000-03000000 rw-s 00000000 00:27 61162536
> /mnt/huge/rtemap_0
> 03000000-04000000 rw-s 00000000 00:27 61162537
> /mnt/huge/rtemap_1
> 04000000-05000000 rw-s 00000000 00:27 61162538
> /mnt/huge/rtemap_2
> 05000000-06000000 rw-s 00000000 00:27 61162539
> /mnt/huge/rtemap_3
> 06000000-07000000 rw-s 00000000 00:27 61162540
> /mnt/huge/rtemap_4
> 07000000-08000000 rw-s 00000000 00:27 61162541
> /mnt/huge/rtemap_5
> 08000000-09000000 rw-s 00000000 00:27 61162542
> /mnt/huge/rtemap_6
> 09000000-0a000000 rw-s 00000000 00:27 61162543
> /mnt/huge/rtemap_7
> 0a000000-0b000000 rw-s 00000000 00:27 61162544
> /mnt/huge/rtemap_8
> 0b000000-0c000000 rw-s 00000000 00:27 61162545
> /mnt/huge/rtemap_9
> 0c000000-0d000000 rw-s 00000000 00:27 61162546
> /mnt/huge/rtemap_10
> 0d000000-0e000000 rw-s 00000000 00:27 61162547
> /mnt/huge/rtemap_11
> 0e000000-0f000000 rw-s 00000000 00:27 61162548
> /mnt/huge/rtemap_12
> 0f000000-10000000 rw-s 00000000 00:27 61162549
> /mnt/huge/rtemap_13
> 10000000-101f0000 r-xp 00000000 08:32 6040458 
> /home/dpdk/build/app/test
> 101f0000-10220000 rw-p 001f0000 08:32 6040458 
> /home/dpdk/build/app/test
> 10220000-15c20000 rw-p 00000000 00:00 0 [heap]
> 20000000-21000000 rw-s 00000000 00:27 61162566
> /mnt/huge/rtemap_30
> 21000000-22000000 rw-s 00000000 00:27 61162567
> /mnt/huge/rtemap_31
> 22000000-23000000 rw-s 00000000 00:27 61162568
> /mnt/huge/rtemap_32
> 23000000-24000000 rw-s 00000000 00:27 61162569
> /mnt/huge/rtemap_33
> 24000000-25000000 rw-s 00000000 00:27 61162570
> /mnt/huge/rtemap_34
> 25000000-26000000 rw-s 00000000 00:27 61162571
> /mnt/huge/rtemap_35
> 26000000-27000000 rw-s 00000000 00:27 61162572
> /mnt/huge/rtemap_36
> 27000000-28000000 rw-s 00000000 00:27 61162573
> /mnt/huge/rtemap_37
> 28000000-29000000 rw-s 00000000 00:27 61162574
> /mnt/huge/rtemap_38
> 29000000-2a000000 rw-s 00000000 00:27 61162575
> /mnt/huge/rtemap_39
> 2a000000-2b000000 rw-s 00000000 00:27 61162576
> /mnt/huge/rtemap_40
> 2b000000-2c000000 rw-s 00000000 00:27 61162577
> /mnt/huge/rtemap_41
> 2c000000-2d000000 rw-s 00000000 00:27 61162578
> /mnt/huge/rtemap_42
> 2d000000-2e000000 rw-s 00000000 00:27 61162579
> /mnt/huge/rtemap_43
> 2e000000-2f000000 rw-s 00000000 00:27 61162580
> /mnt/huge/rtemap_44
> 2f000000-30000000 rw-s 00000000 00:27 61162581
> /mnt/huge/rtemap_45
> 30000000-31000000 rw-s 00000000 00:27 61162582
> /mnt/huge/rtemap_46
> 31000000-32000000 rw-s 00000000 00:27 61162583
> /mnt/huge/rtemap_47
> 32000000-33000000 rw-s 00000000 00:27 61162584
> /mnt/huge/rtemap_48
> 33000000-34000000 rw-s 00000000 00:27 61162585
> /mnt/huge/rtemap_49
> 34000000-35000000 rw-s 00000000 00:27 61162586
> /mnt/huge/rtemap_50
> 35000000-36000000 rw-s 00000000 00:27 61162587
> /mnt/huge/rtemap_51
> 36000000-37000000 rw-s 00000000 00:27 61162588
> /mnt/huge/rtemap_52
> 37000000-38000000 rw-s 00000000 00:27 61162589
> /mnt/huge/rtemap_53
> 38000000-39000000 rw-s 00000000 00:27 61162590
> /mnt/huge/rtemap_54
> 39000000-3a000000 rw-s 00000000 00:27 61162591
> /mnt/huge/rtemap_55
> 3a000000-3b000000 rw-s 00000000 00:27 61162592
> /mnt/huge/rtemap_56
> 3b000000-3c000000 rw-s 00000000 00:27 61162593
> /mnt/huge/rtemap_57
> 3c000000-3d000000 rw-s 00000000 00:27 61162594
> /mnt/huge/rtemap_58
> 3d000000-3e000000 rw-s 00000000 00:27 61162595
> /mnt/huge/rtemap_59
> 3e000000-3f000000 rw-s 00000000 00:27 61162596
> /mnt/huge/rtemap_60
> 3f000000-40000000 rw-s 00000000 00:27 61162597
> /mnt/huge/rtemap_61
> 40000000-41000000 rw-s 00000000 00:27 61162598
> /mnt/huge/rtemap_62
> 41000000-42000000 rw-s 00000000 00:27 61162599
> /mnt/huge/rtemap_63
> 3effb1000000-3effb2000000 rw-s 00000000 00:27 61162541
> /mnt/huge/rtemap_5
> 3effb2000000-3effb3000000 rw-s 00000000 00:27 61162540
> /mnt/huge/rtemap_4
> 3effb3000000-3effb4000000 rw-s 00000000 00:27 61162551
> /mnt/huge/rtemap_15
> 3effb4000000-3effb5000000 rw-s 00000000 00:27 61162538
> /mnt/huge/rtemap_2
> 3effb5000000-3effb6000000 rw-s 00000000 00:27 61162549
> /mnt/huge/rtemap_13
> 3effb6000000-3effb7000000 rw-s 00000000 00:27 61162544
> /mnt/huge/rtemap_8
> 3effb7000000-3effb8000000 rw-s 00000000 00:27 61162543
> /mnt/huge/rtemap_7
> 3effb8000000-3effb9000000 rw-s 00000000 00:27 61162548
> /mnt/huge/rtemap_12
> 3effb9000000-3effba000000 rw-s 00000000 00:27 61162537
> /mnt/huge/rtemap_1
> 3effba000000-3effbb000000 rw-s 00000000 00:27 61162550
> /mnt/huge/rtemap_14
> 3effbb000000-3effbc000000 rw-s 00000000 00:27 61162545
> /mnt/huge/rtemap_9
> 3effbc000000-3effbd000000 rw-s 00000000 00:27 61162546
> /mnt/huge/rtemap_10
> 3effbd000000-3effbe000000 rw-s 00000000 00:27 61162547
> /mnt/huge/rtemap_11
> 3effbe000000-3effbf000000 rw-s 00000000 00:27 61162539
> /mnt/huge/rtemap_3
> 3effbf000000-3effc0000000 rw-s 00000000 00:27 61162542
> /mnt/huge/rtemap_6
> 3effc0000000-3effc1000000 rw-s 00000000 00:27 61162536
> /mnt/huge/rtemap_0
> 3effc1000000-3effc2000000 rw-s 00000000 00:27 61162556
> /mnt/huge/rtemap_20
> 3effc2000000-3effc3000000 rw-s 00000000 00:27 61162552
> /mnt/huge/rtemap_16
> 3effc3000000-3effc4000000 rw-s 00000000 00:27 61162553
> /mnt/huge/rtemap_17
> 3effc4000000-3effc5000000 rw-s 00000000 00:27 61162554
> /mnt/huge/rtemap_18
> 3effc5000000-3effc6000000 rw-s 00000000 00:27 61162555
> /mnt/huge/rtemap_19
> 3effc6000000-3effc7000000 rw-s 00000000 00:27 61162567
> /mnt/huge/rtemap_31
> 3effc7000000-3effc8000000 rw-s 00000000 00:27 61162566
> /mnt/huge/rtemap_30
> 3effc8000000-3effc9000000 rw-s 00000000 00:27 61162558
> /mnt/huge/rtemap_22
> 3effc9000000-3effca000000 rw-s 00000000 00:27 61162557
> /mnt/huge/rtemap_21
> 3effca000000-3effcb000000 rw-s 00000000 00:27 61162560
> /mnt/huge/rtemap_24
> 3effcb000000-3effcc000000 rw-s 00000000 00:27 61162561
> /mnt/huge/rtemap_25
> 3effcc000000-3effcd000000 rw-s 00000000 00:27 61162564
> /mnt/huge/rtemap_28
> 3effcd000000-3effce000000 rw-s 00000000 00:27 61162559
> /mnt/huge/rtemap_23
> 3effce000000-3effcf000000 rw-s 00000000 00:27 61162563
> /mnt/huge/rtemap_27
> 3effcf000000-3effd0000000 rw-s 00000000 00:27 61162562
> /mnt/huge/rtemap_26
> 3effd0000000-3effd1000000 rw-s 00000000 00:27 61162565
> /mnt/huge/rtemap_29
> 3effd1000000-3effd2000000 rw-s 00000000 00:27 61162572
> /mnt/huge/rtemap_36
> 3effd2000000-3effd3000000 rw-s 00000000 00:27 61162568
> /mnt/huge/rtemap_32
> 3effd3000000-3effd4000000 rw-s 00000000 00:27 61162569
> /mnt/huge/rtemap_33
> 3effd4000000-3effd5000000 rw-s 00000000 00:27 61162570
> /mnt/huge/rtemap_34
> 3effd5000000-3effd6000000 rw-s 00000000 00:27 61162571
> /mnt/huge/rtemap_35
> 3effd6000000-3effd7000000 rw-s 00000000 00:27 61162583
> /mnt/huge/rtemap_47
> 3effd7000000-3effd8000000 rw-s 00000000 00:27 61162582
> /mnt/huge/rtemap_46
> 3effd8000000-3effd9000000 rw-s 00000000 00:27 61162581
> /mnt/huge/rtemap_45
> 3effd9000000-3effda000000 rw-s 00000000 00:27 61162580
> /mnt/huge/rtemap_44
> 3effda000000-3effdb000000 rw-s 00000000 00:27 61162579
> /mnt/huge/rtemap_43
> 3effdb000000-3effdc000000 rw-s 00000000 00:27 61162578
> /mnt/huge/rtemap_42
> 3effdc000000-3effdd000000 rw-s 00000000 00:27 61162577
> /mnt/huge/rtemap_41
> 3effdd000000-3effde000000 rw-s 00000000 00:27 61162574
> /mnt/huge/rtemap_38
> 3effde000000-3effdf000000 rw-s 00000000 00:27 61162573
> /mnt/huge/rtemap_37
> 3effdf000000-3effe0000000 rw-s 00000000 00:27 61162575
> /mnt/huge/rtemap_39
> 3effe0000000-3effe1000000 rw-s 00000000 00:27 61162576
> /mnt/huge/rtemap_40
> 3effe1000000-3effe2000000 rw-s 00000000 00:27 61162584
> /mnt/huge/rtemap_48
> 3effe2000000-3effe3000000 rw-s 00000000 00:27 61162585
> /mnt/huge/rtemap_49
> 3effe3000000-3effe4000000 rw-s 00000000 00:27 61162586
> /mnt/huge/rtemap_50
> 3effe4000000-3effe5000000 rw-s 00000000 00:27 61162587
> /mnt/huge/rtemap_51
> 3effe5000000-3effe6000000 rw-s 00000000 00:27 61162599
> /mnt/huge/rtemap_63
> 3effe6000000-3effe7000000 rw-s 00000000 00:27 61162598
> /mnt/huge/rtemap_62
> 3effe7000000-3effe8000000 rw-s 00000000 00:27 61162597
> /mnt/huge/rtemap_61
> 3effe8000000-3effe9000000 rw-s 00000000 00:27 61162596
> /mnt/huge/rtemap_60
> 3effe9000000-3effea000000 rw-s 00000000 00:27 61162595
> /mnt/huge/rtemap_59
> 3effea000000-3effeb000000 rw-s 00000000 00:27 61162594
> /mnt/huge/rtemap_58
> 3effeb000000-3effec000000 rw-s 00000000 00:27 61162593
> /mnt/huge/rtemap_57
> 3effec000000-3effed000000 rw-s 00000000 00:27 61162592
> /mnt/huge/rtemap_56
> 3effed000000-3effee000000 rw-s 00000000 00:27 61162591
> /mnt/huge/rtemap_55
> 3effee000000-3effef000000 rw-s 00000000 00:27 61162590
> /mnt/huge/rtemap_54
> 3effef000000-3efff0000000 rw-s 00000000 00:27 61162589
> /mnt/huge/rtemap_53
> 3efff0000000-3efff1000000 rw-s 00000000 00:27 61162588
> /mnt/huge/rtemap_52
> 3efff1000000-3efff2000000 rw-s 00000000 00:27 61162565
> /mnt/huge/rtemap_29
> 3efff2000000-3efff3000000 rw-s 00000000 00:27 61162564
> /mnt/huge/rtemap_28
> 3efff3000000-3efff4000000 rw-s 00000000 00:27 61162563
> /mnt/huge/rtemap_27
> 3efff4000000-3efff5000000 rw-s 00000000 00:27 61162562
> /mnt/huge/rtemap_26
> 3efff5000000-3efff6000000 rw-s 00000000 00:27 61162561
> /mnt/huge/rtemap_25
> 3efff6000000-3efff7000000 rw-s 00000000 00:27 61162560
> /mnt/huge/rtemap_24
> 3efff7000000-3efff8000000 rw-s 00000000 00:27 61162559
> /mnt/huge/rtemap_23
> 3efff8000000-3efff9000000 rw-s 00000000 00:27 61162558
> /mnt/huge/rtemap_22
> 3efff9000000-3efffa000000 rw-s 00000000 00:27 61162557
> /mnt/huge/rtemap_21
> 3efffa000000-3efffb000000 rw-s 00000000 00:27 61162556
> /mnt/huge/rtemap_20
> 3efffb000000-3efffc000000 rw-s 00000000 00:27 61162555
> /mnt/huge/rtemap_19
> 3efffc000000-3efffd000000 rw-s 00000000 00:27 61162554
> /mnt/huge/rtemap_18
> 3efffd000000-3efffe000000 rw-s 00000000 00:27 61162553
> /mnt/huge/rtemap_17
> 3efffe000000-3effff000000 rw-s 00000000 00:27 61162552
> /mnt/huge/rtemap_16
> 3effff000000-3f0000000000 rw-s 00000000 00:27 61162551
> /mnt/huge/rtemap_15
> 3fffb7bc0000-3fffb7c10000 rw-p 00000000 00:00 0
> 3fffb7c10000-3fffb7c50000 rw-s 00000000 00:12 3926240 /run/.rte_config
> 3fffb7c50000-3fffb7c70000 rw-p 00000000 00:00 0
> 3fffb7c70000-3fffb7e20000 r-xp 00000000 08:32 7090531 
> /opt/at7.1/lib64/power8/libc-2.19.so
> 3fffb7e20000-3fffb7e30000 rw-p 001a0000 08:32 7090531 
> /opt/at7.1/lib64/power8/libc-2.19.so
> 3fffb7e30000-3fffb7e50000 rw-p 00000000 00:00 0
> 3fffb7e50000-3fffb7e70000 r-xp 00000000 08:32 7090563 
> /opt/at7.1/lib64/power8/libpthread-2.19.so
> 3fffb7e70000-3fffb7e80000 rw-p 00010000 08:32 7090563 
> /opt/at7.1/lib64/power8/libpthread-2.19.so
> 3fffb7e80000-3fffb7e90000 r-xp 00000000 08:32 7090210 
> /opt/at7.1/lib64/libdl-2.19.so
> 3fffb7e90000-3fffb7ea0000 rw-p 00000000 08:32 7090210 
> /opt/at7.1/lib64/libdl-2.19.so
> 3fffb7ea0000-3fffb7ec0000 r-xp 00000000 08:32 7090533
> /opt/at7.1/lib64/power8/libz.so.1.2.6
> 3fffb7ec0000-3fffb7ed0000 rw-p 00010000 08:32 7090533
> /opt/at7.1/lib64/power8/libz.so.1.2.6
> 3fffb7ed0000-3fffb7f90000 r-xp 00000000 08:32 7090568 
> /opt/at7.1/lib64/power8/libm-2.19.so
> 3fffb7f90000-3fffb7fa0000 rw-p 000b0000 08:32 7090568 
> /opt/at7.1/lib64/power8/libm-2.19.so
> 3fffb7fa0000-3fffb7fc0000 r-xp 00000000 00:00 0 [vdso]
> 3fffb7fc0000-3fffb7ff0000 r-xp 00000000 08:32 7090048 
> /opt/at7.1/lib64/ld-2.19.so
> 3fffb7ff0000-3fffb8000000 rw-p 00020000 08:32 7090048 
> /opt/at7.1/lib64/ld-2.19.so
> 3ffffffd0000-400000000000 rw-p 00000000 00:00 0 [stack]
>
>
> -----Original Message-----
> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> Sent: 2016年3月23日 1:11
> To: Sergio Gonzalez Monroy <sergio.gonzalez.monroy@intel.com>
> Cc: Gowrishankar <gowrishankar.m@linux.vnet.ibm.com>; dev@dpdk.org; 
> chaozhu@linux.vnet.ibm.com; David Marchand <david.marchand@6wind.com>
> Subject: Re: [dpdk-dev] [PATCH] eal/ppc: fix secondary process to map 
> hugepages in correct order
>
> On Tue, Mar 22, 2016 at 04:35:32PM +0000, Sergio Gonzalez Monroy wrote:
>> First of all, forgive my ignorance regarding ppc64 and if the 
>> questions are naive but after having a look to the already existing 
>> code for ppc64 and this patch now, why are we doing this reverse 
>> mapping at all?
>>
>> I guess the question revolves around the comment in eal_memory.c:
>> 1316                 /* On PPC64 architecture, the mmap always start from
>> higher
>> 1317                  * virtual address to lower address. Here, both the
>> physical
>> 1318                  * address and virtual address are in descending
> order
>> */
>>
>>  From looking at the code, for ppc64 we do qsort in reverse order and 
>> thereafter everything looks to be done to account for that reverse 
>> sorting.
>>
>> CC: Chao Zhu and David Marchand as original author and reviewer of 
>> the
> code.
>> Sergio
>>
> Just to add my 2c here. At one point, with I believe some i686 
> installs - don't remember the specific OS/kernel, we found that the 
> mmap calls were returning the highest free address first and then 
> working downwards - much like what seems to be described here. To fix this 
> we changed the mmap code from assuming that addresses are mapped 
> upwards, to instead explicitly requesting a large free block of memory 
> (mmap of /dev/zero) to find a free address space range of the correct 
> size, and then explicitly mmapping each individual page to the 
> appropriate place in that free range. With this scheme it didn't 
> matter whether the OS tried to mmap the pages from the highest or 
> lowest address because we always told the OS where to put the page (and we knew the slot was free from the earlier block mmap).
> Would this scheme not also work for PPC in a similar way? (Again, 
> forgive unfamiliarity with PPC! :-) )
>
> /Bruce
>
>> On 07/03/2016 14:13, Gowrishankar wrote:
>>> From: Gowri Shankar <gowrishankar.m@linux.vnet.ibm.com>
>>>
>>> For a secondary process address space to map hugepages from every 
>>> segment of primary process, hugepage_file entries has to be mapped 
>>> reversely from the list that primary process updated for every 
>>> segment. This is for a reason that, in ppc64, hugepages are sorted 
>>> for
> decrementing addresses.
>>> Signed-off-by: Gowrishankar <gowrishankar.m@linux.vnet.ibm.com>
>>> ---
  
Sergio Gonzalez Monroy May 20, 2016, 10:25 a.m. UTC | #9
On 20/05/2016 09:41, Chao Zhu wrote:
> Sergio,
>
> The step 4 will not fail because each huge page will get a virtual address finally, though it's a different address. If you take a look at the function rte_eal_hugepage_init(), in the last loop, it uses both physical address and virtual address to determine a new memory segment. This step can make sure that the initialization is correct. What I want to say is, this bug also influences the secondary process in function rte_eal_hugepage_attach(). It can make the secondary process fail to init. I'm trying to figure out how to make it work.

You are right, I misread the code.

So basically because mmap ignores the hint to mmap on the requested address,
by default we get VA maps in decreasing address order.

Knowing that, PPC orders pages by decreasing physical address order so when
this happens we actually get hugepages in order in the "new" final_va.

Not sure if that makes sense but I think I understand where you are 
coming from.

I think we need to document this as a known issue and/or add comments regarding
this behavior, basically calling out that all this "reverse-ordering" 
is required
because mmap fails to map on the requested VA.

Thanks,
Sergio

> -----Original Message-----
> From: Sergio Gonzalez Monroy [mailto:sergio.gonzalez.monroy@intel.com]
> Sent: 2016年5月20日 16:01
> To: Chao Zhu <chaozhu@linux.vnet.ibm.com>; 'Bruce Richardson' <bruce.richardson@intel.com>
> Cc: 'Gowrishankar' <gowrishankar.m@linux.vnet.ibm.com>; dev@dpdk.org; 'David Marchand' <david.marchand@6wind.com>
> Subject: Re: [dpdk-dev] [PATCH] eal/ppc: fix secondary process to map hugepages in correct order
>
> On 20/05/2016 04:03, Chao Zhu wrote:
>> Bruce,
>>
>> Recently, we find some bugs with mmap in PowerLinux. The mmap doesn't
>> respect the address hints. In function get_virtual_area() in
>> eal_memory.c, mmap get the free virtual address range as the address
>> hint. However, when mapping the real memory in
>> rte_eal_hugepage_init(), mmap doesn't return the same address as the
>> requested address. When taking a look at the /proc/<pid>/maps, the
>> requested address range is free for use. With this bug, pre-allocating some free space doesn't work.
> Hi Chao,
>
> If I understand you correctly, the issue you are describing would cause DPDK to fail initialization even with the reverse reordering that you are doing for PPC.
>
> Basically (just showing relevant initialization steps):
> 1. map_all_hugepages(..., orig=1)
>       - map all hugepages
> 2. find physical address for each hugepage
> 3. sort by physical address
> 4. map_all_hugepages(..., orig=0)
>       - Now we try to get big chunk of virtual address for a block of contig hugepages
>          so we know we have that virtual address chunk available.
>       - Then we try to remap each page of that block of contig pages into that
>          virtual address chunk.
>
> So the issue you are describing would make step 4 fail regardless of the different ordering that PPC does.
> I'm probably missing something, would you care to elaborate?
>
> Sergio
>
>
>> We're trying to create some test case and report it as a bug to kernel
>> community.
>>
>> Here's some logs:
>> ===============================
>> EAL: Ask a virtual area of 0x10000000 bytes
>> EAL: Virtual area found at 0x3fffa7000000 (size = 0x10000000)
>> EAL: map_all_hugepages, /mnt/huge/rtemap_52,paddr 0x3ca6000000
>> requested
>> addr: 0x3fffa7000000  mmaped addr: 0x3efff0000000
>> EAL: map_all_hugepages, /mnt/huge/rtemap_53,paddr 0x3ca5000000
>> requested
>> addr: 0x3fffa8000000  mmaped addr: 0x3effef000000
>> EAL: map_all_hugepages, /mnt/huge/rtemap_54,paddr 0x3ca4000000
>> requested
>> addr: 0x3fffa9000000  mmaped addr: 0x3effee000000
>> EAL: map_all_hugepages, /mnt/huge/rtemap_55,paddr 0x3ca3000000
>> requested
>> addr: 0x3fffaa000000  mmaped addr: 0x3effed000000
>> EAL: map_all_hugepages, /mnt/huge/rtemap_56,paddr 0x3ca2000000
>> requested
>> addr: 0x3fffab000000  mmaped addr: 0x3effec000000
>> EAL: map_all_hugepages, /mnt/huge/rtemap_57,paddr 0x3ca1000000
>> requested
>> addr: 0x3fffac000000  mmaped addr: 0x3effeb000000
>> EAL: map_all_hugepages, /mnt/huge/rtemap_58,paddr 0x3ca0000000
>> requested
>> addr: 0x3fffad000000  mmaped addr: 0x3effea000000
>> EAL: map_all_hugepages, /mnt/huge/rtemap_59,paddr 0x3c9f000000
>> requested
>> addr: 0x3fffae000000  mmaped addr: 0x3effe9000000
>> EAL: map_all_hugepages, /mnt/huge/rtemap_60,paddr 0x3c9e000000
>> requested
>> addr: 0x3fffaf000000  mmaped addr: 0x3effe8000000
>> EAL: map_all_hugepages, /mnt/huge/rtemap_61,paddr 0x3c9d000000
>> requested
>> addr: 0x3fffb0000000  mmaped addr: 0x3effe7000000
>> EAL: map_all_hugepages, /mnt/huge/rtemap_62, paddr 0x3c9c000000
>> requested
>> addr:  0x3fffb1000000 mmaped addr:  0x3effe6000000
>> EAL: map_all_hugepages, /mnt/huge/rtemap_63, paddr 0x3c9b000000
>> requested
>> addr:  0x3fffb2000000 mmaped addr:  0x3effe5000000
>> EAL: map_all_hugepages, /mnt/huge/rtemap_51, paddr 0x3c9a000000
>> requested
>> addr:  0x3fffb3000000 mmaped addr:  0x3effe4000000
>> EAL: map_all_hugepages, /mnt/huge/rtemap_50, paddr 0x3c99000000
>> requested
>> addr:  0x3fffb4000000 mmaped addr:  0x3effe3000000
>> EAL: map_all_hugepages, /mnt/huge/rtemap_49, paddr 0x3c98000000
>> requested
>> addr:  0x3fffb5000000 mmaped addr:  0x3effe2000000
>> EAL: map_all_hugepages, /mnt/huge/rtemap_48, paddr 0x3c97000000
>> requested
>> addr:  0x3fffb6000000 mmaped addr:  0x3effe1000000
>>
>> # cat /proc/143765/maps
>> 01000000-02000000 rw-s 00000000 00:27 61162550
>> /mnt/huge/rtemap_14
>> 02000000-03000000 rw-s 00000000 00:27 61162536
>> /mnt/huge/rtemap_0
>> 03000000-04000000 rw-s 00000000 00:27 61162537
>> /mnt/huge/rtemap_1
>> 04000000-05000000 rw-s 00000000 00:27 61162538
>> /mnt/huge/rtemap_2
>> 05000000-06000000 rw-s 00000000 00:27 61162539
>> /mnt/huge/rtemap_3
>> 06000000-07000000 rw-s 00000000 00:27 61162540
>> /mnt/huge/rtemap_4
>> 07000000-08000000 rw-s 00000000 00:27 61162541
>> /mnt/huge/rtemap_5
>> 08000000-09000000 rw-s 00000000 00:27 61162542
>> /mnt/huge/rtemap_6
>> 09000000-0a000000 rw-s 00000000 00:27 61162543
>> /mnt/huge/rtemap_7
>> 0a000000-0b000000 rw-s 00000000 00:27 61162544
>> /mnt/huge/rtemap_8
>> 0b000000-0c000000 rw-s 00000000 00:27 61162545
>> /mnt/huge/rtemap_9
>> 0c000000-0d000000 rw-s 00000000 00:27 61162546
>> /mnt/huge/rtemap_10
>> 0d000000-0e000000 rw-s 00000000 00:27 61162547
>> /mnt/huge/rtemap_11
>> 0e000000-0f000000 rw-s 00000000 00:27 61162548
>> /mnt/huge/rtemap_12
>> 0f000000-10000000 rw-s 00000000 00:27 61162549
>> /mnt/huge/rtemap_13
>> 10000000-101f0000 r-xp 00000000 08:32 6040458
>> /home/dpdk/build/app/test
>> 101f0000-10220000 rw-p 001f0000 08:32 6040458
>> /home/dpdk/build/app/test
>> 10220000-15c20000 rw-p 00000000 00:00 0 [heap]
>> 20000000-21000000 rw-s 00000000 00:27 61162566
>> /mnt/huge/rtemap_30
>> 21000000-22000000 rw-s 00000000 00:27 61162567
>> /mnt/huge/rtemap_31
>> 22000000-23000000 rw-s 00000000 00:27 61162568
>> /mnt/huge/rtemap_32
>> 23000000-24000000 rw-s 00000000 00:27 61162569
>> /mnt/huge/rtemap_33
>> 24000000-25000000 rw-s 00000000 00:27 61162570
>> /mnt/huge/rtemap_34
>> 25000000-26000000 rw-s 00000000 00:27 61162571
>> /mnt/huge/rtemap_35
>> 26000000-27000000 rw-s 00000000 00:27 61162572
>> /mnt/huge/rtemap_36
>> 27000000-28000000 rw-s 00000000 00:27 61162573
>> /mnt/huge/rtemap_37
>> 28000000-29000000 rw-s 00000000 00:27 61162574
>> /mnt/huge/rtemap_38
>> 29000000-2a000000 rw-s 00000000 00:27 61162575
>> /mnt/huge/rtemap_39
>> 2a000000-2b000000 rw-s 00000000 00:27 61162576
>> /mnt/huge/rtemap_40
>> 2b000000-2c000000 rw-s 00000000 00:27 61162577
>> /mnt/huge/rtemap_41
>> 2c000000-2d000000 rw-s 00000000 00:27 61162578
>> /mnt/huge/rtemap_42
>> 2d000000-2e000000 rw-s 00000000 00:27 61162579
>> /mnt/huge/rtemap_43
>> 2e000000-2f000000 rw-s 00000000 00:27 61162580
>> /mnt/huge/rtemap_44
>> 2f000000-30000000 rw-s 00000000 00:27 61162581
>> /mnt/huge/rtemap_45
>> 30000000-31000000 rw-s 00000000 00:27 61162582
>> /mnt/huge/rtemap_46
>> 31000000-32000000 rw-s 00000000 00:27 61162583
>> /mnt/huge/rtemap_47
>> 32000000-33000000 rw-s 00000000 00:27 61162584
>> /mnt/huge/rtemap_48
>> 33000000-34000000 rw-s 00000000 00:27 61162585
>> /mnt/huge/rtemap_49
>> 34000000-35000000 rw-s 00000000 00:27 61162586
>> /mnt/huge/rtemap_50
>> 35000000-36000000 rw-s 00000000 00:27 61162587
>> /mnt/huge/rtemap_51
>> 36000000-37000000 rw-s 00000000 00:27 61162588
>> /mnt/huge/rtemap_52
>> 37000000-38000000 rw-s 00000000 00:27 61162589
>> /mnt/huge/rtemap_53
>> 38000000-39000000 rw-s 00000000 00:27 61162590
>> /mnt/huge/rtemap_54
>> 39000000-3a000000 rw-s 00000000 00:27 61162591
>> /mnt/huge/rtemap_55
>> 3a000000-3b000000 rw-s 00000000 00:27 61162592
>> /mnt/huge/rtemap_56
>> 3b000000-3c000000 rw-s 00000000 00:27 61162593
>> /mnt/huge/rtemap_57
>> 3c000000-3d000000 rw-s 00000000 00:27 61162594
>> /mnt/huge/rtemap_58
>> 3d000000-3e000000 rw-s 00000000 00:27 61162595
>> /mnt/huge/rtemap_59
>> 3e000000-3f000000 rw-s 00000000 00:27 61162596
>> /mnt/huge/rtemap_60
>> 3f000000-40000000 rw-s 00000000 00:27 61162597
>> /mnt/huge/rtemap_61
>> 40000000-41000000 rw-s 00000000 00:27 61162598
>> /mnt/huge/rtemap_62
>> 41000000-42000000 rw-s 00000000 00:27 61162599
>> /mnt/huge/rtemap_63
>> 3effb1000000-3effb2000000 rw-s 00000000 00:27 61162541
>> /mnt/huge/rtemap_5
>> 3effb2000000-3effb3000000 rw-s 00000000 00:27 61162540
>> /mnt/huge/rtemap_4
>> 3effb3000000-3effb4000000 rw-s 00000000 00:27 61162551
>> /mnt/huge/rtemap_15
>> 3effb4000000-3effb5000000 rw-s 00000000 00:27 61162538
>> /mnt/huge/rtemap_2
>> 3effb5000000-3effb6000000 rw-s 00000000 00:27 61162549
>> /mnt/huge/rtemap_13
>> 3effb6000000-3effb7000000 rw-s 00000000 00:27 61162544
>> /mnt/huge/rtemap_8
>> 3effb7000000-3effb8000000 rw-s 00000000 00:27 61162543
>> /mnt/huge/rtemap_7
>> 3effb8000000-3effb9000000 rw-s 00000000 00:27 61162548
>> /mnt/huge/rtemap_12
>> 3effb9000000-3effba000000 rw-s 00000000 00:27 61162537
>> /mnt/huge/rtemap_1
>> 3effba000000-3effbb000000 rw-s 00000000 00:27 61162550
>> /mnt/huge/rtemap_14
>> 3effbb000000-3effbc000000 rw-s 00000000 00:27 61162545
>> /mnt/huge/rtemap_9
>> 3effbc000000-3effbd000000 rw-s 00000000 00:27 61162546
>> /mnt/huge/rtemap_10
>> 3effbd000000-3effbe000000 rw-s 00000000 00:27 61162547
>> /mnt/huge/rtemap_11
>> 3effbe000000-3effbf000000 rw-s 00000000 00:27 61162539
>> /mnt/huge/rtemap_3
>> 3effbf000000-3effc0000000 rw-s 00000000 00:27 61162542
>> /mnt/huge/rtemap_6
>> 3effc0000000-3effc1000000 rw-s 00000000 00:27 61162536
>> /mnt/huge/rtemap_0
>> 3effc1000000-3effc2000000 rw-s 00000000 00:27 61162556
>> /mnt/huge/rtemap_20
>> 3effc2000000-3effc3000000 rw-s 00000000 00:27 61162552
>> /mnt/huge/rtemap_16
>> 3effc3000000-3effc4000000 rw-s 00000000 00:27 61162553
>> /mnt/huge/rtemap_17
>> 3effc4000000-3effc5000000 rw-s 00000000 00:27 61162554
>> /mnt/huge/rtemap_18
>> 3effc5000000-3effc6000000 rw-s 00000000 00:27 61162555
>> /mnt/huge/rtemap_19
>> 3effc6000000-3effc7000000 rw-s 00000000 00:27 61162567
>> /mnt/huge/rtemap_31
>> 3effc7000000-3effc8000000 rw-s 00000000 00:27 61162566
>> /mnt/huge/rtemap_30
>> 3effc8000000-3effc9000000 rw-s 00000000 00:27 61162558
>> /mnt/huge/rtemap_22
>> 3effc9000000-3effca000000 rw-s 00000000 00:27 61162557
>> /mnt/huge/rtemap_21
>> 3effca000000-3effcb000000 rw-s 00000000 00:27 61162560
>> /mnt/huge/rtemap_24
>> 3effcb000000-3effcc000000 rw-s 00000000 00:27 61162561
>> /mnt/huge/rtemap_25
>> 3effcc000000-3effcd000000 rw-s 00000000 00:27 61162564
>> /mnt/huge/rtemap_28
>> 3effcd000000-3effce000000 rw-s 00000000 00:27 61162559
>> /mnt/huge/rtemap_23
>> 3effce000000-3effcf000000 rw-s 00000000 00:27 61162563
>> /mnt/huge/rtemap_27
>> 3effcf000000-3effd0000000 rw-s 00000000 00:27 61162562
>> /mnt/huge/rtemap_26
>> 3effd0000000-3effd1000000 rw-s 00000000 00:27 61162565
>> /mnt/huge/rtemap_29
>> 3effd1000000-3effd2000000 rw-s 00000000 00:27 61162572
>> /mnt/huge/rtemap_36
>> 3effd2000000-3effd3000000 rw-s 00000000 00:27 61162568
>> /mnt/huge/rtemap_32
>> 3effd3000000-3effd4000000 rw-s 00000000 00:27 61162569
>> /mnt/huge/rtemap_33
>> 3effd4000000-3effd5000000 rw-s 00000000 00:27 61162570
>> /mnt/huge/rtemap_34
>> 3effd5000000-3effd6000000 rw-s 00000000 00:27 61162571
>> /mnt/huge/rtemap_35
>> 3effd6000000-3effd7000000 rw-s 00000000 00:27 61162583
>> /mnt/huge/rtemap_47
>> 3effd7000000-3effd8000000 rw-s 00000000 00:27 61162582
>> /mnt/huge/rtemap_46
>> 3effd8000000-3effd9000000 rw-s 00000000 00:27 61162581
>> /mnt/huge/rtemap_45
>> 3effd9000000-3effda000000 rw-s 00000000 00:27 61162580
>> /mnt/huge/rtemap_44
>> 3effda000000-3effdb000000 rw-s 00000000 00:27 61162579
>> /mnt/huge/rtemap_43
>> 3effdb000000-3effdc000000 rw-s 00000000 00:27 61162578
>> /mnt/huge/rtemap_42
>> 3effdc000000-3effdd000000 rw-s 00000000 00:27 61162577
>> /mnt/huge/rtemap_41
>> 3effdd000000-3effde000000 rw-s 00000000 00:27 61162574
>> /mnt/huge/rtemap_38
>> 3effde000000-3effdf000000 rw-s 00000000 00:27 61162573
>> /mnt/huge/rtemap_37
>> 3effdf000000-3effe0000000 rw-s 00000000 00:27 61162575
>> /mnt/huge/rtemap_39
>> 3effe0000000-3effe1000000 rw-s 00000000 00:27 61162576
>> /mnt/huge/rtemap_40
>> 3effe1000000-3effe2000000 rw-s 00000000 00:27 61162584
>> /mnt/huge/rtemap_48
>> 3effe2000000-3effe3000000 rw-s 00000000 00:27 61162585
>> /mnt/huge/rtemap_49
>> 3effe3000000-3effe4000000 rw-s 00000000 00:27 61162586
>> /mnt/huge/rtemap_50
>> 3effe4000000-3effe5000000 rw-s 00000000 00:27 61162587
>> /mnt/huge/rtemap_51
>> 3effe5000000-3effe6000000 rw-s 00000000 00:27 61162599
>> /mnt/huge/rtemap_63
>> 3effe6000000-3effe7000000 rw-s 00000000 00:27 61162598
>> /mnt/huge/rtemap_62
>> 3effe7000000-3effe8000000 rw-s 00000000 00:27 61162597
>> /mnt/huge/rtemap_61
>> 3effe8000000-3effe9000000 rw-s 00000000 00:27 61162596
>> /mnt/huge/rtemap_60
>> 3effe9000000-3effea000000 rw-s 00000000 00:27 61162595
>> /mnt/huge/rtemap_59
>> 3effea000000-3effeb000000 rw-s 00000000 00:27 61162594
>> /mnt/huge/rtemap_58
>> 3effeb000000-3effec000000 rw-s 00000000 00:27 61162593
>> /mnt/huge/rtemap_57
>> 3effec000000-3effed000000 rw-s 00000000 00:27 61162592
>> /mnt/huge/rtemap_56
>> 3effed000000-3effee000000 rw-s 00000000 00:27 61162591
>> /mnt/huge/rtemap_55
>> 3effee000000-3effef000000 rw-s 00000000 00:27 61162590
>> /mnt/huge/rtemap_54
>> 3effef000000-3efff0000000 rw-s 00000000 00:27 61162589
>> /mnt/huge/rtemap_53
>> 3efff0000000-3efff1000000 rw-s 00000000 00:27 61162588
>> /mnt/huge/rtemap_52
>> 3efff1000000-3efff2000000 rw-s 00000000 00:27 61162565
>> /mnt/huge/rtemap_29
>> 3efff2000000-3efff3000000 rw-s 00000000 00:27 61162564
>> /mnt/huge/rtemap_28
>> 3efff3000000-3efff4000000 rw-s 00000000 00:27 61162563
>> /mnt/huge/rtemap_27
>> 3efff4000000-3efff5000000 rw-s 00000000 00:27 61162562
>> /mnt/huge/rtemap_26
>> 3efff5000000-3efff6000000 rw-s 00000000 00:27 61162561
>> /mnt/huge/rtemap_25
>> 3efff6000000-3efff7000000 rw-s 00000000 00:27 61162560
>> /mnt/huge/rtemap_24
>> 3efff7000000-3efff8000000 rw-s 00000000 00:27 61162559
>> /mnt/huge/rtemap_23
>> 3efff8000000-3efff9000000 rw-s 00000000 00:27 61162558
>> /mnt/huge/rtemap_22
>> 3efff9000000-3efffa000000 rw-s 00000000 00:27 61162557
>> /mnt/huge/rtemap_21
>> 3efffa000000-3efffb000000 rw-s 00000000 00:27 61162556
>> /mnt/huge/rtemap_20
>> 3efffb000000-3efffc000000 rw-s 00000000 00:27 61162555
>> /mnt/huge/rtemap_19
>> 3efffc000000-3efffd000000 rw-s 00000000 00:27 61162554
>> /mnt/huge/rtemap_18
>> 3efffd000000-3efffe000000 rw-s 00000000 00:27 61162553
>> /mnt/huge/rtemap_17
>> 3efffe000000-3effff000000 rw-s 00000000 00:27 61162552
>> /mnt/huge/rtemap_16
>> 3effff000000-3f0000000000 rw-s 00000000 00:27 61162551
>> /mnt/huge/rtemap_15
>> 3fffb7bc0000-3fffb7c10000 rw-p 00000000 00:00 0
>> 3fffb7c10000-3fffb7c50000 rw-s 00000000 00:12 3926240 /run/.rte_config
>> 3fffb7c50000-3fffb7c70000 rw-p 00000000 00:00 0
>> 3fffb7c70000-3fffb7e20000 r-xp 00000000 08:32 7090531
>> /opt/at7.1/lib64/power8/libc-2.19.so
>> 3fffb7e20000-3fffb7e30000 rw-p 001a0000 08:32 7090531
>> /opt/at7.1/lib64/power8/libc-2.19.so
>> 3fffb7e30000-3fffb7e50000 rw-p 00000000 00:00 0
>> 3fffb7e50000-3fffb7e70000 r-xp 00000000 08:32 7090563
>> /opt/at7.1/lib64/power8/libpthread-2.19.so
>> 3fffb7e70000-3fffb7e80000 rw-p 00010000 08:32 7090563
>> /opt/at7.1/lib64/power8/libpthread-2.19.so
>> 3fffb7e80000-3fffb7e90000 r-xp 00000000 08:32 7090210
>> /opt/at7.1/lib64/libdl-2.19.so
>> 3fffb7e90000-3fffb7ea0000 rw-p 00000000 08:32 7090210
>> /opt/at7.1/lib64/libdl-2.19.so
>> 3fffb7ea0000-3fffb7ec0000 r-xp 00000000 08:32 7090533
>> /opt/at7.1/lib64/power8/libz.so.1.2.6
>> 3fffb7ec0000-3fffb7ed0000 rw-p 00010000 08:32 7090533
>> /opt/at7.1/lib64/power8/libz.so.1.2.6
>> 3fffb7ed0000-3fffb7f90000 r-xp 00000000 08:32 7090568
>> /opt/at7.1/lib64/power8/libm-2.19.so
>> 3fffb7f90000-3fffb7fa0000 rw-p 000b0000 08:32 7090568
>> /opt/at7.1/lib64/power8/libm-2.19.so
>> 3fffb7fa0000-3fffb7fc0000 r-xp 00000000 00:00 0 [vdso]
>> 3fffb7fc0000-3fffb7ff0000 r-xp 00000000 08:32 7090048
>> /opt/at7.1/lib64/ld-2.19.so
>> 3fffb7ff0000-3fffb8000000 rw-p 00020000 08:32 7090048
>> /opt/at7.1/lib64/ld-2.19.so
>> 3ffffffd0000-400000000000 rw-p 00000000 00:00 0 [stack]
>>
>>
>> -----Original Message-----
>> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
>> Sent: 2016年3月23日 1:11
>> To: Sergio Gonzalez Monroy <sergio.gonzalez.monroy@intel.com>
>> Cc: Gowrishankar <gowrishankar.m@linux.vnet.ibm.com>; dev@dpdk.org;
>> chaozhu@linux.vnet.ibm.com; David Marchand <david.marchand@6wind.com>
>> Subject: Re: [dpdk-dev] [PATCH] eal/ppc: fix secondary process to map
>> hugepages in correct order
>>
>> On Tue, Mar 22, 2016 at 04:35:32PM +0000, Sergio Gonzalez Monroy wrote:
>>> First of all, forgive my ignorance regarding ppc64 and if the
>>> questions are naive but after having a look to the already existing
>>> code for ppc64 and this patch now, why are we doing this reverse
>>> mapping at all?
>>>
>>> I guess the question revolves around the comment in eal_memory.c:
>>> 1316                 /* On PPC64 architecture, the mmap always start from
>>> higher
>>> 1317                  * virtual address to lower address. Here, both the
>>> physical
>>> 1318                  * address and virtual address are in descending
>> order
>>> */
>>>
>>>   From looking at the code, for ppc64 we do qsort in reverse order and
>>> thereafter everything looks to be done to account for that reverse
>>> sorting.
>>>
>>> CC: Chao Zhu and David Marchand as original author and reviewer of
>>> the
>> code.
>>> Sergio
>>>
>> Just to add my 2c here. At one point, with I believe some i686
>> installs - don't remember the specific OS/kernel, we found that the
>> mmap calls were returning the highest free address first and then
>> working downwards - much like what seems to be described here. To fix this
>> we changed the mmap code from assuming that addresses are mapped
>> upwards, to instead explicitly requesting a large free block of memory
>> (mmap of /dev/zero) to find a free address space range of the correct
>> size, and then explicitly mmapping each individual page to the
>> appropriate place in that free range. With this scheme it didn't
>> matter whether the OS tried to mmap the pages from the highest or
>> lowest address because we always told the OS where to put the page (and we knew the slot was free from the earlier block mmap).
>> Would this scheme not also work for PPC in a similar way? (Again,
>> forgive unfamiliarity with PPC! :-) )
>>
>> /Bruce
>>
>>> On 07/03/2016 14:13, Gowrishankar wrote:
>>>> From: Gowri Shankar <gowrishankar.m@linux.vnet.ibm.com>
>>>>
>>>> For a secondary process address space to map hugepages from every
>>>> segment of primary process, hugepage_file entries has to be mapped
>>>> reversely from the list that primary process updated for every
>>>> segment. This is for a reason that, in ppc64, hugepages are sorted
>>>> for
>> decrementing addresses.
>>>> Signed-off-by: Gowrishankar <gowrishankar.m@linux.vnet.ibm.com>
>>>> ---
>
  
Thomas Monjalon Feb. 15, 2017, 8:51 a.m. UTC | #10
There was no follow-up on this discussion.
Please, what is the conclusion?


2016-05-20 11:25, Sergio Gonzalez Monroy:
> On 20/05/2016 09:41, Chao Zhu wrote:
> > Sergio,
> >
> > The step 4 will not fail because each huge page will get a virtual address finally, though it's a different address. If you take a look at the function rte_eal_hugepage_init(), in the last loop, it uses both physical address and virtual address to determine a new memory segment. This step can make sure that the initialization is correct. What I want to say is, this bug also influences the secondary process in function rte_eal_hugepage_attach(). It can make the secondary process fail to init. I'm trying to figure out how to make it work.
> 
> You are right, I misread the code.
> 
> So basically because mmap ignores the hint to mmap on the requested address,
> by default we get VA maps in decreasing address order.
> 
> Knowing that, PPC orders pages by decreasing physical address order so when
> this happens we actually get hugepages in order in the "new" final_va.
> 
> Not sure if that makes sense but I think I understand where you are 
> coming from.
> 
> I think we need to document this as a known issue and/or add comments regarding
> this behavior, basically calling out that all this "reverse-ordering" 
> is required
> because mmap fails to map on the requested VA.
> 
> Thanks,
> Sergio
> 
> > -----Original Message-----
> > From: Sergio Gonzalez Monroy [mailto:sergio.gonzalez.monroy@intel.com]
> > Sent: 2016年5月20日 16:01
> > To: Chao Zhu <chaozhu@linux.vnet.ibm.com>; 'Bruce Richardson' <bruce.richardson@intel.com>
> > Cc: 'Gowrishankar' <gowrishankar.m@linux.vnet.ibm.com>; dev@dpdk.org; 'David Marchand' <david.marchand@6wind.com>
> > Subject: Re: [dpdk-dev] [PATCH] eal/ppc: fix secondary process to map hugepages in correct order
> >
> > On 20/05/2016 04:03, Chao Zhu wrote:
> >> Bruce,
> >>
> >> Recently, we find some bugs with mmap in PowerLinux. The mmap doesn't
> >> respect the address hints. In function get_virtual_area() in
> >> eal_memory.c, mmap get the free virtual address range as the address
> >> hint. However, when mapping the real memory in
> >> rte_eal_hugepage_init(), mmap doesn't return the same address as the
> >> requested address. When taking a look at the /proc/<pid>/maps, the
> >> requested address range is free for use. With this bug, pre-allocating some free space doesn't work.
> > Hi Chao,
> >
> > If I understand you correctly, the issue you are describing would cause DPDK to fail initialization even with the reverse reordering that you are doing for PPC.
> >
> > Basically (just showing relevant initialization steps):
> > 1. map_all_hugepages(..., orig=1)
> >       - map all hugepages
> > 2. find physical address for each hugepage
> > 3. sort by physical address
> > 4. map_all_hugepages(..., orig=0)
> >       - Now we try to get big chunk of virtual address for a block of contig hugepages
> >          so we know we have that virtual address chunk available.
> >       - Then we try to remap each page of that block of contig pages into that
> >          virtual address chunk.
> >
> > So the issue you are describing would make step 4 fail regardless of the different ordering that PPC does.
> > I'm probably missing something, would you care to elaborate?
> >
> > Sergio
> >
> >
> >> We're trying to create some test case and report it as a bug to kernel
> >> community.
> >>
> >> Here's some logs:
> >> ===============================
> >> EAL: Ask a virtual area of 0x10000000 bytes
> >> EAL: Virtual area found at 0x3fffa7000000 (size = 0x10000000)
> >> EAL: map_all_hugepages, /mnt/huge/rtemap_52,paddr 0x3ca6000000
> >> requested
> >> addr: 0x3fffa7000000  mmaped addr: 0x3efff0000000
> >> EAL: map_all_hugepages, /mnt/huge/rtemap_53,paddr 0x3ca5000000
> >> requested
> >> addr: 0x3fffa8000000  mmaped addr: 0x3effef000000
> >> EAL: map_all_hugepages, /mnt/huge/rtemap_54,paddr 0x3ca4000000
> >> requested
> >> addr: 0x3fffa9000000  mmaped addr: 0x3effee000000
> >> EAL: map_all_hugepages, /mnt/huge/rtemap_55,paddr 0x3ca3000000
> >> requested
> >> addr: 0x3fffaa000000  mmaped addr: 0x3effed000000
> >> EAL: map_all_hugepages, /mnt/huge/rtemap_56,paddr 0x3ca2000000
> >> requested
> >> addr: 0x3fffab000000  mmaped addr: 0x3effec000000
> >> EAL: map_all_hugepages, /mnt/huge/rtemap_57,paddr 0x3ca1000000
> >> requested
> >> addr: 0x3fffac000000  mmaped addr: 0x3effeb000000
> >> EAL: map_all_hugepages, /mnt/huge/rtemap_58,paddr 0x3ca0000000
> >> requested
> >> addr: 0x3fffad000000  mmaped addr: 0x3effea000000
> >> EAL: map_all_hugepages, /mnt/huge/rtemap_59,paddr 0x3c9f000000
> >> requested
> >> addr: 0x3fffae000000  mmaped addr: 0x3effe9000000
> >> EAL: map_all_hugepages, /mnt/huge/rtemap_60,paddr 0x3c9e000000
> >> requested
> >> addr: 0x3fffaf000000  mmaped addr: 0x3effe8000000
> >> EAL: map_all_hugepages, /mnt/huge/rtemap_61,paddr 0x3c9d000000
> >> requested
> >> addr: 0x3fffb0000000  mmaped addr: 0x3effe7000000
> >> EAL: map_all_hugepages, /mnt/huge/rtemap_62, paddr 0x3c9c000000
> >> requested
> >> addr:  0x3fffb1000000 mmaped addr:  0x3effe6000000
> >> EAL: map_all_hugepages, /mnt/huge/rtemap_63, paddr 0x3c9b000000
> >> requested
> >> addr:  0x3fffb2000000 mmaped addr:  0x3effe5000000
> >> EAL: map_all_hugepages, /mnt/huge/rtemap_51, paddr 0x3c9a000000
> >> requested
> >> addr:  0x3fffb3000000 mmaped addr:  0x3effe4000000
> >> EAL: map_all_hugepages, /mnt/huge/rtemap_50, paddr 0x3c99000000
> >> requested
> >> addr:  0x3fffb4000000 mmaped addr:  0x3effe3000000
> >> EAL: map_all_hugepages, /mnt/huge/rtemap_49, paddr 0x3c98000000
> >> requested
> >> addr:  0x3fffb5000000 mmaped addr:  0x3effe2000000
> >> EAL: map_all_hugepages, /mnt/huge/rtemap_48, paddr 0x3c97000000
> >> requested
> >> addr:  0x3fffb6000000 mmaped addr:  0x3effe1000000
> >>
> >> # cat /proc/143765/maps
> >> 01000000-02000000 rw-s 00000000 00:27 61162550
> >> /mnt/huge/rtemap_14
> >> 02000000-03000000 rw-s 00000000 00:27 61162536
> >> /mnt/huge/rtemap_0
> >> 03000000-04000000 rw-s 00000000 00:27 61162537
> >> /mnt/huge/rtemap_1
> >> 04000000-05000000 rw-s 00000000 00:27 61162538
> >> /mnt/huge/rtemap_2
> >> 05000000-06000000 rw-s 00000000 00:27 61162539
> >> /mnt/huge/rtemap_3
> >> 06000000-07000000 rw-s 00000000 00:27 61162540
> >> /mnt/huge/rtemap_4
> >> 07000000-08000000 rw-s 00000000 00:27 61162541
> >> /mnt/huge/rtemap_5
> >> 08000000-09000000 rw-s 00000000 00:27 61162542
> >> /mnt/huge/rtemap_6
> >> 09000000-0a000000 rw-s 00000000 00:27 61162543
> >> /mnt/huge/rtemap_7
> >> 0a000000-0b000000 rw-s 00000000 00:27 61162544
> >> /mnt/huge/rtemap_8
> >> 0b000000-0c000000 rw-s 00000000 00:27 61162545
> >> /mnt/huge/rtemap_9
> >> 0c000000-0d000000 rw-s 00000000 00:27 61162546
> >> /mnt/huge/rtemap_10
> >> 0d000000-0e000000 rw-s 00000000 00:27 61162547
> >> /mnt/huge/rtemap_11
> >> 0e000000-0f000000 rw-s 00000000 00:27 61162548
> >> /mnt/huge/rtemap_12
> >> 0f000000-10000000 rw-s 00000000 00:27 61162549
> >> /mnt/huge/rtemap_13
> >> 10000000-101f0000 r-xp 00000000 08:32 6040458
> >> /home/dpdk/build/app/test
> >> 101f0000-10220000 rw-p 001f0000 08:32 6040458
> >> /home/dpdk/build/app/test
> >> 10220000-15c20000 rw-p 00000000 00:00 0 [heap]
> >> 20000000-21000000 rw-s 00000000 00:27 61162566
> >> /mnt/huge/rtemap_30
> >> 21000000-22000000 rw-s 00000000 00:27 61162567
> >> /mnt/huge/rtemap_31
> >> 22000000-23000000 rw-s 00000000 00:27 61162568
> >> /mnt/huge/rtemap_32
> >> 23000000-24000000 rw-s 00000000 00:27 61162569
> >> /mnt/huge/rtemap_33
> >> 24000000-25000000 rw-s 00000000 00:27 61162570
> >> /mnt/huge/rtemap_34
> >> 25000000-26000000 rw-s 00000000 00:27 61162571
> >> /mnt/huge/rtemap_35
> >> 26000000-27000000 rw-s 00000000 00:27 61162572
> >> /mnt/huge/rtemap_36
> >> 27000000-28000000 rw-s 00000000 00:27 61162573
> >> /mnt/huge/rtemap_37
> >> 28000000-29000000 rw-s 00000000 00:27 61162574
> >> /mnt/huge/rtemap_38
> >> 29000000-2a000000 rw-s 00000000 00:27 61162575
> >> /mnt/huge/rtemap_39
> >> 2a000000-2b000000 rw-s 00000000 00:27 61162576
> >> /mnt/huge/rtemap_40
> >> 2b000000-2c000000 rw-s 00000000 00:27 61162577
> >> /mnt/huge/rtemap_41
> >> 2c000000-2d000000 rw-s 00000000 00:27 61162578
> >> /mnt/huge/rtemap_42
> >> 2d000000-2e000000 rw-s 00000000 00:27 61162579
> >> /mnt/huge/rtemap_43
> >> 2e000000-2f000000 rw-s 00000000 00:27 61162580
> >> /mnt/huge/rtemap_44
> >> 2f000000-30000000 rw-s 00000000 00:27 61162581
> >> /mnt/huge/rtemap_45
> >> 30000000-31000000 rw-s 00000000 00:27 61162582
> >> /mnt/huge/rtemap_46
> >> 31000000-32000000 rw-s 00000000 00:27 61162583
> >> /mnt/huge/rtemap_47
> >> 32000000-33000000 rw-s 00000000 00:27 61162584
> >> /mnt/huge/rtemap_48
> >> 33000000-34000000 rw-s 00000000 00:27 61162585
> >> /mnt/huge/rtemap_49
> >> 34000000-35000000 rw-s 00000000 00:27 61162586
> >> /mnt/huge/rtemap_50
> >> 35000000-36000000 rw-s 00000000 00:27 61162587
> >> /mnt/huge/rtemap_51
> >> 36000000-37000000 rw-s 00000000 00:27 61162588
> >> /mnt/huge/rtemap_52
> >> 37000000-38000000 rw-s 00000000 00:27 61162589
> >> /mnt/huge/rtemap_53
> >> 38000000-39000000 rw-s 00000000 00:27 61162590
> >> /mnt/huge/rtemap_54
> >> 39000000-3a000000 rw-s 00000000 00:27 61162591
> >> /mnt/huge/rtemap_55
> >> 3a000000-3b000000 rw-s 00000000 00:27 61162592
> >> /mnt/huge/rtemap_56
> >> 3b000000-3c000000 rw-s 00000000 00:27 61162593
> >> /mnt/huge/rtemap_57
> >> 3c000000-3d000000 rw-s 00000000 00:27 61162594
> >> /mnt/huge/rtemap_58
> >> 3d000000-3e000000 rw-s 00000000 00:27 61162595
> >> /mnt/huge/rtemap_59
> >> 3e000000-3f000000 rw-s 00000000 00:27 61162596
> >> /mnt/huge/rtemap_60
> >> 3f000000-40000000 rw-s 00000000 00:27 61162597
> >> /mnt/huge/rtemap_61
> >> 40000000-41000000 rw-s 00000000 00:27 61162598
> >> /mnt/huge/rtemap_62
> >> 41000000-42000000 rw-s 00000000 00:27 61162599
> >> /mnt/huge/rtemap_63
> >> 3effb1000000-3effb2000000 rw-s 00000000 00:27 61162541
> >> /mnt/huge/rtemap_5
> >> 3effb2000000-3effb3000000 rw-s 00000000 00:27 61162540
> >> /mnt/huge/rtemap_4
> >> 3effb3000000-3effb4000000 rw-s 00000000 00:27 61162551
> >> /mnt/huge/rtemap_15
> >> 3effb4000000-3effb5000000 rw-s 00000000 00:27 61162538
> >> /mnt/huge/rtemap_2
> >> 3effb5000000-3effb6000000 rw-s 00000000 00:27 61162549
> >> /mnt/huge/rtemap_13
> >> 3effb6000000-3effb7000000 rw-s 00000000 00:27 61162544
> >> /mnt/huge/rtemap_8
> >> 3effb7000000-3effb8000000 rw-s 00000000 00:27 61162543
> >> /mnt/huge/rtemap_7
> >> 3effb8000000-3effb9000000 rw-s 00000000 00:27 61162548
> >> /mnt/huge/rtemap_12
> >> 3effb9000000-3effba000000 rw-s 00000000 00:27 61162537
> >> /mnt/huge/rtemap_1
> >> 3effba000000-3effbb000000 rw-s 00000000 00:27 61162550
> >> /mnt/huge/rtemap_14
> >> 3effbb000000-3effbc000000 rw-s 00000000 00:27 61162545
> >> /mnt/huge/rtemap_9
> >> 3effbc000000-3effbd000000 rw-s 00000000 00:27 61162546
> >> /mnt/huge/rtemap_10
> >> 3effbd000000-3effbe000000 rw-s 00000000 00:27 61162547
> >> /mnt/huge/rtemap_11
> >> 3effbe000000-3effbf000000 rw-s 00000000 00:27 61162539
> >> /mnt/huge/rtemap_3
> >> 3effbf000000-3effc0000000 rw-s 00000000 00:27 61162542
> >> /mnt/huge/rtemap_6
> >> 3effc0000000-3effc1000000 rw-s 00000000 00:27 61162536
> >> /mnt/huge/rtemap_0
> >> 3effc1000000-3effc2000000 rw-s 00000000 00:27 61162556
> >> /mnt/huge/rtemap_20
> >> 3effc2000000-3effc3000000 rw-s 00000000 00:27 61162552
> >> /mnt/huge/rtemap_16
> >> 3effc3000000-3effc4000000 rw-s 00000000 00:27 61162553
> >> /mnt/huge/rtemap_17
> >> 3effc4000000-3effc5000000 rw-s 00000000 00:27 61162554
> >> /mnt/huge/rtemap_18
> >> 3effc5000000-3effc6000000 rw-s 00000000 00:27 61162555
> >> /mnt/huge/rtemap_19
> >> 3effc6000000-3effc7000000 rw-s 00000000 00:27 61162567
> >> /mnt/huge/rtemap_31
> >> 3effc7000000-3effc8000000 rw-s 00000000 00:27 61162566
> >> /mnt/huge/rtemap_30
> >> 3effc8000000-3effc9000000 rw-s 00000000 00:27 61162558
> >> /mnt/huge/rtemap_22
> >> 3effc9000000-3effca000000 rw-s 00000000 00:27 61162557
> >> /mnt/huge/rtemap_21
> >> 3effca000000-3effcb000000 rw-s 00000000 00:27 61162560
> >> /mnt/huge/rtemap_24
> >> 3effcb000000-3effcc000000 rw-s 00000000 00:27 61162561
> >> /mnt/huge/rtemap_25
> >> 3effcc000000-3effcd000000 rw-s 00000000 00:27 61162564
> >> /mnt/huge/rtemap_28
> >> 3effcd000000-3effce000000 rw-s 00000000 00:27 61162559
> >> /mnt/huge/rtemap_23
> >> 3effce000000-3effcf000000 rw-s 00000000 00:27 61162563
> >> /mnt/huge/rtemap_27
> >> 3effcf000000-3effd0000000 rw-s 00000000 00:27 61162562
> >> /mnt/huge/rtemap_26
> >> 3effd0000000-3effd1000000 rw-s 00000000 00:27 61162565
> >> /mnt/huge/rtemap_29
> >> 3effd1000000-3effd2000000 rw-s 00000000 00:27 61162572
> >> /mnt/huge/rtemap_36
> >> 3effd2000000-3effd3000000 rw-s 00000000 00:27 61162568
> >> /mnt/huge/rtemap_32
> >> 3effd3000000-3effd4000000 rw-s 00000000 00:27 61162569
> >> /mnt/huge/rtemap_33
> >> 3effd4000000-3effd5000000 rw-s 00000000 00:27 61162570
> >> /mnt/huge/rtemap_34
> >> 3effd5000000-3effd6000000 rw-s 00000000 00:27 61162571
> >> /mnt/huge/rtemap_35
> >> 3effd6000000-3effd7000000 rw-s 00000000 00:27 61162583
> >> /mnt/huge/rtemap_47
> >> 3effd7000000-3effd8000000 rw-s 00000000 00:27 61162582
> >> /mnt/huge/rtemap_46
> >> 3effd8000000-3effd9000000 rw-s 00000000 00:27 61162581
> >> /mnt/huge/rtemap_45
> >> 3effd9000000-3effda000000 rw-s 00000000 00:27 61162580
> >> /mnt/huge/rtemap_44
> >> 3effda000000-3effdb000000 rw-s 00000000 00:27 61162579
> >> /mnt/huge/rtemap_43
> >> 3effdb000000-3effdc000000 rw-s 00000000 00:27 61162578
> >> /mnt/huge/rtemap_42
> >> 3effdc000000-3effdd000000 rw-s 00000000 00:27 61162577
> >> /mnt/huge/rtemap_41
> >> 3effdd000000-3effde000000 rw-s 00000000 00:27 61162574
> >> /mnt/huge/rtemap_38
> >> 3effde000000-3effdf000000 rw-s 00000000 00:27 61162573
> >> /mnt/huge/rtemap_37
> >> 3effdf000000-3effe0000000 rw-s 00000000 00:27 61162575
> >> /mnt/huge/rtemap_39
> >> 3effe0000000-3effe1000000 rw-s 00000000 00:27 61162576
> >> /mnt/huge/rtemap_40
> >> 3effe1000000-3effe2000000 rw-s 00000000 00:27 61162584
> >> /mnt/huge/rtemap_48
> >> 3effe2000000-3effe3000000 rw-s 00000000 00:27 61162585
> >> /mnt/huge/rtemap_49
> >> 3effe3000000-3effe4000000 rw-s 00000000 00:27 61162586
> >> /mnt/huge/rtemap_50
> >> 3effe4000000-3effe5000000 rw-s 00000000 00:27 61162587
> >> /mnt/huge/rtemap_51
> >> 3effe5000000-3effe6000000 rw-s 00000000 00:27 61162599
> >> /mnt/huge/rtemap_63
> >> 3effe6000000-3effe7000000 rw-s 00000000 00:27 61162598
> >> /mnt/huge/rtemap_62
> >> 3effe7000000-3effe8000000 rw-s 00000000 00:27 61162597
> >> /mnt/huge/rtemap_61
> >> 3effe8000000-3effe9000000 rw-s 00000000 00:27 61162596
> >> /mnt/huge/rtemap_60
> >> 3effe9000000-3effea000000 rw-s 00000000 00:27 61162595
> >> /mnt/huge/rtemap_59
> >> 3effea000000-3effeb000000 rw-s 00000000 00:27 61162594
> >> /mnt/huge/rtemap_58
> >> 3effeb000000-3effec000000 rw-s 00000000 00:27 61162593
> >> /mnt/huge/rtemap_57
> >> 3effec000000-3effed000000 rw-s 00000000 00:27 61162592
> >> /mnt/huge/rtemap_56
> >> 3effed000000-3effee000000 rw-s 00000000 00:27 61162591
> >> /mnt/huge/rtemap_55
> >> 3effee000000-3effef000000 rw-s 00000000 00:27 61162590
> >> /mnt/huge/rtemap_54
> >> 3effef000000-3efff0000000 rw-s 00000000 00:27 61162589
> >> /mnt/huge/rtemap_53
> >> 3efff0000000-3efff1000000 rw-s 00000000 00:27 61162588
> >> /mnt/huge/rtemap_52
> >> 3efff1000000-3efff2000000 rw-s 00000000 00:27 61162565
> >> /mnt/huge/rtemap_29
> >> 3efff2000000-3efff3000000 rw-s 00000000 00:27 61162564
> >> /mnt/huge/rtemap_28
> >> 3efff3000000-3efff4000000 rw-s 00000000 00:27 61162563
> >> /mnt/huge/rtemap_27
> >> 3efff4000000-3efff5000000 rw-s 00000000 00:27 61162562
> >> /mnt/huge/rtemap_26
> >> 3efff5000000-3efff6000000 rw-s 00000000 00:27 61162561
> >> /mnt/huge/rtemap_25
> >> 3efff6000000-3efff7000000 rw-s 00000000 00:27 61162560
> >> /mnt/huge/rtemap_24
> >> 3efff7000000-3efff8000000 rw-s 00000000 00:27 61162559
> >> /mnt/huge/rtemap_23
> >> 3efff8000000-3efff9000000 rw-s 00000000 00:27 61162558
> >> /mnt/huge/rtemap_22
> >> 3efff9000000-3efffa000000 rw-s 00000000 00:27 61162557
> >> /mnt/huge/rtemap_21
> >> 3efffa000000-3efffb000000 rw-s 00000000 00:27 61162556
> >> /mnt/huge/rtemap_20
> >> 3efffb000000-3efffc000000 rw-s 00000000 00:27 61162555
> >> /mnt/huge/rtemap_19
> >> 3efffc000000-3efffd000000 rw-s 00000000 00:27 61162554
> >> /mnt/huge/rtemap_18
> >> 3efffd000000-3efffe000000 rw-s 00000000 00:27 61162553
> >> /mnt/huge/rtemap_17
> >> 3efffe000000-3effff000000 rw-s 00000000 00:27 61162552
> >> /mnt/huge/rtemap_16
> >> 3effff000000-3f0000000000 rw-s 00000000 00:27 61162551
> >> /mnt/huge/rtemap_15
> >> 3fffb7bc0000-3fffb7c10000 rw-p 00000000 00:00 0
> >> 3fffb7c10000-3fffb7c50000 rw-s 00000000 00:12 3926240 /run/.rte_config
> >> 3fffb7c50000-3fffb7c70000 rw-p 00000000 00:00 0
> >> 3fffb7c70000-3fffb7e20000 r-xp 00000000 08:32 7090531
> >> /opt/at7.1/lib64/power8/libc-2.19.so
> >> 3fffb7e20000-3fffb7e30000 rw-p 001a0000 08:32 7090531
> >> /opt/at7.1/lib64/power8/libc-2.19.so
> >> 3fffb7e30000-3fffb7e50000 rw-p 00000000 00:00 0
> >> 3fffb7e50000-3fffb7e70000 r-xp 00000000 08:32 7090563
> >> /opt/at7.1/lib64/power8/libpthread-2.19.so
> >> 3fffb7e70000-3fffb7e80000 rw-p 00010000 08:32 7090563
> >> /opt/at7.1/lib64/power8/libpthread-2.19.so
> >> 3fffb7e80000-3fffb7e90000 r-xp 00000000 08:32 7090210
> >> /opt/at7.1/lib64/libdl-2.19.so
> >> 3fffb7e90000-3fffb7ea0000 rw-p 00000000 08:32 7090210
> >> /opt/at7.1/lib64/libdl-2.19.so
> >> 3fffb7ea0000-3fffb7ec0000 r-xp 00000000 08:32 7090533
> >> /opt/at7.1/lib64/power8/libz.so.1.2.6
> >> 3fffb7ec0000-3fffb7ed0000 rw-p 00010000 08:32 7090533
> >> /opt/at7.1/lib64/power8/libz.so.1.2.6
> >> 3fffb7ed0000-3fffb7f90000 r-xp 00000000 08:32 7090568
> >> /opt/at7.1/lib64/power8/libm-2.19.so
> >> 3fffb7f90000-3fffb7fa0000 rw-p 000b0000 08:32 7090568
> >> /opt/at7.1/lib64/power8/libm-2.19.so
> >> 3fffb7fa0000-3fffb7fc0000 r-xp 00000000 00:00 0 [vdso]
> >> 3fffb7fc0000-3fffb7ff0000 r-xp 00000000 08:32 7090048
> >> /opt/at7.1/lib64/ld-2.19.so
> >> 3fffb7ff0000-3fffb8000000 rw-p 00020000 08:32 7090048
> >> /opt/at7.1/lib64/ld-2.19.so
> >> 3ffffffd0000-400000000000 rw-p 00000000 00:00 0 [stack]
> >>
> >>
> >> -----Original Message-----
> >> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> >> Sent: 2016年3月23日 1:11
> >> To: Sergio Gonzalez Monroy <sergio.gonzalez.monroy@intel.com>
> >> Cc: Gowrishankar <gowrishankar.m@linux.vnet.ibm.com>; dev@dpdk.org;
> >> chaozhu@linux.vnet.ibm.com; David Marchand <david.marchand@6wind.com>
> >> Subject: Re: [dpdk-dev] [PATCH] eal/ppc: fix secondary process to map
> >> hugepages in correct order
> >>
> >> On Tue, Mar 22, 2016 at 04:35:32PM +0000, Sergio Gonzalez Monroy wrote:
> >>> First of all, forgive my ignorance regarding ppc64 and if the
> >>> questions are naive but after having a look to the already existing
> >>> code for ppc64 and this patch now, why are we doing this reverse
> >>> mapping at all?
> >>>
> >>> I guess the question revolves around the comment in eal_memory.c:
> >>> 1316                 /* On PPC64 architecture, the mmap always start from
> >>> higher
> >>> 1317                  * virtual address to lower address. Here, both the
> >>> physical
> >>> 1318                  * address and virtual address are in descending
> >> order
> >>> */
> >>>
> >>>   From looking at the code, for ppc64 we do qsort in reverse order and
> >>> thereafter everything looks to be done to account for that reverse
> >>> sorting.
> >>>
> >>> CC: Chao Zhu and David Marchand as original author and reviewer of
> >>> the
> >> code.
> >>> Sergio
> >>>
> >> Just to add my 2c here. At one point, with I believe some i686
> >> installs - don't remember the specific OS/kernel, we found that the
> >> mmap calls were returning the highest free address first and then
> >> working downwards - much like what seems to be described here. To fix this
> >> we changed the mmap code from assuming that addresses are mapped
> >> upwards, to instead explicitly requesting a large free block of memory
> >> (mmap of /dev/zero) to find a free address space range of the correct
> >> size, and then explicitly mmapping each individual page to the
> >> appropriate place in that free range. With this scheme it didn't
> >> matter whether the OS tried to mmap the pages from the highest or
> >> lowest address because we always told the OS where to put the page (and we knew the slot was free from the earlier block mmap).
> >> Would this scheme not also work for PPC in a similar way? (Again,
> >> forgive unfamiliarity with PPC! :-) )
> >>
> >> /Bruce
> >>
> >>> On 07/03/2016 14:13, Gowrishankar wrote:
> >>>> From: Gowri Shankar <gowrishankar.m@linux.vnet.ibm.com>
> >>>>
> >>>> For a secondary process address space to map hugepages from every
> >>>> segment of primary process, hugepage_file entries have to be mapped
> >>>> in reverse order from the list that the primary process updated for
> >>>> every segment. The reason is that, in ppc64, hugepages are sorted by
> >>>> decrementing addresses.
> >>>> Signed-off-by: Gowrishankar <gowrishankar.m@linux.vnet.ibm.com>
> >>>> ---
> >
>
  
Chao Zhu Feb. 16, 2017, 7:22 a.m. UTC | #11
Thomas,

We have several different internal fixes and haven't reached a conclusion yet. Let me summarize them and provide a final patch set.
Thanks for your reminder!

-----Original Message-----
From: Thomas Monjalon [mailto:thomas.monjalon@6wind.com] 
Sent: 2017年2月15日 16:52
To: Sergio Gonzalez Monroy <sergio.gonzalez.monroy@intel.com>; Chao Zhu <chaozhu@linux.vnet.ibm.com>; 'Gowrishankar' <gowrishankar.m@linux.vnet.ibm.com>
Cc: dev@dpdk.org; 'Bruce Richardson' <bruce.richardson@intel.com>; 'David Marchand' <david.marchand@6wind.com>
Subject: Re: [dpdk-dev] [PATCH] eal/ppc: fix secondary process to map hugepages in correct order

There was no follow-up on this discussion.
Please, what is the conclusion?


2016-05-20 11:25, Sergio Gonzalez Monroy:
> On 20/05/2016 09:41, Chao Zhu wrote:
> > Sergio,
> >
> > Step 4 will not fail because each huge page will eventually get a virtual address, though it's a different address. If you take a look at the function rte_eal_hugepage_init(), in the last loop, it uses both the physical address and the virtual address to determine a new memory segment. This step makes sure that the initialization is correct. What I want to say is that this bug also affects the secondary process in function rte_eal_hugepage_attach(). It can make the secondary process fail to init. I'm trying to figure out how to make it work.
> 
> You are right, I misread the code.
> 
> So basically because mmap ignores the hint to mmap on the requested 
> address, by default we get VA maps in decreasing address order.
> 
> Knowing that, PPC orders pages by decreasing physical address order so 
> when this happens we actually get hugepages in order in the "new" final_va.
> 
> Not sure if that makes sense but I think I understand where you are 
> coming from.
> 
> I think we need to document this as a known issue and/or add comments
> regarding this behavior, basically calling out that all this "reverse-ordering"
> is required
> because mmap fails to map on the requested VA.
> 
> Thanks,
> Sergio
> 
> > -----Original Message-----
> > From: Sergio Gonzalez Monroy 
> > [mailto:sergio.gonzalez.monroy@intel.com]
> > Sent: 2016年5月20日 16:01
> > To: Chao Zhu <chaozhu@linux.vnet.ibm.com>; 'Bruce Richardson' 
> > <bruce.richardson@intel.com>
> > Cc: 'Gowrishankar' <gowrishankar.m@linux.vnet.ibm.com>; 
> > dev@dpdk.org; 'David Marchand' <david.marchand@6wind.com>
> > Subject: Re: [dpdk-dev] [PATCH] eal/ppc: fix secondary process to 
> > map hugepages in correct order
> >
> > On 20/05/2016 04:03, Chao Zhu wrote:
> >> Bruce,
> >>
> >> Recently, we found some bugs with mmap in PowerLinux. The mmap
> >> doesn't respect the address hints. In function get_virtual_area()
> >> in eal_memory.c, mmap gets the free virtual address range as the
> >> address hint. However, when mapping the real memory in
> >> rte_eal_hugepage_init(), mmap doesn't return the same address as
> >> the requested address. When taking a look at the /proc/<pid>/maps,
> >> the requested address range is free for use. With this bug, pre-allocating some free space doesn't work.
> > Hi Chao,
> >
> > If I understand you correctly, the issue you are describing would cause DPDK to fail initialization even with the reverse reordering that you are doing for PPC.
> >
> > Basically (just showing relevant initialization steps):
> > 1. map_all_hugepages(..., orig=1)
> >       - map all hugepages
> > 2. find physical address for each hugepage
> > 3. sort by physical address
> > 4. map_all_hugepages(..., orig=0)
> >       - Now we try to get big chunk of virtual address for a block of contig hugepages
> >          so we know we have that virtual address chunk available.
> >       - Then we try to remap each page of that block of contig pages into that
> >          virtual address chunk.
> >
> > So the issue you are describing would make step 4 fail regardless of the different ordering that PPC does.
> > I'm probably missing something, would you care to elaborate?
> >
> > Sergio
> >
> >
> >> We're trying to create some test case and report it as a bug to 
> >> kernel community.
> >>
> >> Here's some logs:
> >> ===============================
> >> EAL: Ask a virtual area of 0x10000000 bytes
> >> EAL: Virtual area found at 0x3fffa7000000 (size = 0x10000000)
> >> EAL: map_all_hugepages, /mnt/huge/rtemap_52,paddr 0x3ca6000000 
> >> requested
> >> addr: 0x3fffa7000000  mmaped addr: 0x3efff0000000
> >> EAL: map_all_hugepages, /mnt/huge/rtemap_53,paddr 0x3ca5000000 
> >> requested
> >> addr: 0x3fffa8000000  mmaped addr: 0x3effef000000
> >> EAL: map_all_hugepages, /mnt/huge/rtemap_54,paddr 0x3ca4000000 
> >> requested
> >> addr: 0x3fffa9000000  mmaped addr: 0x3effee000000
> >> EAL: map_all_hugepages, /mnt/huge/rtemap_55,paddr 0x3ca3000000 
> >> requested
> >> addr: 0x3fffaa000000  mmaped addr: 0x3effed000000
> >> EAL: map_all_hugepages, /mnt/huge/rtemap_56,paddr 0x3ca2000000 
> >> requested
> >> addr: 0x3fffab000000  mmaped addr: 0x3effec000000
> >> EAL: map_all_hugepages, /mnt/huge/rtemap_57,paddr 0x3ca1000000 
> >> requested
> >> addr: 0x3fffac000000  mmaped addr: 0x3effeb000000
> >> EAL: map_all_hugepages, /mnt/huge/rtemap_58,paddr 0x3ca0000000 
> >> requested
> >> addr: 0x3fffad000000  mmaped addr: 0x3effea000000
> >> EAL: map_all_hugepages, /mnt/huge/rtemap_59,paddr 0x3c9f000000 
> >> requested
> >> addr: 0x3fffae000000  mmaped addr: 0x3effe9000000
> >> EAL: map_all_hugepages, /mnt/huge/rtemap_60,paddr 0x3c9e000000 
> >> requested
> >> addr: 0x3fffaf000000  mmaped addr: 0x3effe8000000
> >> EAL: map_all_hugepages, /mnt/huge/rtemap_61,paddr 0x3c9d000000 
> >> requested
> >> addr: 0x3fffb0000000  mmaped addr: 0x3effe7000000
> >> EAL: map_all_hugepages, /mnt/huge/rtemap_62, paddr 0x3c9c000000 
> >> requested
> >> addr:  0x3fffb1000000 mmaped addr:  0x3effe6000000
> >> EAL: map_all_hugepages, /mnt/huge/rtemap_63, paddr 0x3c9b000000 
> >> requested
> >> addr:  0x3fffb2000000 mmaped addr:  0x3effe5000000
> >> EAL: map_all_hugepages, /mnt/huge/rtemap_51, paddr 0x3c9a000000 
> >> requested
> >> addr:  0x3fffb3000000 mmaped addr:  0x3effe4000000
> >> EAL: map_all_hugepages, /mnt/huge/rtemap_50, paddr 0x3c99000000 
> >> requested
> >> addr:  0x3fffb4000000 mmaped addr:  0x3effe3000000
> >> EAL: map_all_hugepages, /mnt/huge/rtemap_49, paddr 0x3c98000000 
> >> requested
> >> addr:  0x3fffb5000000 mmaped addr:  0x3effe2000000
> >> EAL: map_all_hugepages, /mnt/huge/rtemap_48, paddr 0x3c97000000 
> >> requested
> >> addr:  0x3fffb6000000 mmaped addr:  0x3effe1000000
> >>
> >> # cat /proc/143765/maps
> >> 01000000-02000000 rw-s 00000000 00:27 61162550
> >> /mnt/huge/rtemap_14
> >> 02000000-03000000 rw-s 00000000 00:27 61162536
> >> /mnt/huge/rtemap_0
> >> 03000000-04000000 rw-s 00000000 00:27 61162537
> >> /mnt/huge/rtemap_1
> >> 04000000-05000000 rw-s 00000000 00:27 61162538
> >> /mnt/huge/rtemap_2
> >> 05000000-06000000 rw-s 00000000 00:27 61162539
> >> /mnt/huge/rtemap_3
> >> 06000000-07000000 rw-s 00000000 00:27 61162540
> >> /mnt/huge/rtemap_4
> >> 07000000-08000000 rw-s 00000000 00:27 61162541
> >> /mnt/huge/rtemap_5
> >> 08000000-09000000 rw-s 00000000 00:27 61162542
> >> /mnt/huge/rtemap_6
> >> 09000000-0a000000 rw-s 00000000 00:27 61162543
> >> /mnt/huge/rtemap_7
> >> 0a000000-0b000000 rw-s 00000000 00:27 61162544
> >> /mnt/huge/rtemap_8
> >> 0b000000-0c000000 rw-s 00000000 00:27 61162545
> >> /mnt/huge/rtemap_9
> >> 0c000000-0d000000 rw-s 00000000 00:27 61162546
> >> /mnt/huge/rtemap_10
> >> 0d000000-0e000000 rw-s 00000000 00:27 61162547
> >> /mnt/huge/rtemap_11
> >> 0e000000-0f000000 rw-s 00000000 00:27 61162548
> >> /mnt/huge/rtemap_12
> >> 0f000000-10000000 rw-s 00000000 00:27 61162549
> >> /mnt/huge/rtemap_13
> >> 10000000-101f0000 r-xp 00000000 08:32 6040458 
> >> /home/dpdk/build/app/test
> >> 101f0000-10220000 rw-p 001f0000 08:32 6040458 
> >> /home/dpdk/build/app/test
> >> 10220000-15c20000 rw-p 00000000 00:00 0 [heap]
> >> 20000000-21000000 rw-s 00000000 00:27 61162566
> >> /mnt/huge/rtemap_30
> >> 21000000-22000000 rw-s 00000000 00:27 61162567
> >> /mnt/huge/rtemap_31
> >> 22000000-23000000 rw-s 00000000 00:27 61162568
> >> /mnt/huge/rtemap_32
> >> 23000000-24000000 rw-s 00000000 00:27 61162569
> >> /mnt/huge/rtemap_33
> >> 24000000-25000000 rw-s 00000000 00:27 61162570
> >> /mnt/huge/rtemap_34
> >> 25000000-26000000 rw-s 00000000 00:27 61162571
> >> /mnt/huge/rtemap_35
> >> 26000000-27000000 rw-s 00000000 00:27 61162572
> >> /mnt/huge/rtemap_36
> >> 27000000-28000000 rw-s 00000000 00:27 61162573
> >> /mnt/huge/rtemap_37
> >> 28000000-29000000 rw-s 00000000 00:27 61162574
> >> /mnt/huge/rtemap_38
> >> 29000000-2a000000 rw-s 00000000 00:27 61162575
> >> /mnt/huge/rtemap_39
> >> 2a000000-2b000000 rw-s 00000000 00:27 61162576
> >> /mnt/huge/rtemap_40
> >> 2b000000-2c000000 rw-s 00000000 00:27 61162577
> >> /mnt/huge/rtemap_41
> >> 2c000000-2d000000 rw-s 00000000 00:27 61162578
> >> /mnt/huge/rtemap_42
> >> 2d000000-2e000000 rw-s 00000000 00:27 61162579
> >> /mnt/huge/rtemap_43
> >> 2e000000-2f000000 rw-s 00000000 00:27 61162580
> >> /mnt/huge/rtemap_44
> >> 2f000000-30000000 rw-s 00000000 00:27 61162581
> >> /mnt/huge/rtemap_45
> >> 30000000-31000000 rw-s 00000000 00:27 61162582
> >> /mnt/huge/rtemap_46
> >> 31000000-32000000 rw-s 00000000 00:27 61162583
> >> /mnt/huge/rtemap_47
> >> 32000000-33000000 rw-s 00000000 00:27 61162584
> >> /mnt/huge/rtemap_48
> >> 33000000-34000000 rw-s 00000000 00:27 61162585
> >> /mnt/huge/rtemap_49
> >> 34000000-35000000 rw-s 00000000 00:27 61162586
> >> /mnt/huge/rtemap_50
> >> 35000000-36000000 rw-s 00000000 00:27 61162587
> >> /mnt/huge/rtemap_51
> >> 36000000-37000000 rw-s 00000000 00:27 61162588
> >> /mnt/huge/rtemap_52
> >> 37000000-38000000 rw-s 00000000 00:27 61162589
> >> /mnt/huge/rtemap_53
> >> 38000000-39000000 rw-s 00000000 00:27 61162590
> >> /mnt/huge/rtemap_54
> >> 39000000-3a000000 rw-s 00000000 00:27 61162591
> >> /mnt/huge/rtemap_55
> >> 3a000000-3b000000 rw-s 00000000 00:27 61162592
> >> /mnt/huge/rtemap_56
> >> 3b000000-3c000000 rw-s 00000000 00:27 61162593
> >> /mnt/huge/rtemap_57
> >> 3c000000-3d000000 rw-s 00000000 00:27 61162594
> >> /mnt/huge/rtemap_58
> >> 3d000000-3e000000 rw-s 00000000 00:27 61162595
> >> /mnt/huge/rtemap_59
> >> 3e000000-3f000000 rw-s 00000000 00:27 61162596
> >> /mnt/huge/rtemap_60
> >> 3f000000-40000000 rw-s 00000000 00:27 61162597
> >> /mnt/huge/rtemap_61
> >> 40000000-41000000 rw-s 00000000 00:27 61162598
> >> /mnt/huge/rtemap_62
> >> 41000000-42000000 rw-s 00000000 00:27 61162599
> >> /mnt/huge/rtemap_63
> >> 3effb1000000-3effb2000000 rw-s 00000000 00:27 61162541
> >> /mnt/huge/rtemap_5
> >> 3effb2000000-3effb3000000 rw-s 00000000 00:27 61162540
> >> /mnt/huge/rtemap_4
> >> 3effb3000000-3effb4000000 rw-s 00000000 00:27 61162551
> >> /mnt/huge/rtemap_15
> >> 3effb4000000-3effb5000000 rw-s 00000000 00:27 61162538
> >> /mnt/huge/rtemap_2
> >> 3effb5000000-3effb6000000 rw-s 00000000 00:27 61162549
> >> /mnt/huge/rtemap_13
> >> 3effb6000000-3effb7000000 rw-s 00000000 00:27 61162544
> >> /mnt/huge/rtemap_8
> >> 3effb7000000-3effb8000000 rw-s 00000000 00:27 61162543
> >> /mnt/huge/rtemap_7
> >> 3effb8000000-3effb9000000 rw-s 00000000 00:27 61162548
> >> /mnt/huge/rtemap_12
> >> 3effb9000000-3effba000000 rw-s 00000000 00:27 61162537
> >> /mnt/huge/rtemap_1
> >> 3effba000000-3effbb000000 rw-s 00000000 00:27 61162550
> >> /mnt/huge/rtemap_14
> >> 3effbb000000-3effbc000000 rw-s 00000000 00:27 61162545
> >> /mnt/huge/rtemap_9
> >> 3effbc000000-3effbd000000 rw-s 00000000 00:27 61162546
> >> /mnt/huge/rtemap_10
> >> 3effbd000000-3effbe000000 rw-s 00000000 00:27 61162547
> >> /mnt/huge/rtemap_11
> >> 3effbe000000-3effbf000000 rw-s 00000000 00:27 61162539
> >> /mnt/huge/rtemap_3
> >> 3effbf000000-3effc0000000 rw-s 00000000 00:27 61162542
> >> /mnt/huge/rtemap_6
> >> 3effc0000000-3effc1000000 rw-s 00000000 00:27 61162536
> >> /mnt/huge/rtemap_0
> >> 3effc1000000-3effc2000000 rw-s 00000000 00:27 61162556
> >> /mnt/huge/rtemap_20
> >> 3effc2000000-3effc3000000 rw-s 00000000 00:27 61162552
> >> /mnt/huge/rtemap_16
> >> 3effc3000000-3effc4000000 rw-s 00000000 00:27 61162553
> >> /mnt/huge/rtemap_17
> >> 3effc4000000-3effc5000000 rw-s 00000000 00:27 61162554
> >> /mnt/huge/rtemap_18
> >> 3effc5000000-3effc6000000 rw-s 00000000 00:27 61162555
> >> /mnt/huge/rtemap_19
> >> 3effc6000000-3effc7000000 rw-s 00000000 00:27 61162567
> >> /mnt/huge/rtemap_31
> >> 3effc7000000-3effc8000000 rw-s 00000000 00:27 61162566
> >> /mnt/huge/rtemap_30
> >> 3effc8000000-3effc9000000 rw-s 00000000 00:27 61162558
> >> /mnt/huge/rtemap_22
> >> 3effc9000000-3effca000000 rw-s 00000000 00:27 61162557
> >> /mnt/huge/rtemap_21
> >> 3effca000000-3effcb000000 rw-s 00000000 00:27 61162560
> >> /mnt/huge/rtemap_24
> >> 3effcb000000-3effcc000000 rw-s 00000000 00:27 61162561
> >> /mnt/huge/rtemap_25
> >> 3effcc000000-3effcd000000 rw-s 00000000 00:27 61162564
> >> /mnt/huge/rtemap_28
> >> 3effcd000000-3effce000000 rw-s 00000000 00:27 61162559
> >> /mnt/huge/rtemap_23
> >> 3effce000000-3effcf000000 rw-s 00000000 00:27 61162563
> >> /mnt/huge/rtemap_27
> >> 3effcf000000-3effd0000000 rw-s 00000000 00:27 61162562
> >> /mnt/huge/rtemap_26
> >> 3effd0000000-3effd1000000 rw-s 00000000 00:27 61162565
> >> /mnt/huge/rtemap_29
> >> 3effd1000000-3effd2000000 rw-s 00000000 00:27 61162572
> >> /mnt/huge/rtemap_36
> >> 3effd2000000-3effd3000000 rw-s 00000000 00:27 61162568
> >> /mnt/huge/rtemap_32
> >> 3effd3000000-3effd4000000 rw-s 00000000 00:27 61162569
> >> /mnt/huge/rtemap_33
> >> 3effd4000000-3effd5000000 rw-s 00000000 00:27 61162570
> >> /mnt/huge/rtemap_34
> >> 3effd5000000-3effd6000000 rw-s 00000000 00:27 61162571
> >> /mnt/huge/rtemap_35
> >> 3effd6000000-3effd7000000 rw-s 00000000 00:27 61162583
> >> /mnt/huge/rtemap_47
> >> 3effd7000000-3effd8000000 rw-s 00000000 00:27 61162582
> >> /mnt/huge/rtemap_46
> >> 3effd8000000-3effd9000000 rw-s 00000000 00:27 61162581
> >> /mnt/huge/rtemap_45
> >> 3effd9000000-3effda000000 rw-s 00000000 00:27 61162580
> >> /mnt/huge/rtemap_44
> >> 3effda000000-3effdb000000 rw-s 00000000 00:27 61162579
> >> /mnt/huge/rtemap_43
> >> 3effdb000000-3effdc000000 rw-s 00000000 00:27 61162578
> >> /mnt/huge/rtemap_42
> >> 3effdc000000-3effdd000000 rw-s 00000000 00:27 61162577
> >> /mnt/huge/rtemap_41
> >> 3effdd000000-3effde000000 rw-s 00000000 00:27 61162574
> >> /mnt/huge/rtemap_38
> >> 3effde000000-3effdf000000 rw-s 00000000 00:27 61162573
> >> /mnt/huge/rtemap_37
> >> 3effdf000000-3effe0000000 rw-s 00000000 00:27 61162575
> >> /mnt/huge/rtemap_39
> >> 3effe0000000-3effe1000000 rw-s 00000000 00:27 61162576
> >> /mnt/huge/rtemap_40
> >> 3effe1000000-3effe2000000 rw-s 00000000 00:27 61162584
> >> /mnt/huge/rtemap_48
> >> 3effe2000000-3effe3000000 rw-s 00000000 00:27 61162585
> >> /mnt/huge/rtemap_49
> >> 3effe3000000-3effe4000000 rw-s 00000000 00:27 61162586
> >> /mnt/huge/rtemap_50
> >> 3effe4000000-3effe5000000 rw-s 00000000 00:27 61162587
> >> /mnt/huge/rtemap_51
> >> 3effe5000000-3effe6000000 rw-s 00000000 00:27 61162599
> >> /mnt/huge/rtemap_63
> >> 3effe6000000-3effe7000000 rw-s 00000000 00:27 61162598
> >> /mnt/huge/rtemap_62
> >> 3effe7000000-3effe8000000 rw-s 00000000 00:27 61162597
> >> /mnt/huge/rtemap_61
> >> 3effe8000000-3effe9000000 rw-s 00000000 00:27 61162596
> >> /mnt/huge/rtemap_60
> >> 3effe9000000-3effea000000 rw-s 00000000 00:27 61162595
> >> /mnt/huge/rtemap_59
> >> 3effea000000-3effeb000000 rw-s 00000000 00:27 61162594
> >> /mnt/huge/rtemap_58
> >> 3effeb000000-3effec000000 rw-s 00000000 00:27 61162593
> >> /mnt/huge/rtemap_57
> >> 3effec000000-3effed000000 rw-s 00000000 00:27 61162592
> >> /mnt/huge/rtemap_56
> >> 3effed000000-3effee000000 rw-s 00000000 00:27 61162591
> >> /mnt/huge/rtemap_55
> >> 3effee000000-3effef000000 rw-s 00000000 00:27 61162590
> >> /mnt/huge/rtemap_54
> >> 3effef000000-3efff0000000 rw-s 00000000 00:27 61162589
> >> /mnt/huge/rtemap_53
> >> 3efff0000000-3efff1000000 rw-s 00000000 00:27 61162588
> >> /mnt/huge/rtemap_52
> >> 3efff1000000-3efff2000000 rw-s 00000000 00:27 61162565
> >> /mnt/huge/rtemap_29
> >> 3efff2000000-3efff3000000 rw-s 00000000 00:27 61162564
> >> /mnt/huge/rtemap_28
> >> 3efff3000000-3efff4000000 rw-s 00000000 00:27 61162563
> >> /mnt/huge/rtemap_27
> >> 3efff4000000-3efff5000000 rw-s 00000000 00:27 61162562
> >> /mnt/huge/rtemap_26
> >> 3efff5000000-3efff6000000 rw-s 00000000 00:27 61162561
> >> /mnt/huge/rtemap_25
> >> 3efff6000000-3efff7000000 rw-s 00000000 00:27 61162560
> >> /mnt/huge/rtemap_24
> >> 3efff7000000-3efff8000000 rw-s 00000000 00:27 61162559
> >> /mnt/huge/rtemap_23
> >> 3efff8000000-3efff9000000 rw-s 00000000 00:27 61162558
> >> /mnt/huge/rtemap_22
> >> 3efff9000000-3efffa000000 rw-s 00000000 00:27 61162557
> >> /mnt/huge/rtemap_21
> >> 3efffa000000-3efffb000000 rw-s 00000000 00:27 61162556
> >> /mnt/huge/rtemap_20
> >> 3efffb000000-3efffc000000 rw-s 00000000 00:27 61162555
> >> /mnt/huge/rtemap_19
> >> 3efffc000000-3efffd000000 rw-s 00000000 00:27 61162554
> >> /mnt/huge/rtemap_18
> >> 3efffd000000-3efffe000000 rw-s 00000000 00:27 61162553
> >> /mnt/huge/rtemap_17
> >> 3efffe000000-3effff000000 rw-s 00000000 00:27 61162552
> >> /mnt/huge/rtemap_16
> >> 3effff000000-3f0000000000 rw-s 00000000 00:27 61162551
> >> /mnt/huge/rtemap_15
> >> 3fffb7bc0000-3fffb7c10000 rw-p 00000000 00:00 0
> >> 3fffb7c10000-3fffb7c50000 rw-s 00000000 00:12 3926240 
> >> /run/.rte_config
> >> 3fffb7c50000-3fffb7c70000 rw-p 00000000 00:00 0
> >> 3fffb7c70000-3fffb7e20000 r-xp 00000000 08:32 7090531 
> >> /opt/at7.1/lib64/power8/libc-2.19.so
> >> 3fffb7e20000-3fffb7e30000 rw-p 001a0000 08:32 7090531 
> >> /opt/at7.1/lib64/power8/libc-2.19.so
> >> 3fffb7e30000-3fffb7e50000 rw-p 00000000 00:00 0
> >> 3fffb7e50000-3fffb7e70000 r-xp 00000000 08:32 7090563 
> >> /opt/at7.1/lib64/power8/libpthread-2.19.so
> >> 3fffb7e70000-3fffb7e80000 rw-p 00010000 08:32 7090563 
> >> /opt/at7.1/lib64/power8/libpthread-2.19.so
> >> 3fffb7e80000-3fffb7e90000 r-xp 00000000 08:32 7090210 
> >> /opt/at7.1/lib64/libdl-2.19.so
> >> 3fffb7e90000-3fffb7ea0000 rw-p 00000000 08:32 7090210 
> >> /opt/at7.1/lib64/libdl-2.19.so
> >> 3fffb7ea0000-3fffb7ec0000 r-xp 00000000 08:32 7090533
> >> /opt/at7.1/lib64/power8/libz.so.1.2.6
> >> 3fffb7ec0000-3fffb7ed0000 rw-p 00010000 08:32 7090533
> >> /opt/at7.1/lib64/power8/libz.so.1.2.6
> >> 3fffb7ed0000-3fffb7f90000 r-xp 00000000 08:32 7090568 
> >> /opt/at7.1/lib64/power8/libm-2.19.so
> >> 3fffb7f90000-3fffb7fa0000 rw-p 000b0000 08:32 7090568 
> >> /opt/at7.1/lib64/power8/libm-2.19.so
> >> 3fffb7fa0000-3fffb7fc0000 r-xp 00000000 00:00 0 [vdso]
> >> 3fffb7fc0000-3fffb7ff0000 r-xp 00000000 08:32 7090048 
> >> /opt/at7.1/lib64/ld-2.19.so
> >> 3fffb7ff0000-3fffb8000000 rw-p 00020000 08:32 7090048 
> >> /opt/at7.1/lib64/ld-2.19.so
> >> 3ffffffd0000-400000000000 rw-p 00000000 00:00 0 [stack]
> >>
> >>
> >> -----Original Message-----
> >> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> >> Sent: 2016年3月23日 1:11
> >> To: Sergio Gonzalez Monroy <sergio.gonzalez.monroy@intel.com>
> >> Cc: Gowrishankar <gowrishankar.m@linux.vnet.ibm.com>; dev@dpdk.org; 
> >> chaozhu@linux.vnet.ibm.com; David Marchand 
> >> <david.marchand@6wind.com>
> >> Subject: Re: [dpdk-dev] [PATCH] eal/ppc: fix secondary process to 
> >> map hugepages in correct order
> >>
> >> On Tue, Mar 22, 2016 at 04:35:32PM +0000, Sergio Gonzalez Monroy wrote:
> >>> First of all, forgive my ignorance regarding ppc64 and if the 
> >>> questions are naive but after having a look at the already 
> >>> existing code for ppc64 and this patch now, why are we doing this 
> >>> reverse mapping at all?
> >>>
> >>> I guess the question revolves around the comment in eal_memory.c:
> >>> 1316                 /* On PPC64 architecture, the mmap always start from higher
> >>> 1317                  * virtual address to lower address. Here, both the physical
> >>> 1318                  * address and virtual address are in descending order */
> >>>
> >>>   From looking at the code, for ppc64 we do qsort in reverse order 
> >>> and thereafter everything looks to be done to account for that 
> >>> reverse sorting.
> >>>
> >>> CC: Chao Zhu and David Marchand as original author and reviewer of 
> >>> the code.
> >>> Sergio
> >>>
> >> Just to add my 2c here. At one point, with I believe some i686 
> >> installs - don't remember the specific OS/kernel, we found that the 
> >> mmap calls were returning the highest free address first and then 
> >> working downwards - much like what seems to be described here. To fix 
> >> this we changed the mmap code from assuming that addresses are 
> >> mapped upwards, to instead explicitly requesting a large free block 
> >> of memory (mmap of /dev/zero) to find a free address space range of 
> >> the correct size, and then explicitly mmapping each individual page 
> >> to the appropriate place in that free range. With this scheme it 
> >> didn't matter whether the OS tried to mmap the pages from the 
> >> highest or lowest address because we always told the OS where to put the page (and we knew the slot was free from the earlier block mmap).
> >> Would this scheme not also work for PPC in a similar way? (Again, 
> >> forgive unfamiliarity with PPC! :-) )
> >>
> >> /Bruce
> >>
> >>> On 07/03/2016 14:13, Gowrishankar wrote:
> >>>> From: Gowri Shankar <gowrishankar.m@linux.vnet.ibm.com>
> >>>>
> >>>> For a secondary process address space to map hugepages from every 
> >>>> segment of primary process, hugepage_file entries have to be 
> >>>> mapped reversely from the list that primary process updated for 
> >>>> every segment. This is for a reason that, in ppc64, hugepages are 
> >>>> sorted for decrementing addresses.
> >>>> Signed-off-by: Gowrishankar <gowrishankar.m@linux.vnet.ibm.com>
> >>>> ---
> >
>
  
Thomas Monjalon April 15, 2018, 12:28 p.m. UTC | #12
16/02/2017 08:22, Chao Zhu:
> Thomas,
> 
> We have several different internal fixes and haven't reached a conclusion. Let me summarize them and give a final patch set.
> Thanks for your reminder!

This patch is now classified as rejected.
  

Patch

diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 5b9132c..6aea5d0 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -1400,7 +1400,7 @@  rte_eal_hugepage_attach(void)
 {
 	const struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
 	const struct hugepage_file *hp = NULL;
-	unsigned num_hp = 0;
+	unsigned num_hp = 0, mapped_hp = 0;
 	unsigned i, s = 0; /* s used to track the segment number */
 	off_t size;
 	int fd, fd_zero = -1, fd_hugepage = -1;
@@ -1486,14 +1486,12 @@  rte_eal_hugepage_attach(void)
 		goto error;
 	}
 
-	num_hp = size / sizeof(struct hugepage_file);
-	RTE_LOG(DEBUG, EAL, "Analysing %u files\n", num_hp);
-
 	s = 0;
 	while (s < RTE_MAX_MEMSEG && mcfg->memseg[s].len > 0){
 		void *addr, *base_addr;
 		uintptr_t offset = 0;
 		size_t mapping_size;
+		unsigned int index;
 #ifdef RTE_LIBRTE_IVSHMEM
 		/*
 		 * if segment has ioremap address set, it's an IVSHMEM segment and
@@ -1504,6 +1502,8 @@  rte_eal_hugepage_attach(void)
 			continue;
 		}
 #endif
+		num_hp = mcfg->memseg[s].len / mcfg->memseg[s].hugepage_sz;
+		RTE_LOG(DEBUG, EAL, "Analysing %u files in segment %u\n", num_hp, s);
 		/*
 		 * free previously mapped memory so we can map the
 		 * hugepages into the space
@@ -1514,18 +1514,23 @@  rte_eal_hugepage_attach(void)
 		/* find the hugepages for this segment and map them
 		 * we don't need to worry about order, as the server sorted the
 		 * entries before it did the second mmap of them */
+#ifdef RTE_ARCH_PPC_64
+		for (i = num_hp-1; i < num_hp && offset < mcfg->memseg[s].len; i--){
+#else
 		for (i = 0; i < num_hp && offset < mcfg->memseg[s].len; i++){
-			if (hp[i].memseg_id == (int)s){
-				fd = open(hp[i].filepath, O_RDWR);
+#endif
+			index = i + mapped_hp;
+			if (hp[index].memseg_id == (int)s){
+				fd = open(hp[index].filepath, O_RDWR);
 				if (fd < 0) {
 					RTE_LOG(ERR, EAL, "Could not open %s\n",
-						hp[i].filepath);
+						hp[index].filepath);
 					goto error;
 				}
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
-				mapping_size = hp[i].size * hp[i].repeated;
+				mapping_size = hp[index].size * hp[index].repeated;
 #else
-				mapping_size = hp[i].size;
+				mapping_size = hp[index].size;
 #endif
 				addr = mmap(RTE_PTR_ADD(base_addr, offset),
 						mapping_size, PROT_READ | PROT_WRITE,
@@ -1534,7 +1539,7 @@  rte_eal_hugepage_attach(void)
 				if (addr == MAP_FAILED ||
 						addr != RTE_PTR_ADD(base_addr, offset)) {
 					RTE_LOG(ERR, EAL, "Could not mmap %s\n",
-						hp[i].filepath);
+						hp[index].filepath);
 					goto error;
 				}
 				offset+=mapping_size;
@@ -1543,6 +1548,7 @@  rte_eal_hugepage_attach(void)
 		RTE_LOG(DEBUG, EAL, "Mapped segment %u of size 0x%llx\n", s,
 				(unsigned long long)mcfg->memseg[s].len);
 		s++;
+		mapped_hp += num_hp;
 	}
 	/* unmap the hugepage config file, since we are done using it */
 	munmap((void *)(uintptr_t)hp, size);