[dpdk-dev,RFC] eal: add cgroup-aware resource self discovery

Message ID 1453661393-85704-1-git-send-email-jianfeng.tan@intel.com (mailing list archive)
State Superseded, archived

Commit Message

Jianfeng Tan Jan. 24, 2016, 6:49 p.m. UTC
  Current issue: DPDK is not friendly to container environments, because
it pre-allocates resources such as cores and hugepages based on command
line options. So a DPDK application needs to check how much of each
resource is allocated to its container and use that as a reference.

To address that, this patch introduces two APIs:
   a. rte_eal_res_self_discovery, to query how much of a resource can
      be used.
   b. rte_eal_res_self_discovery_apply, to apply self-discovered
      resources to DPDK.

Currently only Linux cgroups are supported; similarly, BSD jails could
be added in the future. And even on Linux, there could be other ways to
query and apply resources, for example through a centralized daemon.

Known issue: the current approach of reading individual cgroup
attributes directly, instead of going through systemd's API, is not a
long-term solution. Please refer to
http://www.freedesktop.org/wiki/Software/systemd/ControlGroupInterface/
for more information.

Test example:
    a. cgcreate -g cpuset,hugetlb:/test-subgroup
    b. cgset -r cpuset.cpus=2-3 test-subgroup
    c. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
    d. cgexec -g cpuset,hugetlb:test-subgroup \
	    ./examples/l2fwd/build/l2fwd --self-discovery=cgroup -n 4 -- -p 3

Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
---
 lib/librte_eal/common/eal_common_options.c      |  39 ++++
 lib/librte_eal/common/eal_internal_cfg.h        |   1 +
 lib/librte_eal/common/eal_options.h             |   2 +
 lib/librte_eal/common/include/rte_eal.h         |  34 +++
 lib/librte_eal/linuxapp/eal/Makefile            |   1 +
 lib/librte_eal/linuxapp/eal/eal_cgroup.c        | 294 ++++++++++++++++++++++++
 lib/librte_eal/linuxapp/eal/eal_hugepage_info.c |   5 +
 7 files changed, 376 insertions(+)
 create mode 100644 lib/librte_eal/linuxapp/eal/eal_cgroup.c
  

Comments

Neil Horman Jan. 25, 2016, 1:46 p.m. UTC | #1
On Mon, Jan 25, 2016 at 02:49:53AM +0800, Jianfeng Tan wrote:
> Current issue: DPDK is not friendly to container environments, because
> it pre-allocates resources such as cores and hugepages based on command
> line options. So a DPDK application needs to check how much of each
> resource is allocated to its container and use that as a reference.
> 
> To address that, this patch introduces two APIs:
>    a. rte_eal_res_self_discovery, to query how much of a resource can
>       be used.
>    b. rte_eal_res_self_discovery_apply, to apply self-discovered
>       resources to DPDK.
> 
> Currently only Linux cgroups are supported; similarly, BSD jails could
> be added in the future. And even on Linux, there could be other ways to
> query and apply resources, for example through a centralized daemon.
> 
> Known issue: the current approach of reading individual cgroup
> attributes directly, instead of going through systemd's API, is not a
> long-term solution. Please refer to
> http://www.freedesktop.org/wiki/Software/systemd/ControlGroupInterface/
> for more information.
> 
> Test example:
>     a. cgcreate -g cpuset,hugetlb:/test-subgroup
>     b. cgset -r cpuset.cpus=2-3 test-subgroup
>     c. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
>     d. cgexec -g cpuset,hugetlb:test-subgroup \
> 	    ./examples/l2fwd/build/l2fwd --self-discovery=cgroup -n 4 -- -p 3
> 
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> ---
>  lib/librte_eal/common/eal_common_options.c      |  39 ++++
>  lib/librte_eal/common/eal_internal_cfg.h        |   1 +
>  lib/librte_eal/common/eal_options.h             |   2 +
>  lib/librte_eal/common/include/rte_eal.h         |  34 +++
>  lib/librte_eal/linuxapp/eal/Makefile            |   1 +
>  lib/librte_eal/linuxapp/eal/eal_cgroup.c        | 294 ++++++++++++++++++++++++
>  lib/librte_eal/linuxapp/eal/eal_hugepage_info.c |   5 +
>  7 files changed, 376 insertions(+)
>  create mode 100644 lib/librte_eal/linuxapp/eal/eal_cgroup.c
> 
...
> -- 
> 2.1.4
> 
> 


This doesn't make a whole lot of sense, for several reasons:

1) Applications, as a general rule shouldn't be interrogating the cgroups
interface at all.  

2) Cgroups aren't the only way in which a cpuset or memoryset can be restricted
(the isolcpus command line argument, or a taskset on a parent process for
instance, but there are several others).

Instead of trying to figure out what cpuset is valid for your process by
interrogating the cgroups hierarchy, you should follow the prescribed
method of calling sched_getaffinity after calling sched_setaffinity.  That will
give you the canonical cpuset that you are executing on, taking all cpuset
filters into account (including cgroups and any other restrictions).  It's far
simpler as well, as it doesn't require a ton of file/string processing.
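
A minimal (untested) sketch of that approach, using only standard
Linux/glibc calls, nothing DPDK-specific assumed:

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		cpu_set_t mask;
		int i;

		/* Request every cpu; the kernel clips the mask to what
		 * cgroups, isolcpus, taskset, etc. actually permit. */
		CPU_ZERO(&mask);
		for (i = 0; i < CPU_SETSIZE; i++)
			CPU_SET(i, &mask);
		if (sched_setaffinity(0, sizeof(mask), &mask) != 0)
			perror("sched_setaffinity");

		/* Read back the canonical working cpuset. */
		if (sched_getaffinity(0, sizeof(mask), &mask) != 0)
			return 1;
		for (i = 0; i < CPU_SETSIZE; i++)
			if (CPU_ISSET(i, &mask))
				printf("usable cpu: %d\n", i);
		return 0;
	}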

Neil
  
Jianfeng Tan Jan. 26, 2016, 2:22 a.m. UTC | #2
Hi Neil,

On 1/25/2016 9:46 PM, Neil Horman wrote:
> On Mon, Jan 25, 2016 at 02:49:53AM +0800, Jianfeng Tan wrote:
...
>> -- 
>> 2.1.4
>>
>>
>
> This doesn't make a whole lot of sense, for several reasons:
>
> 1) Applications, as a general rule shouldn't be interrogating the cgroups
> interface at all.

The main reason to do this in DPDK is that DPDK obtains resource 
information from sysfs and proc, which are not well containerized so 
far, and DPDK pre-allocates resources instead of allocating them 
gradually on demand.

>
> 2) Cgroups aren't the only way in which a cpuset or memoryset can be restricted
> (the isolcpus command line argument, or a taskset on a parent process for
> instance, but there are several others).

Yes, I agree. To enable that, I'd like to design the new API for resource 
self discovery in a flexible way. A parameter "type" is used to specify 
the discovery method. In addition, I'm considering adding a callback 
function pointer so that users can write their own resource discovery 
functions.
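
For illustration, such a hook could look like the following sketch (the
names are hypothetical, not part of this patch):

	#include <stdint.h>

	/* Hypothetical pluggable discovery hook. */
	typedef int (*rte_res_discovery_fn)(char **p_corelist,
					    uint64_t *p_memory);

	struct rte_res_discovery {
		const char *type;           /* e.g. "cgroup", "daemon" */
		rte_res_discovery_fn query; /* fills core list/memory */
	};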

>
> Instead of trying to figure out what cpuset is valid for your process by
> interrogating the cgroups hierarchy, you should follow the prescribed
> method of calling sched_getaffinity after calling sched_setaffinity.  That will
> give you the canonical cpuset that you are executing on, taking all cpuset
> filters into account (including cgroups and any other restrictions).  It's far
> simpler as well, as it doesn't require a ton of file/string processing.

Yes, this way is much better for cpuset discovery. But is there such a 
syscall for hugepages?

Thanks,
Jianfeng

>
> Neil
>
  
Neil Horman Jan. 26, 2016, 2:19 p.m. UTC | #3
On Tue, Jan 26, 2016 at 10:22:18AM +0800, Tan, Jianfeng wrote:
> 
> Hi Neil,
> 
> On 1/25/2016 9:46 PM, Neil Horman wrote:
> >On Mon, Jan 25, 2016 at 02:49:53AM +0800, Jianfeng Tan wrote:
> ...
> >>-- 
> >>2.1.4
> >>
> >>
> >
> >This doesn't make a whole lot of sense, for several reasons:
> >
> >1) Applications, as a general rule shouldn't be interrogating the cgroups
> >interface at all.
> 
> The main reason to do this in DPDK is that DPDK obtains resource information
> from sysfs and proc, which are not well containerized so far, and DPDK
> pre-allocates resources instead of allocating them gradually on demand.
> 
Not disagreeing with this, just suggesting that:

1) Interrogating cgroups really isn't the best way to collect that information
2) Pre-allocating those resources isn't particularly wise without some mechanism
to reallocate them, as resource constraints can change (consider your cpuset
getting rewritten)

> >
> >2) Cgroups aren't the only way in which a cpuset or memoryset can be restricted
> >(the isolcpus command line argument, or a taskset on a parent process for
> >instance, but there are several others).
> 
> Yes, I agree. To enable that, I'd like to design the new API for resource self
> discovery in a flexible way. A parameter "type" is used to specify the
> discovery method. In addition, I'm considering adding a callback
> function pointer so that users can write their own resource discovery
> functions.
> 
Why?  You don't need an API for this, or if you really want one, it can be very
generic if you use POSIX APIs to gather the information.  What you have here is
going to be very Linux-specific, and will need reimplementing for BSD or other
operating systems.  To use the cpuset example, instead of reading and parsing
the mask files in the cgroup filesystem module to find your task and
corresponding mask, just call sched_setaffinity with an all f's mask, then call
sched_getaffinity.  The returned mask will be all the cpus your process is
allowed to execute on, taking into account every limiting filter the system you
are running on offers.

There are similar OS-level POSIX APIs for most resources out there.  You really
don't need to dig through cgroups just to learn what some of those resources are.

> >
> >Instead of trying to figure out what cpuset is valid for your process by
> >interrogating the cgroups hierarchy, you should follow the prescribed
> >method of calling sched_getaffinity after calling sched_setaffinity.  That will
> >give you the canonical cpuset that you are executing on, taking all cpuset
> >filters into account (including cgroups and any other restrictions).  It's far
> >simpler as well, as it doesn't require a ton of file/string processing.
> 
> Yes, this way is much better for cpuset discovery. But is there such a
> syscall for hugepages?
> 
In what capacity?  Interrogating how many hugepages you have, or which node
they are affined to?  Capacity would require reading the requisite proc file, as
there's no POSIX API for this resource.  Node affinity can be implied by setting
the numa policy of the dpdk process and then writing to /proc/nr_hugepages, as
the kernel will attempt to distribute hugepages evenly according to the tasks'
numa policy configuration.
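
Reading the capacity could look like the following (untested,
Linux-specific) sketch that parses /proc/meminfo:

	#include <stdio.h>

	/* Total hugepages of the default size, or -1 on error. */
	static long default_hugepages_total(void)
	{
		FILE *f = fopen("/proc/meminfo", "r");
		char line[256];
		long total = -1;

		if (f == NULL)
			return -1;
		while (fgets(line, sizeof(line), f) != NULL)
			if (sscanf(line, "HugePages_Total: %ld", &total) == 1)
				break;
		fclose(f);
		return total;
	}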

That said, I would advise that you strongly consider not exporting hugepages as
a resource, as:

a) Applications generally don't need to know that they are using hugepages, and
so they don't need to know where said hugepages live, they just allocate memory
via your allocation api and you give them something appropriate

b) Hugepages are a resource that are very specific to Linux, and to X86 Linux at
that.  Some OS implement similar resources, but they may have very different
semantics.  And other Arches may or may not implement various forms of compound
paging at all.  As the DPDK expands to support more OS'es and arches, it would
be nice to ensure that the programming surfaces that you expose have a more
broad level of support.

Neil

> Thanks,
> Jianfeng
> 
> >
> >Neil
> >
> 
>
  
Jianfeng Tan Jan. 27, 2016, 12:02 p.m. UTC | #4
Hi Neil,

On 1/26/2016 10:19 PM, Neil Horman wrote:
> On Tue, Jan 26, 2016 at 10:22:18AM +0800, Tan, Jianfeng wrote:
>> Hi Neil,
>>
>> On 1/25/2016 9:46 PM, Neil Horman wrote:
>>> On Mon, Jan 25, 2016 at 02:49:53AM +0800, Jianfeng Tan wrote:
>> ...
>>>> -- 
>>>> 2.1.4
>>>>
>>>>
>>> This doesn't make a whole lot of sense, for several reasons:
>>>
>>> 1) Applications, as a general rule shouldn't be interrogating the cgroups
>>> interface at all.
>> The main reason to do this in DPDK is that DPDK obtains resource information
>> from sysfs and proc, which are not well containerized so far, and DPDK
>> pre-allocates resources instead of allocating them gradually on demand.
>>
> Not disagreeing with this, just suggesting that:
>
> 1) Interrogating cgroups really isn't the best way to collect that information
> 2) Pre-allocating those resources isn't particularly wise without some mechanism
> to reallocate them, as resource constraints can change (consider your cpuset
> getting rewritten)

In the case of reallocation:
For cpuset, DPDK panics during initialization if set_affinity fails, but 
after that, a cpuset rewrite will not bring any problems, I believe.
For memory, if a running application uses 2G of hugepages and the admin 
then decreases the hugetlb cgroup limit to 1G, the application will not 
get killed unless it tries to access more hugepages (I'll double check this).

So another way to address this problem is to add an option so that DPDK 
tries its best to allocate those resources, and if that fails, it just 
posts a warning and uses whatever resources were allocated, instead of 
panicking. What do you think?

>
>>> 2) Cgroups aren't the only way in which a cpuset or memoryset can be restricted
>>> (the isolcpus command line argument, or a taskset on a parent process for
>>> instance, but there are several others).
>> Yes, I agree. To enable that, I'd like to design the new API for resource self
>> discovery in a flexible way. A parameter "type" is used to specify the
>> discovery method. In addition, I'm considering adding a callback
>> function pointer so that users can write their own resource discovery
>> functions.
>>
> Why?  You don't need an API for this, or if you really want one, it can be very
> generic if you use POSIX APIs to gather the information.  What you have here is
> going to be very Linux-specific, and will need reimplementing for BSD or other
> operating systems.  To use the cpuset example, instead of reading and parsing
> the mask files in the cgroup filesystem module to find your task and
> corresponding mask, just call sched_setaffinity with an all f's mask, then call
> sched_getaffinity.  The returned mask will be all the cpus your process is
> allowed to execute on, taking into account every limiting filter the system you
> are running on offers.

Yes, it makes sense on cpu's side.

>
> There are similar OS-level POSIX APIs for most resources out there.  You really
> don't need to dig through cgroups just to learn what some of those resources are.
>
>>> Instead of trying to figure out what cpuset is valid for your process by
>>> interrogating the cgroups hierarchy, you should follow the prescribed
>>> method of calling sched_getaffinity after calling sched_setaffinity.  That will
>>> give you the canonical cpuset that you are executing on, taking all cpuset
>>> filters into account (including cgroups and any other restrictions).  It's far
>>> simpler as well, as it doesn't require a ton of file/string processing.
>> Yes, this way is much better for cpuset discovery. But is there such a
>> syscall for hugepages?
>>
> In what capacity?  Interrogating how many hugepages you have, or which node
> they are affined to?  Capacity would require reading the requisite proc file, as
> there's no POSIX API for this resource.  Node affinity can be implied by setting
> the numa policy of the dpdk process and then writing to /proc/nr_hugepages, as
> the kernel will attempt to distribute hugepages evenly according to the tasks'
> numa policy configuration.

For memory affinity, I believe the existing way of reading 
/proc/self/pagemap already handles the problem. What I was asking is how 
much memory (or hugepages in Linux's case) can be used. By the way, what 
is /proc/nr_hugepages?

>
> That said, I would advise that you strongly consider not exporting hugepages as
> a resource, as:
>
> a) Applications generally don't need to know that they are using hugepages, and
> so they don't need to know where said hugepages live, they just allocate memory
> via your allocation api and you give them something appropriate

But the allocation API provider, the DPDK library, needs to know 
whether it's using hugepages or not.

> b) Hugepages are a resource that are very specific to Linux, and to X86 Linux at
> that.  Some OS implement similar resources, but they may have very different
> semantics.  And other Arches may or may not implement various forms of compound
> paging at all.  As the DPDK expands to support more OS'es and arches, it would
> be nice to ensure that the programming surfaces that you expose have a more
> broad level of support.

That's why I put the current implementation in lib/librte_eal/linuxapp/. 
And the new API uses the words "cores" and "memory", which is very 
generic IMO. In Linux's context, memory is interpreted as hugepages 
(maybe not correct, because DPDK can be used with 4K memory). For other 
OSes, we could add similar limitations in their semantics.


Thanks,
Jianfeng

>
> Neil
>
>> Thanks,
>> Jianfeng
>>
>>> Neil
>>>
>>
  
Neil Horman Jan. 27, 2016, 5:30 p.m. UTC | #5
On Wed, Jan 27, 2016 at 08:02:27PM +0800, Tan, Jianfeng wrote:
> Hi Neil,
> 
> On 1/26/2016 10:19 PM, Neil Horman wrote:
> >On Tue, Jan 26, 2016 at 10:22:18AM +0800, Tan, Jianfeng wrote:
> >>Hi Neil,
> >>
> >>On 1/25/2016 9:46 PM, Neil Horman wrote:
> >>>On Mon, Jan 25, 2016 at 02:49:53AM +0800, Jianfeng Tan wrote:
> >>...
> >>>>-- 
> >>>>2.1.4
> >>>>
> >>>>
> >>>This doesn't make a whole lot of sense, for several reasons:
> >>>
> >>>1) Applications, as a general rule shouldn't be interrogating the cgroups
> >>>interface at all.
> >>The main reason to do this in DPDK is that DPDK obtains resource information
> >>from sysfs and proc, which are not well containerized so far, and DPDK
> >>pre-allocates resources instead of allocating them gradually on demand.
> >>
> >Not disagreeing with this, just suggesting that:
> >
> >1) Interrogating cgroups really isn't the best way to collect that information
> >2) Pre-allocating those resources isn't particularly wise without some mechanism
> >to reallocate them, as resource constraints can change (consider your cpuset
> >getting rewritten)
> 
> In the case of reallocation:
> For cpuset, DPDK panics during initialization if set_affinity fails, but
> after that, a cpuset rewrite will not bring any problems, I believe.
Yes, that seems reasonable, but I think you need to update
rte_thread_set_affinity to not assume that success in pthread_setaffinity_np
means that all cpus in the provided mask are available.  That is to say, cpusetp
is subsequently stored in lcore information after the set, but may not reflect
the actual working set of processors; you should follow a successful set with a
call to pthread_getaffinity_np to retrieve the actual working cpuset.

As for subsequent changes to the cpuset, I'm not sure how you want to handle
that. I would think that you might want to run a check periodically or allow for
a SIGHUP or some other signal to trigger a rescan of your working cpuset so as
to keep the application in sync with the system.
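
For instance, a minimal (untested) sketch of that pattern; the helper
name is illustrative, not existing DPDK code:

	#define _GNU_SOURCE
	#include <pthread.h>
	#include <sched.h>

	/* Set affinity, then read back what the kernel actually applied. */
	static int set_and_confirm_affinity(cpu_set_t *cpusetp)
	{
		if (pthread_setaffinity_np(pthread_self(),
					   sizeof(cpu_set_t), cpusetp) != 0)
			return -1;

		/* Success does not mean every requested cpu was granted;
		 * retrieve the actual working set before caching it. */
		if (pthread_getaffinity_np(pthread_self(),
					   sizeof(cpu_set_t), cpusetp) != 0)
			return -1;
		return 0;
	}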

> For memory, if a running application uses 2G of hugepages and the admin then
> decreases the hugetlb cgroup limit to 1G, the application will not get killed
> unless it tries to access more hugepages (I'll double check this).
> 
No, the semantics should be identical to malloc/mmap (if you use the alloc_hugepages
API or the mmap API).  You should get a NULL return or other non-fatal indicator
if you allocate more than is available.
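
E.g., a minimal (untested) sketch of that failure mode, using plain mmap
with Linux's MAP_HUGETLB flag:

	#include <stdio.h>
	#include <sys/mman.h>

	/* Returns NULL rather than crashing when no hugepages are left. */
	static void *try_alloc_hugepage(size_t len)
	{
		void *addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
				  MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
				  -1, 0);
		if (addr == MAP_FAILED) {
			perror("mmap(MAP_HUGETLB)");
			return NULL;
		}
		return addr;
	}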

> So another way to address this problem is to add an option so that DPDK tries
> its best to allocate those resources, and if that fails, it just posts a
> warning and uses whatever resources were allocated, instead of panicking.
> What do you think?
> 
Yes, that makes sense

> >
> >>>2) Cgroups aren't the only way in which a cpuset or memoryset can be restricted
> >>>(the isolcpus command line argument, or a taskset on a parent process for
> >>>instance, but there are several others).
> >>Yes, I agree. To enable that, I'd like to design the new API for resource self
> >>discovery in a flexible way. A parameter "type" is used to specify the
> >>discovery method. In addition, I'm considering adding a callback
> >>function pointer so that users can write their own resource discovery
> >>functions.
> >>
> >Why?  You don't need an API for this, or if you really want one, it can be very
> >generic if you use POSIX APIs to gather the information.  What you have here is
> >going to be very Linux-specific, and will need reimplementing for BSD or other
> >operating systems.  To use the cpuset example, instead of reading and parsing
> >the mask files in the cgroup filesystem module to find your task and
> >corresponding mask, just call sched_setaffinity with an all f's mask, then call
> >sched_getaffinity.  The returned mask will be all the cpus your process is
> >allowed to execute on, taking into account every limiting filter the system you
> >are running on offers.
> 
> Yes, it makes sense on cpu's side.
> 
> >
> >There are similar OS-level POSIX APIs for most resources out there.  You really
> >don't need to dig through cgroups just to learn what some of those resources are.
> >
> >>>Instead of trying to figure out what cpuset is valid for your process by
> >>>interrogating the cgroups hierarchy, you should follow the prescribed
> >>>method of calling sched_getaffinity after calling sched_setaffinity.  That will
> >>>give you the canonical cpuset that you are executing on, taking all cpuset
> >>>filters into account (including cgroups and any other restrictions).  It's far
> >>>simpler as well, as it doesn't require a ton of file/string processing.
> >>Yes, this way is much better for cpuset discovery. But is there such a
> >>syscall for hugepages?
> >>
> >In what capacity?  Interrogating how many hugepages you have, or which node
> >they are affined to?  Capacity would require reading the requisite proc file, as
> >there's no POSIX API for this resource.  Node affinity can be implied by setting
> >the numa policy of the dpdk process and then writing to /proc/nr_hugepages, as
> >the kernel will attempt to distribute hugepages evenly according to the tasks'
> >numa policy configuration.
> 
> For memory affinity, I believe the existing way of reading
> /proc/self/pagemap already handles the problem. What I was asking is how much
> memory (or hugepages in Linux's case) can be used. By the way, what is
> /proc/nr_hugepages?
> 
For affinity, you can parse /proc/self/pagemap or any number of other procfiles,
but again, doing so is going to be very OS-specific, and doesn't get you much in
terms of resource management. It only tells you where the pages reside now.

/proc/nr_hugepages is the proc tunable that lets you allocate/reallocate
hugepages.

> >
> >That said, I would advise that you strongly consider not exporting hugepages as
> >a resource, as:
> >
> >a) Applications generally don't need to know that they are using hugepages, and
> >so they don't need to know where said hugepages live, they just allocate memory
> >via your allocation api and you give them something appropriate
> 
> But the allocation API provider, the DPDK library, needs to know whether it's
> using hugepages or not.
> 
Right, but your purpose was to expose this library to applications.  I'm
saying you really don't need to expose such a library API to applications. If
you just want to use it internally in dpdk, that's fine.

> >b) Hugepages are a resource that are very specific to Linux, and to X86 Linux at
> >that.  Some OS implement similar resources, but they may have very different
> >semantics.  And other Arches may or may not implement various forms of compound
> >paging at all.  As the DPDK expands to support more OS'es and arches, it would
> >be nice to ensure that the programming surfaces that you expose have a more
> >broad level of support.
> 
> That's why I put the current implementation in lib/librte_eal/linuxapp/. And
> the new API uses the words "cores" and "memory", which is very generic IMO. In
> Linux's context, memory is interpreted as hugepages (maybe not correct because
> DPDK can be used with 4K memory). For other OSes, we could add similar
> limitations in their semantics.
> 
> 
> Thanks,
> Jianfeng
> 
> >
> >Neil
> >
> >>Thanks,
> >>Jianfeng
> >>
> >>>Neil
> >>>
> >>
> 
>
  

Patch

diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index 29942ea..7235473 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -95,6 +95,7 @@  eal_long_options[] = {
 	{OPT_VFIO_INTR,         1, NULL, OPT_VFIO_INTR_NUM        },
 	{OPT_VMWARE_TSC_MAP,    0, NULL, OPT_VMWARE_TSC_MAP_NUM   },
 	{OPT_XEN_DOM0,          0, NULL, OPT_XEN_DOM0_NUM         },
+	{OPT_SELF_DISCOVERY,    1, NULL, OPT_SELF_DISCOVERY_NUM   },
 	{0,                     0, NULL, 0                        }
 };
 
@@ -128,6 +129,7 @@  eal_reset_internal_config(struct internal_config *internal_cfg)
 	internal_cfg->force_nchannel = 0;
 	internal_cfg->hugefile_prefix = HUGEFILE_PREFIX_DEFAULT;
 	internal_cfg->hugepage_dir = NULL;
+	internal_cfg->self_discovery = NULL;
 	internal_cfg->force_sockets = 0;
 	/* zero out the NUMA config */
 	for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
@@ -755,6 +757,24 @@  eal_parse_proc_type(const char *arg)
 }
 
 int
+__attribute__((weak))
+rte_eal_res_self_discovery(const char *type __rte_unused,
+			   char **p_corelist __rte_unused,
+			   uint64_t *p_memory __rte_unused)
+{
+	return -1;
+}
+
+int
+__attribute__((weak))
+rte_eal_res_self_discovery_apply(const char *type __rte_unused,
+				 int enable_core __rte_unused,
+				 int enable_mem __rte_unused)
+{
+	return -1;
+}
+
+int
 eal_parse_common_option(int opt, const char *optarg,
 			struct internal_config *conf)
 {
@@ -897,6 +917,25 @@  eal_parse_common_option(int opt, const char *optarg,
 		}
 		break;
 
+	case OPT_SELF_DISCOVERY_NUM: {
+		char *corelist;
+
+		if (rte_eal_res_self_discovery(optarg, &corelist, NULL) < 0) {
+			RTE_LOG(ERR, EAL, "invalid parameter for --"
+				OPT_SELF_DISCOVERY "\n");
+			return -1;
+		}
+
+		if (eal_parse_corelist(corelist) < 0) {
+			RTE_LOG(ERR, EAL, "invalid core list\n");
+			free(corelist);
+			return -1;
+		}
+		free(corelist);
+		/* Save the type here for the memory limit */
+		internal_config.self_discovery = strdup(optarg);
+
+		break;
+	}
+
 	/* don't know what to do, leave this to caller */
 	default:
 		return 1;
diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index 5f1367e..f3c8e31 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -83,6 +83,7 @@  struct internal_config {
 	volatile enum rte_intr_mode vfio_intr_mode;
 	const char *hugefile_prefix;      /**< the base filename of hugetlbfs files */
 	const char *hugepage_dir;         /**< specific hugetlbfs directory to use */
+	const char *self_discovery;       /**< specific type of self_discovery */
 
 	unsigned num_hugepage_sizes;      /**< how many sizes on this system */
 	struct hugepage_info hugepage_info[MAX_HUGEPAGE_SIZES];
diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
index a881c62..a499d73 100644
--- a/lib/librte_eal/common/eal_options.h
+++ b/lib/librte_eal/common/eal_options.h
@@ -83,6 +83,8 @@  enum {
 	OPT_VMWARE_TSC_MAP_NUM,
 #define OPT_XEN_DOM0          "xen-dom0"
 	OPT_XEN_DOM0_NUM,
+#define OPT_SELF_DISCOVERY    "self-discovery"
+	OPT_SELF_DISCOVERY_NUM,
 	OPT_LONG_MAX_NUM
 };
 
diff --git a/lib/librte_eal/common/include/rte_eal.h b/lib/librte_eal/common/include/rte_eal.h
index d2816a8..ff81484 100644
--- a/lib/librte_eal/common/include/rte_eal.h
+++ b/lib/librte_eal/common/include/rte_eal.h
@@ -220,6 +220,40 @@  int rte_eal_has_hugepages(void);
 int rte_sys_gettid(void);
 
 /**
+ * Query resources available to this process via self discovery.
+ *
+ * @param type
+ *   Type of resource self discovery.
+ * @param p_corelist
+ *   On success, filled with the core list which can be used. The caller
+ *   must free it.
+ * @param p_memory
+ *   On success, filled with how much memory (in bytes) can be used.
+ *
+ * @return
+ *   - (-1), on failure.
+ *   - 0, on success.
+ */
+int rte_eal_res_self_discovery(const char *type,
+			       char **p_corelist, uint64_t *p_memory);
+/**
+ * Apply resources obtained through self discovery to DPDK.
+ *
+ * @param type
+ *   Type of resource self discovery.
+ * @param enable_core
+ *   If non-zero, apply the discovered core resources.
+ * @param enable_mem
+ *   If non-zero, apply the discovered memory resources.
+ *
+ * @return
+ *   - (-1), on failure.
+ *   - 0, on success.
+ */
+int rte_eal_res_self_discovery_apply(const char *type,
+				     int enable_core, int enable_mem);
+
+/**
  * Get system unique thread id.
  *
  * @return
diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
index 26eced5..834ae2f 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -87,6 +87,7 @@  SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_common_devargs.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_common_dev.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_common_options.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_common_thread.c
+SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += eal_cgroup.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += rte_malloc.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += malloc_elem.c
 SRCS-$(CONFIG_RTE_LIBRTE_EAL_LINUXAPP) += malloc_heap.c
diff --git a/lib/librte_eal/linuxapp/eal/eal_cgroup.c b/lib/librte_eal/linuxapp/eal/eal_cgroup.c
new file mode 100644
index 0000000..d6a04ee
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_cgroup.c
@@ -0,0 +1,294 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/file.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <mntent.h>
+#include <inttypes.h>
+
+#include <rte_log.h>
+#include <rte_eal.h>
+#include <rte_common.h>
+
+#include "eal_internal_cfg.h"
+
+static int pid;
+
+static char *
+find_controller_dfs(const char *dir_path)
+{
+	FILE *f;
+	char *line;
+	char *ret;
+	size_t len;
+	ssize_t read;
+	DIR *dir;
+	struct dirent *ent;
+	char filepath[PATH_MAX];
+
+	/* 1. Check whether this process belongs to this cgroup. */
+	snprintf(filepath, sizeof(filepath)-1, "%s/tasks", dir_path);
+	f = fopen(filepath, "r");
+	if (f == NULL)
+		return NULL;
+	len = 0;
+	line = NULL;
+	while ((read = getline(&line, &len, f)) != -1) {
+		/* getline() reuses the buffer across iterations */
+		if (atoi(line) == pid)
+			break;
+	}
+	free(line);
+	fclose(f);
+	if (read != -1)
+		return strdup(dir_path);
+
+	/* 2. Recurse into child cgroups. */
+	if (!(dir = opendir(dir_path)))
+		return NULL;
+
+	ret = NULL;
+	while ((ent = readdir(dir)) != NULL) {
+		if (ent->d_type != DT_DIR)
+			continue;
+		if (strcmp(ent->d_name, ".") == 0 ||
+		    strcmp(ent->d_name, "..") == 0)
+			continue;
+
+		snprintf(filepath, sizeof(filepath)-1, "%s/%s",
+			 dir_path, ent->d_name);
+
+		ret = find_controller_dfs(filepath);
+		if (ret != NULL)
+			break;
+	}
+
+	closedir(dir);
+	return ret;
+}
+
+static char *
+find_controller(const char *controller)
+{
+	FILE *f;
+	char *path;
+	struct mntent *ent;
+
+	static const char *proc_mounts = "/proc/mounts";
+	static const char *fs_type = "cgroup";
+
+	f = setmntent(proc_mounts, "r");
+	if (f == NULL) {
+		RTE_LOG(ERR, EAL, "Cannot open %s\n", proc_mounts);
+		return NULL;
+	}
+
+	while (NULL != (ent = getmntent(f))) {
+		if (strcmp(ent->mnt_type, fs_type) != 0)
+			continue;
+		if (hasmntopt(ent, controller) == NULL)
+			continue;
+		break;
+	}
+
+	if (ent == NULL) {
+		path = NULL;
+		goto end;
+	}
+
+	path = find_controller_dfs(ent->mnt_dir);
+end:
+	endmntent(f);
+	return path;
+}
+
+static inline char *
+get_oneline_from_file(const char *path)
+{
+	FILE *f;
+	char *line = NULL;
+	size_t len = 0;
+
+	if (NULL == (f = fopen(path, "r")))
+		return NULL;
+	if (getline(&line, &len, f) == -1) {
+		/* do not dereference line when the file is empty */
+		free(line);
+		fclose(f);
+		return NULL;
+	}
+	line[strcspn(line, "\n")] = 0;
+	fclose(f);
+	return line;
+}
+
+static int
+cgroup_cpuset(char **p_corelist, int enable __rte_unused)
+{
+	char filepath[PATH_MAX];
+	char *controller;
+
+	controller = find_controller("cpuset");
+	if (controller == NULL)
+		return -1;
+
+	snprintf(filepath, sizeof(filepath)-1, "%s/cpuset.cpus", controller);
+	*p_corelist = get_oneline_from_file(filepath);
+	free(controller);
+	if (*p_corelist == NULL)
+		return -1;
+
+	RTE_LOG(INFO, EAL, "cgroup cpuset: %s\n", *p_corelist);
+	return 0;
+}
+
+static inline uint64_t
+get_hugetlb_limit(const char *path)
+{
+	uint64_t limit = 0;
+	char *str;
+
+	str = get_oneline_from_file(path);
+	if (str == NULL)
+		return 0;
+	sscanf(str, "%"PRIu64, &limit);
+	free(str);
+	return limit;
+}
+
+static int
+cgroup_hugetlb(uint64_t *p_memory, int enable)
+{
+	unsigned i;
+	char filepath[PATH_MAX];
+	char *controller;
+	DIR *dir;
+	struct dirent *ent;
+	uint64_t memory = 0;
+	static char prefix[] = "hugetlb";
+	static int prefix_len = sizeof(prefix) - 1;
+	static char suffix[] = "limit_in_bytes";
+
+	controller = find_controller("hugetlb");
+	if (controller == NULL)
+		return -1;
+
+	dir = opendir(controller);
+	if (dir == NULL) {
+		free(controller);
+		return -1;
+	}
+
+	while ((ent = readdir(dir)) != NULL) {
+		if (strncmp(ent->d_name, prefix, prefix_len) != 0)
+			continue;
+
+		/* expect entries named like "hugetlb.1GB.limit_in_bytes" */
+		if (ent->d_name[prefix_len] != '.')
+			continue;
+
+		char *sz_beg = ent->d_name + prefix_len + 1;
+		char *sz_end = strchr(sz_beg, '.');
+
+		if (sz_end == NULL || strcmp(sz_end + 1, suffix) != 0)
+			continue;
+
+		char *tmp = strndup(sz_beg, sz_end - sz_beg);
+		uint64_t pagesize = rte_str_to_size(tmp);
+		free(tmp);
+
+		snprintf(filepath, sizeof(filepath)-1, "%s/%s",
+			 controller, ent->d_name);
+		uint64_t m_limit = get_hugetlb_limit(filepath);
+		memory += m_limit;
+
+		/* Record this information in internal_config if hugepages
+		 * are already initialized.
+		 */
+		if (!enable)
+			continue;
+		for (i = 0; i < internal_config.num_hugepage_sizes; ++i) {
+			struct hugepage_info *hp;
+
+			hp = &internal_config.hugepage_info[i];
+			if (hp->hugepage_sz != pagesize)
+				continue;
+
+			if (m_limit < hp->hugepage_sz * hp->num_pages[0])
+				hp->num_pages[0] = m_limit / hp->hugepage_sz;
+		}
+	}
+
+	closedir(dir);
+	free(controller);
+	*p_memory = memory;
+	RTE_LOG(INFO, EAL, "cgroup hugetlb: %"PRIu64" bytes\n", *p_memory);
+	return 0;
+}
+
+static int
+resource_self_discovery(const char *type, char **p_corelist, int enable_core,
+			uint64_t *p_memory, int enable_mem)
+{
+	if (strcmp(type, "cgroup") != 0) {
+		RTE_LOG(ERR, EAL, "type not supported: %s\n", type);
+		return -1;
+	}
+
+	pid = getpid();
+
+	if (p_corelist != NULL && cgroup_cpuset(p_corelist, enable_core) < 0) {
+		RTE_LOG(ERR, EAL, "Failed to discover cpuset resource\n");
+		return -1;
+	}
+	if (p_memory != NULL && cgroup_hugetlb(p_memory, enable_mem) < 0) {
+		RTE_LOG(ERR, EAL, "Failed to discover hugetlb resource\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+int
+rte_eal_res_self_discovery(const char *type, char **p_corelist,
+			   uint64_t *p_memory)
+{
+	return resource_self_discovery(type, p_corelist, 0, p_memory, 0);
+}
+
+int
+rte_eal_res_self_discovery_apply(const char *type, int enable_core,
+				 int enable_mem)
+{
+	char *corelist, **pc = NULL;
+	uint64_t mem, *pm = NULL;
+
+	if (enable_core)
+		pc = &corelist;
+	if (enable_mem)
+		pm = &mem;
+
+	return resource_self_discovery(type, pc, enable_core,
+				       pm, enable_mem);
+}
diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
index 18858e2..a6b6548 100644
--- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
+++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
@@ -354,6 +354,11 @@  eal_hugepage_info_init(void)
 	qsort(&internal_config.hugepage_info[0], num_sizes,
 	      sizeof(internal_config.hugepage_info[0]), compare_hpi);
 
+	/* Apply cgroup hugetlb limit before we really use hugepages */
+	if (internal_config.self_discovery)
+		rte_eal_res_self_discovery_apply(internal_config.self_discovery,
+						 0, 1);
+
 	/* now we have all info, check we have at least one valid size */
 	for (i = 0; i < num_sizes; i++)
 		if (internal_config.hugepage_info[i].hugedir != NULL &&