[dpdk-dev,v2] arm: detect NEON by RTE_MACHINE_CPUFLAG_NEON flag only
Commit Message
The RTE_MACHINE_CPUFLAG_NEON was only a result of the gcc testing. However,
the target CPU may not support NEON or the user can disable to use it (as it
does not always improve the performance).
The RTE_MACHINE_CPUFLAG_NEON detection is now based on both, the __ARM_NEON_FP
feature from gcc and CONFIG_RTE_ARCH_ARM_NEON from the .config. The memcpy
implemention is driven by RTE_MACHINE_CPUFLAG_NEON, so the reason to disable
NEON is hidden for the actual code.
Signed-off-by: Jan Viktorin <viktorin@rehivetech.com>
---
v2: fix l3fwm_em.c to refer RTE_MACHINE_CPUFLAG_NEON instead of __ARM_NEON
---
examples/l3fwd/l3fwd_em.c | 2 +-
lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h | 4 ++--
mk/machine/armv7a/rte.vars.mk | 2 +-
mk/rte.cpuflags.mk | 2 ++
4 files changed, 6 insertions(+), 4 deletions(-)
Comments
On Sat, 19 Mar 2016 10:26:30 +0100
Jan Viktorin <viktorin@rehivetech.com> wrote:
> The RTE_MACHINE_CPUFLAG_NEON was only a result of the gcc testing. However,
> the target CPU may not support NEON or the user can disable to use it (as it
> does not always improve the performance).
>
> The RTE_MACHINE_CPUFLAG_NEON detection is now based on both, the __ARM_NEON_FP
> feature from gcc and CONFIG_RTE_ARCH_ARM_NEON from the .config. The memcpy
> implemention is driven by RTE_MACHINE_CPUFLAG_NEON, so the reason to disable
> NEON is hidden for the actual code.
Unfortunately, I've overlooked a mistake. I have to remake the patch a
bit, sorry. I am a bit confused about the __ARM_NEON and __ARM_NEON_FP
settings.
The arm_neon.h is available only when the __ARM_NEON is present. But...
$ arm-buildroot-linux-gnueabi-gcc -dM -E - < /dev/null | grep "_FP\|_NEON"
#define __ARM_FP 12
#define __ARM_NEON_FP 4
#define __VFP_FP__ 1
Without -mfpu=neon we don't have arm_neon.h. I consider this strange as
we are not interested in the FPU features but in the SIMD features...
$ arm-buildroot-linux-gnueabi-gcc -mfpu=neon -dM -E - < /dev/null | grep "_FP\|_NEON"
#define __ARM_FP 12
#define __ARM_NEON_FP 4
#define __ARM_NEON__ 1
#define __VFP_FP__ 1
#define __ARM_NEON 1
$ arm-buildroot-linux-gnueabi-gcc -mfpu=neon-vfpv4 -dM -E - < /dev/null | grep "_FP\|_NEON"
#define __ARM_FP 14
#define __ARM_NEON_FP 6
#define __FP_FAST_FMAF 1
#define __FP_FAST_FMAL 1
#define __ARM_NEON__ 1
#define __VFP_FP__ 1
#define __ARM_NEON 1
#define __FP_FAST_FMA 1
ARM64 is OK here...
$ aarch64-buildroot-linux-gnu-gcc -dM -E - < /dev/null | grep "NEON\|FP"
#define __FP_FAST_FMAF 1
#define __ARM_NEON 1
#define __FP_FAST_FMA 1
So...
>
> Signed-off-by: Jan Viktorin <viktorin@rehivetech.com>
> ---
> v2: fix l3fwm_em.c to refer RTE_MACHINE_CPUFLAG_NEON instead of __ARM_NEON
> ---
> examples/l3fwd/l3fwd_em.c | 2 +-
> lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h | 4 ++--
> mk/machine/armv7a/rte.vars.mk | 2 +-
> mk/rte.cpuflags.mk | 2 ++
> 4 files changed, 6 insertions(+), 4 deletions(-)
>
[...]
> #ifdef __cplusplus
> }
> diff --git a/mk/machine/armv7a/rte.vars.mk b/mk/machine/armv7a/rte.vars.mk
> index 48d3979..7a167c1 100644
> --- a/mk/machine/armv7a/rte.vars.mk
> +++ b/mk/machine/armv7a/rte.vars.mk
> @@ -62,6 +62,6 @@ ifdef CONFIG_RTE_ARCH_ARM_TUNE
> MACHINE_CFLAGS += -mtune=$(CONFIG_RTE_ARCH_ARM_TUNE)
> endif
>
> -ifeq ($(CONFIG_RTE_ARCH_ARM_NEON),y)
> +ifdef $(RTE_MACHINE_CPUFLAG_NEON)
> MACHINE_CFLAGS += -mfpu=neon
> endif
RTE_MACHINE_CPUFLAG_NEON is not *yet* set here (cpuflags are detected later)...
So the -mfpu=neon is never configured and the build fails. The
MACHINE_CFLAGS should rather depend on the CONFIG_RTE_ARCH_ARM_NEON
telling the build-system "we want NEON".
> diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
> index 19a3e7e..1947511 100644
> --- a/mk/rte.cpuflags.mk
> +++ b/mk/rte.cpuflags.mk
> @@ -111,9 +111,11 @@ CPUFLAGS += VSX
> endif
>
> # ARM flags
> +ifeq ($(CONFIG_RTE_ARCH_ARM_NEON),y)
> ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_NEON_FP),)
Here, we should check __ARM_NEON (to be ARM32/64 compatible) but we
cannot see __ARM_NEON without the -mfpu=neon flag.
Jerin, does the current DPDK detect NEON feature on ARM64? I'd say, it
cannot.
So, we should probably check both __ARM_NEON and __ARM_NEON_FP here.
Another point, related to the original discussion:
http://dpdk.org/ml/archives/dev/2016-March/thread.html#35972
we should probably have a config option to enable memcpy optimizations
separated from the NEON support. The NEON support can then be detected
only by the __ARM_NEON flag. The ARMv7 would have the -mfpu=neon always
set. If somebody likes to customize this, she would do it by hand. The
result is, we correctly detect NEON during build time from the GCC.
> CPUFLAGS += NEON
> endif
> +endif
>
> ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRC32),)
> CPUFLAGS += CRC32
Hello,
finally, I've broken the original patch into 4 pieces as it solves more issues
and not just a single one.
* As Thomas have already mentioned, the CONFIG_RTE_ARCH_ARM_NEON is confusing.
So, I've decided to remove it entirely and provide another option for a more
specific purpose: CONFIG_RTE_ARCH_ARM_NEON_MEMCPY.
* The RTE_MACHINE_CPUFLAG_NEON detection is now based on __ARM_NEON as only
this compiler definition gives us the arm_neon.h and is compatible with
arm64. In DPDK, the RTE_MACHINE_CPUFLAG_NEON should be prefered over the
__ARM_NEON. I'd recommend the same for x86 code (__SSE2__)...
History:
v2
* fix l3fwm_em.c to refer RTE_MACHINE_CPUFLAG_NEON instead of __ARM_NEON
v3
* divided into 4 patches as there are more independent problems
* compiles well for armv7
* (probably) fixes RTE_MACHINE_CPUFLAG_NEON detection on arm64
Jan Viktorin (4):
arm: remove CONFIG_RTE_ARCH_ARM_NEON
arm: detect NEON cpu feature by checking __ARM_NEON
arm: detect NEON by checking RTE_MACHINE_CPUFLAG_NEON
eal/arm: introduce CONFIG_RTE_ARCH_ARM_NEON_MEMCPY
config/defconfig_arm-armv7a-linuxapp-gcc | 2 +-
config/defconfig_arm64-armv8a-linuxapp-gcc | 1 -
examples/l3fwd/l3fwd_em.c | 2 +-
lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h | 8 ++++++--
mk/machine/armv7a/rte.vars.mk | 2 --
mk/rte.cpuflags.mk | 2 +-
6 files changed, 9 insertions(+), 8 deletions(-)
2016-03-19 20:58, Jan Viktorin:
> Hello,
>
> finally, I've broken the original patch into 4 pieces as it solves more issues
> and not just a single one.
>
> * As Thomas have already mentioned, the CONFIG_RTE_ARCH_ARM_NEON is confusing.
> So, I've decided to remove it entirely and provide another option for a more
> specific purpose: CONFIG_RTE_ARCH_ARM_NEON_MEMCPY.
>
> * The RTE_MACHINE_CPUFLAG_NEON detection is now based on __ARM_NEON as only
> this compiler definition gives us the arm_neon.h and is compatible with
> arm64. In DPDK, the RTE_MACHINE_CPUFLAG_NEON should be prefered over the
> __ARM_NEON. I'd recommend the same for x86 code (__SSE2__)...
>
> History:
> v2
> * fix l3fwm_em.c to refer RTE_MACHINE_CPUFLAG_NEON instead of __ARM_NEON
>
> v3
> * divided into 4 patches as there are more independent problems
> * compiles well for armv7
> * (probably) fixes RTE_MACHINE_CPUFLAG_NEON detection on arm64
Applied with discussed changes.
@@ -250,7 +250,7 @@ em_mask_key(void *key, xmm_t mask)
return _mm_and_si128(data, mask);
}
-#elif defined(__ARM_NEON)
+#elif defined(RTE_MACHINE_CPUFLAG_NEON)
static inline xmm_t
em_mask_key(void *key, xmm_t mask)
{
@@ -42,7 +42,7 @@ extern "C" {
#include "generic/rte_memcpy.h"
-#ifdef __ARM_NEON_FP
+#ifdef RTE_MACHINE_CPUFLAG_NEON
/* ARM NEON Intrinsics are used to copy data */
#include <arm_neon.h>
@@ -325,7 +325,7 @@ rte_memcpy_func(void *dst, const void *src, size_t n)
return memcpy(dst, src, n);
}
-#endif /* __ARM_NEON_FP */
+#endif /* RTE_MACHINE_CPUFLAG_NEON */
#ifdef __cplusplus
}
@@ -62,6 +62,6 @@ ifdef CONFIG_RTE_ARCH_ARM_TUNE
MACHINE_CFLAGS += -mtune=$(CONFIG_RTE_ARCH_ARM_TUNE)
endif
-ifeq ($(CONFIG_RTE_ARCH_ARM_NEON),y)
+ifdef $(RTE_MACHINE_CPUFLAG_NEON)
MACHINE_CFLAGS += -mfpu=neon
endif
@@ -111,9 +111,11 @@ CPUFLAGS += VSX
endif
# ARM flags
+ifeq ($(CONFIG_RTE_ARCH_ARM_NEON),y)
ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_NEON_FP),)
CPUFLAGS += NEON
endif
+endif
ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRC32),)
CPUFLAGS += CRC32