[dpdk-dev,v2] arm: detect NEON by RTE_MACHINE_CPUFLAG_NEON flag only

Message ID 1458379590-18618-1-git-send-email-viktorin@rehivetech.com (mailing list archive)
State Changes Requested, archived
Headers

Commit Message

Jan Viktorin March 19, 2016, 9:26 a.m. UTC
  The RTE_MACHINE_CPUFLAG_NEON was only a result of the gcc testing. However,
the target CPU may not support NEON or the user can disable to use it (as it
does not always improve the performance).

The RTE_MACHINE_CPUFLAG_NEON detection is now based on both, the __ARM_NEON_FP
feature from gcc and CONFIG_RTE_ARCH_ARM_NEON from the .config. The memcpy
implemention is driven by RTE_MACHINE_CPUFLAG_NEON, so the reason to disable
NEON is hidden for the actual code.

Signed-off-by: Jan Viktorin <viktorin@rehivetech.com>
---
v2: fix l3fwm_em.c to refer RTE_MACHINE_CPUFLAG_NEON instead of __ARM_NEON
---
 examples/l3fwd/l3fwd_em.c                              | 2 +-
 lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h | 4 ++--
 mk/machine/armv7a/rte.vars.mk                          | 2 +-
 mk/rte.cpuflags.mk                                     | 2 ++
 4 files changed, 6 insertions(+), 4 deletions(-)
  

Comments

Jan Viktorin March 19, 2016, 11:05 a.m. UTC | #1
On Sat, 19 Mar 2016 10:26:30 +0100
Jan Viktorin <viktorin@rehivetech.com> wrote:

> The RTE_MACHINE_CPUFLAG_NEON was only a result of the gcc testing. However,
> the target CPU may not support NEON or the user can disable to use it (as it
> does not always improve the performance).
> 
> The RTE_MACHINE_CPUFLAG_NEON detection is now based on both, the __ARM_NEON_FP
> feature from gcc and CONFIG_RTE_ARCH_ARM_NEON from the .config. The memcpy
> implemention is driven by RTE_MACHINE_CPUFLAG_NEON, so the reason to disable
> NEON is hidden for the actual code.

Unfortunately, I've overlooked a mistake. I have to remake the patch a
bit, sorry. I am a bit confused about the __ARM_NEON and __ARM_NEON_FP
settings.

The arm_neon.h is available only when the __ARM_NEON is present. But...

$ arm-buildroot-linux-gnueabi-gcc -dM -E - < /dev/null  | grep "_FP\|_NEON"
#define __ARM_FP 12
#define __ARM_NEON_FP 4
#define __VFP_FP__ 1

Without -mfpu=neon we don't have arm_neon.h. I consider this strange as
we are not interested in the FPU features but in the SIMD features...

$ arm-buildroot-linux-gnueabi-gcc -mfpu=neon -dM -E - < /dev/null  | grep "_FP\|_NEON"
#define __ARM_FP 12
#define __ARM_NEON_FP 4
#define __ARM_NEON__ 1
#define __VFP_FP__ 1
#define __ARM_NEON 1

$ arm-buildroot-linux-gnueabi-gcc -mfpu=neon-vfpv4 -dM -E - < /dev/null  | grep "_FP\|_NEON"
#define __ARM_FP 14
#define __ARM_NEON_FP 6
#define __FP_FAST_FMAF 1
#define __FP_FAST_FMAL 1
#define __ARM_NEON__ 1
#define __VFP_FP__ 1
#define __ARM_NEON 1
#define __FP_FAST_FMA 1

ARM64 is OK here...

$ aarch64-buildroot-linux-gnu-gcc -dM -E - < /dev/null | grep "NEON\|FP"
#define __FP_FAST_FMAF 1
#define __ARM_NEON 1
#define __FP_FAST_FMA 1

So...

> 
> Signed-off-by: Jan Viktorin <viktorin@rehivetech.com>
> ---
> v2: fix l3fwm_em.c to refer RTE_MACHINE_CPUFLAG_NEON instead of __ARM_NEON
> ---
>  examples/l3fwd/l3fwd_em.c                              | 2 +-
>  lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h | 4 ++--
>  mk/machine/armv7a/rte.vars.mk                          | 2 +-
>  mk/rte.cpuflags.mk                                     | 2 ++
>  4 files changed, 6 insertions(+), 4 deletions(-)
> 
[...]
>  #ifdef __cplusplus
>  }
> diff --git a/mk/machine/armv7a/rte.vars.mk b/mk/machine/armv7a/rte.vars.mk
> index 48d3979..7a167c1 100644
> --- a/mk/machine/armv7a/rte.vars.mk
> +++ b/mk/machine/armv7a/rte.vars.mk
> @@ -62,6 +62,6 @@ ifdef CONFIG_RTE_ARCH_ARM_TUNE
>  MACHINE_CFLAGS += -mtune=$(CONFIG_RTE_ARCH_ARM_TUNE)
>  endif
>  
> -ifeq ($(CONFIG_RTE_ARCH_ARM_NEON),y)
> +ifdef $(RTE_MACHINE_CPUFLAG_NEON)
>  MACHINE_CFLAGS += -mfpu=neon
>  endif

RTE_MACHINE_CPUFLAG_NEON is not *yet* set here (cpuflags are detected later)...
So the -mfpu=neon is never configured and the build fails. The
MACHINE_CFLAGS should rather depend on the CONFIG_RTE_ARCH_ARM_NEON
telling the build-system "we want NEON".

> diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
> index 19a3e7e..1947511 100644
> --- a/mk/rte.cpuflags.mk
> +++ b/mk/rte.cpuflags.mk
> @@ -111,9 +111,11 @@ CPUFLAGS += VSX
>  endif
>  
>  # ARM flags
> +ifeq ($(CONFIG_RTE_ARCH_ARM_NEON),y)
>  ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_NEON_FP),)

Here, we should check __ARM_NEON (to be ARM32/64 compatible) but we
cannot see __ARM_NEON without the -mfpu=neon flag.

Jerin, does the current DPDK detect NEON feature on ARM64? I'd say, it
cannot.

So, we should probably check both __ARM_NEON and __ARM_NEON_FP here.

Another point, related to the original discussion:

http://dpdk.org/ml/archives/dev/2016-March/thread.html#35972

we should probably have a config option to enable memcpy optimizations
separated from the NEON support. The NEON support can then be detected
only by the __ARM_NEON flag. The ARMv7 would have the -mfpu=neon always
set. If somebody likes to customize this, she would do it by hand. The
result is, we correctly detect NEON during build time from the GCC.

>  CPUFLAGS += NEON
>  endif
> +endif
>  
>  ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRC32),)
>  CPUFLAGS += CRC32
  
Jan Viktorin March 19, 2016, 7:58 p.m. UTC | #2
Hello,

finally, I've broken the original patch into 4 pieces as it solves more issues
and not just a single one.

* As Thomas have already mentioned, the CONFIG_RTE_ARCH_ARM_NEON is confusing. 
  So, I've decided to remove it entirely and provide another option for a more
  specific purpose: CONFIG_RTE_ARCH_ARM_NEON_MEMCPY.

* The RTE_MACHINE_CPUFLAG_NEON detection is now based on __ARM_NEON as only
  this compiler definition gives us the arm_neon.h and is compatible with
  arm64. In DPDK, the RTE_MACHINE_CPUFLAG_NEON should be prefered over the
  __ARM_NEON. I'd recommend the same for x86 code (__SSE2__)... 

History:
v2
* fix l3fwm_em.c to refer RTE_MACHINE_CPUFLAG_NEON instead of __ARM_NEON

v3
* divided into 4 patches as there are more independent problems
* compiles well for armv7
* (probably) fixes RTE_MACHINE_CPUFLAG_NEON detection on arm64

Jan Viktorin (4):
  arm: remove CONFIG_RTE_ARCH_ARM_NEON
  arm: detect NEON cpu feature by checking __ARM_NEON
  arm: detect NEON by checking RTE_MACHINE_CPUFLAG_NEON
  eal/arm: introduce CONFIG_RTE_ARCH_ARM_NEON_MEMCPY

 config/defconfig_arm-armv7a-linuxapp-gcc               | 2 +-
 config/defconfig_arm64-armv8a-linuxapp-gcc             | 1 -
 examples/l3fwd/l3fwd_em.c                              | 2 +-
 lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h | 8 ++++++--
 mk/machine/armv7a/rte.vars.mk                          | 2 --
 mk/rte.cpuflags.mk                                     | 2 +-
 6 files changed, 9 insertions(+), 8 deletions(-)
  
Thomas Monjalon March 24, 2016, 4:47 p.m. UTC | #3
2016-03-19 20:58, Jan Viktorin:
> Hello,
> 
> finally, I've broken the original patch into 4 pieces as it solves more issues
> and not just a single one.
> 
> * As Thomas have already mentioned, the CONFIG_RTE_ARCH_ARM_NEON is confusing. 
>   So, I've decided to remove it entirely and provide another option for a more
>   specific purpose: CONFIG_RTE_ARCH_ARM_NEON_MEMCPY.
> 
> * The RTE_MACHINE_CPUFLAG_NEON detection is now based on __ARM_NEON as only
>   this compiler definition gives us the arm_neon.h and is compatible with
>   arm64. In DPDK, the RTE_MACHINE_CPUFLAG_NEON should be prefered over the
>   __ARM_NEON. I'd recommend the same for x86 code (__SSE2__)... 
> 
> History:
> v2
> * fix l3fwm_em.c to refer RTE_MACHINE_CPUFLAG_NEON instead of __ARM_NEON
> 
> v3
> * divided into 4 patches as there are more independent problems
> * compiles well for armv7
> * (probably) fixes RTE_MACHINE_CPUFLAG_NEON detection on arm64

Applied with discussed changes.
  

Patch

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index 0adf8f4..4983eed 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -250,7 +250,7 @@  em_mask_key(void *key, xmm_t mask)
 
 	return _mm_and_si128(data, mask);
 }
-#elif defined(__ARM_NEON)
+#elif defined(RTE_MACHINE_CPUFLAG_NEON)
 static inline xmm_t
 em_mask_key(void *key, xmm_t mask)
 {
diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h b/lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h
index df47c0d..ad8bc65 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy_32.h
@@ -42,7 +42,7 @@  extern "C" {
 
 #include "generic/rte_memcpy.h"
 
-#ifdef __ARM_NEON_FP
+#ifdef RTE_MACHINE_CPUFLAG_NEON
 
 /* ARM NEON Intrinsics are used to copy data */
 #include <arm_neon.h>
@@ -325,7 +325,7 @@  rte_memcpy_func(void *dst, const void *src, size_t n)
 	return memcpy(dst, src, n);
 }
 
-#endif /* __ARM_NEON_FP */
+#endif /* RTE_MACHINE_CPUFLAG_NEON */
 
 #ifdef __cplusplus
 }
diff --git a/mk/machine/armv7a/rte.vars.mk b/mk/machine/armv7a/rte.vars.mk
index 48d3979..7a167c1 100644
--- a/mk/machine/armv7a/rte.vars.mk
+++ b/mk/machine/armv7a/rte.vars.mk
@@ -62,6 +62,6 @@  ifdef CONFIG_RTE_ARCH_ARM_TUNE
 MACHINE_CFLAGS += -mtune=$(CONFIG_RTE_ARCH_ARM_TUNE)
 endif
 
-ifeq ($(CONFIG_RTE_ARCH_ARM_NEON),y)
+ifdef $(RTE_MACHINE_CPUFLAG_NEON)
 MACHINE_CFLAGS += -mfpu=neon
 endif
diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
index 19a3e7e..1947511 100644
--- a/mk/rte.cpuflags.mk
+++ b/mk/rte.cpuflags.mk
@@ -111,9 +111,11 @@  CPUFLAGS += VSX
 endif
 
 # ARM flags
+ifeq ($(CONFIG_RTE_ARCH_ARM_NEON),y)
 ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_NEON_FP),)
 CPUFLAGS += NEON
 endif
+endif
 
 ifneq ($(filter $(AUTO_CPUFLAGS),__ARM_FEATURE_CRC32),)
 CPUFLAGS += CRC32