Alfred will handle the middle end and back end, coordinating with zhangjing on the front end.
After discussing with William, I put together a MapleIR example:
int32x4_t func1() {
int32x4_t in;
int32_t i = 5;
in = vdupq_n_s32(i);
return in;
}
Would something like this suffice for us to do first?
func &func1 () v4i32 {
var %in v4i32
var %i i32
dassign %i 0 (constval i32 5)
intrinsiccallassigned vdupq_n_s32 (dread i32 %i) { regassign v4i32 %in}
return (regread v4i32 %in)
}
Also, besides the following types used in arm_neon.h:
int32x4_t
uint64_t
uint64x1_t
uint64x2_t
uint8_t
uint8x16_t
there should also be:
uint32x2_t - vget_lane_u32()
uint16x8_t - vpaddlq_u16()
uint32x4_t - vpaddlq_u32()
Finally, what are uint64_t and uint64x1_t, respectively? Which functions use uint64x1_t? Thanks!
Sorry, I missed a few:
typedef __attribute__((neon_vector_type(2))) uint32_t uint32x2_t;
typedef __attribute__((neon_vector_type(2))) int32_t int32x2_t;
typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t;
typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
typedef __attribute__((neon_vector_type(1))) uint64_t uint64x1_t;
typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t; // vpaddlq_u16
typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t; // vpaddlq_u16
uint64x1_t is used in the following functions:
uint64x1_t vget_high_u64(uint64x2_t __p0)
uint64x1_t vget_low_u64(uint64x2_t __p0)
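To illustrate the difference (a minimal sketch; vadd_u64 and vget_lane_u64 below are standard Neon intrinsics used here only for illustration, they are not part of the requested set): uint64_t is an ordinary 64-bit scalar integer, while uint64x1_t is a one-lane Neon vector type that lives in a D register.
#include <arm_neon.h>

/* uint64_t: plain 64-bit scalar; uint64x1_t: one-lane vector held in a D register. */
uint64_t sum_halves(uint64x2_t n) {
    uint64x1_t lo = vget_low_u64(n);   /* lane 0 of the 128-bit vector, as a 1-lane vector */
    uint64x1_t hi = vget_high_u64(n);  /* lane 1, as a 1-lane vector */
    uint64x1_t o  = vadd_u64(lo, hi);  /* per-lane add on the 1-lane vectors */
    return vget_lane_u64(o, 0);        /* extract the scalar uint64_t */
}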
A question: for
typedef __attribute__((neon_vector_type(2))) uint32_t uint32x2_t;
typedef __attribute__((neon_vector_type(2))) int32_t int32x2_t;
typedef __attribute__((neon_vector_type(1))) uint64_t uint64x1_t;
are these 128 bits, with the upper 64 bits left out?
Currently MapleIR specifies 128-bit vector types only, no 64-bit ones yet, so these three types are not supported in MapleIR at present.
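For what it's worth, at the C level these are genuine 8-byte types that map to D registers (architecturally the low 64 bits of the corresponding V register); a quick check, purely for illustration:
#include <arm_neon.h>
#include <stdio.h>

int main(void) {
    /* The non-q types are 64-bit vectors, not 128-bit ones with an unused upper half. */
    printf("%zu %zu %zu\n",
           sizeof(uint32x2_t),   /* 8  */
           sizeof(uint64x1_t),   /* 8  */
           sizeof(uint32x4_t));  /* 16 */
    return 0;
}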
I have a few questions about the basic functions proposed above:
int32x4_t vdupq_n_s32 (int32_t value)
uint8x16_t vdupq_n_u8 (uint8_t value)
uint8x16_t vextq_u8 (uint8x16_t a, uint8x16_t b, const int n)
uint32x2_t vget_high_u32 (uint32x4_t a)
uint32_t vget_lane_u32 (uint32x2_t v, const int lane)
uint64x1_t vget_low_u64 (uint64x2_t a)
uint32_t vgetq_lane_u32 (uint32x4_t v, const int lane)
uint8x16_t vld1q_u8 (uint8_t const * ptr)
uint32x4_t vpaddlq_u16 (uint16x8_t a)
uint64x2_t vpaddlq_u32 (uint32x4_t a)
uint64x2_t vreinterpretq_u64_u8 (uint8x16_t a)
uint8x16_t vrev32q_u8 (uint8x16_t vec)
uint32x4_t vsetq_lane_u32 (uint32_t a, uint32x4_t v, const int lane)
void vst1q_s32 (int32_t * ptr, int32x4_t val)
void vst1q_u8 (uint8_t * ptr, uint8x16_t val)
int32x4_t is only used in vdupq_n_s32() and vst1q_s32(); is it only for creating a vector and storing it?
uint16x8_t appears only as the argument of vpaddlq_u16(); how is it created?
We first need to add Q-register support and its calling-convention support in mplcg; only then can we add the Neon intrinsics.
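As a minimal sketch of why the calling convention has to come first (standard AAPCS64 behavior, not Maple-specific code; vaddq_s32 is used only for illustration): any function that takes or returns a 128-bit short vector already needs the Q/V registers for argument passing and return.
#include <arm_neon.h>

/* Under AAPCS64, a arrives in v0, b in v1, and the result is returned in v0;
   mplcg must model these SIMD&FP registers before any intrinsic can be lowered. */
int32x4_t add4(int32x4_t a, int32x4_t b) {
    return vaddq_s32(a, b);
}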
Below is the minimal-set arm_neon.h extracted from the gcc version; after substituting it, the business code compiles normally.
#ifndef _AARCH64_NEON_H_
#define _AARCH64_NEON_H_
#pragma GCC push_options
#pragma GCC target ("+nothing+simd")
#include <stdint.h>
#define __AARCH64_UINT64_C(__C) ((uint64_t) __C)
#define __AARCH64_INT64_C(__C) ((int64_t) __C)
typedef __Int32x2_t int32x2_t;
typedef __Uint32x2_t uint32x2_t;
typedef __Uint64x1_t uint64x1_t;
typedef __Int8x16_t int8x16_t;
typedef __Int32x4_t int32x4_t;
typedef __Uint32x4_t uint32x4_t;
typedef __Uint8x16_t uint8x16_t;
typedef __Uint16x8_t uint16x8_t;
typedef __Uint64x2_t uint64x2_t;
/* For big-endian, GCC's vector indices are the opposite way around
to the architectural lane indices used by Neon intrinsics. */
#ifdef __AARCH64EB__
#define __aarch64_lane(__vec, __idx) (__AARCH64_NUM_LANES (__vec) - 1 - __idx)
#else
#define __aarch64_lane(__vec, __idx) __idx
#endif
/* vget_lane internal macro. */
#define __aarch64_vget_lane_any(__vec, __index) \
__extension__ \
({ \
__AARCH64_LANE_CHECK (__vec, __index); \
__vec[__aarch64_lane (__vec, __index)]; \
})
/* vset_lane and vld1_lane internal macro. */
#define __aarch64_vset_lane_any(__elem, __vec, __index) \
__extension__ \
({ \
__AARCH64_LANE_CHECK (__vec, __index); \
__vec[__aarch64_lane (__vec, __index)] = __elem; \
__vec; \
})
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdupq_n_s32 (int32_t __a)
{
return (int32x4_t) {__a, __a, __a, __a};
}
__extension__ extern __inline uint8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vdupq_n_u8 (uint32_t __a)
{
return (uint8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
__a, __a, __a, __a, __a, __a, __a, __a};
}
__extension__ extern __inline uint8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vextq_u8 (uint8x16_t __a, uint8x16_t __b, __const int __c)
{
__AARCH64_LANE_CHECK (__a, __c);
#ifdef __AARCH64EB__
return __builtin_shuffle (__b, __a, (uint8x16_t)
{16-__c, 17-__c, 18-__c, 19-__c, 20-__c, 21-__c, 22-__c, 23-__c,
24-__c, 25-__c, 26-__c, 27-__c, 28-__c, 29-__c, 30-__c, 31-__c});
#else
return __builtin_shuffle (__a, __b, (uint8x16_t)
{__c, __c+1, __c+2, __c+3, __c+4, __c+5, __c+6, __c+7,
__c+8, __c+9, __c+10, __c+11, __c+12, __c+13, __c+14, __c+15});
#endif
}
__extension__ extern __inline uint64x1_t // dependent
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcreate_u64 (uint64_t __a)
{
return (uint64x1_t) {__a};
}
__extension__ extern __inline uint64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_high_u64 (uint64x2_t __a)
{
return vcreate_u64 (vgetq_lane_u64 (__a, 1));
}
__extension__ extern __inline uint32_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_lane_u32 (uint32x2_t __a, const int __b)
{
return __aarch64_vget_lane_any (__a, __b);
}
__extension__ extern __inline uint64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vget_low_u64 (uint64x2_t __a)
{
return vcreate_u64 (vgetq_lane_u64 (__a, 0));
}
__extension__ extern __inline uint32_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vgetq_lane_u32 (uint32x4_t __a, const int __b)
{
return __aarch64_vget_lane_any (__a, __b);
}
__extension__ extern __inline uint8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vld1q_u8 (const uint8_t *a)
{
return (uint8x16_t)
__builtin_aarch64_ld1v16qi ((const __builtin_aarch64_simd_qi *) a);
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vpaddlq_u16 (uint16x8_t a)
{
uint32x4_t result;
__asm__ ("uaddlp %0.4s,%1.8h"
: "=w"(result)
: "w"(a)
: /* No clobbers */);
return result;
}
__extension__ extern __inline uint64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vpaddlq_u32 (uint32x4_t a)
{
uint64x2_t result;
__asm__ ("uaddlp %0.2d,%1.4s"
: "=w"(result)
: "w"(a)
: /* No clobbers */);
return result;
}
__extension__ extern __inline uint64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vreinterpretq_u64_u8 (uint8x16_t __a)
{
return (uint64x2_t) __a;
}
__extension__ extern __inline uint8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vrev32q_u8 (uint8x16_t a)
{
return __builtin_shuffle (a,
(uint8x16_t) { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 });
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vsetq_lane_u32 (uint32_t __elem, uint32x4_t __vec, const int __index)
{
return __aarch64_vset_lane_any (__elem, __vec, __index);
}
__extension__ extern __inline void
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_s32 (int32_t *a, int32x4_t b)
{
__builtin_aarch64_st1v4si ((__builtin_aarch64_simd_si *) a, b);
}
__extension__ extern __inline void
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst1q_u8 (uint8_t *a, uint8x16_t b)
{
__builtin_aarch64_st1v16qi ((__builtin_aarch64_simd_qi *) a,
(int8x16_t) b);
}
#endif
The minimal-set arm_neon.h for clang; replacement path: ${MAPLE_ROOT}/zeiss/prebuilt/sdk/llvm/clang+llvm-10.0.0-x86_64-linux-gnu-ubuntu-18.04/lib/clang/10.0.0/include/arm_neon.h
#ifndef __ARM_NEON_H
#define __ARM_NEON_H
#if !defined(__ARM_NEON)
#error "NEON support not enabled"
#endif
#include <stdint.h>
typedef __attribute__((neon_vector_type(2))) uint32_t uint32x2_t;
typedef __attribute__((neon_vector_type(2))) int32_t int32x2_t;
typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t;
typedef __attribute__((neon_vector_type(4))) uint32_t uint32x4_t;
typedef __attribute__((neon_vector_type(1))) uint64_t uint64x1_t;
typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t;
typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t; // vpaddlq_u16
typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t; // vpaddlq_u16
#define __ai static __inline__ __attribute__((__always_inline__, __nodebug__))
#ifdef __LITTLE_ENDIAN__
__ai int32x4_t vdupq_n_s32(int32_t __p0) {
int32x4_t __ret;
__ret = (int32x4_t) {__p0, __p0, __p0, __p0};
return __ret;
}
#else
__ai int32x4_t vdupq_n_s32(int32_t __p0) {
int32x4_t __ret;
__ret = (int32x4_t) {__p0, __p0, __p0, __p0};
__ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
return __ret;
}
#endif
#ifdef __LITTLE_ENDIAN__
__ai uint8x16_t vdupq_n_u8(uint8_t __p0) {
uint8x16_t __ret;
__ret = (uint8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
return __ret;
}
#else
__ai uint8x16_t vdupq_n_u8(uint8_t __p0) {
uint8x16_t __ret;
__ret = (uint8x16_t) {__p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0, __p0};
__ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
return __ret;
}
#endif
#ifdef __LITTLE_ENDIAN__
#define vextq_u8(__p0, __p1, __p2) __extension__ ({ \
uint8x16_t __s0 = __p0; \
uint8x16_t __s1 = __p1; \
uint8x16_t __ret; \
__ret = (uint8x16_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 48); \
__ret; \
})
#else
#define vextq_u8(__p0, __p1, __p2) __extension__ ({ \
uint8x16_t __s0 = __p0; \
uint8x16_t __s1 = __p1; \
uint8x16_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
uint8x16_t __ret; \
__ret = (uint8x16_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 48); \
__ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
__ret; \
})
#endif
#ifdef __LITTLE_ENDIAN__
__ai uint64x1_t vget_high_u64(uint64x2_t __p0) {
uint64x1_t __ret;
__ret = __builtin_shufflevector(__p0, __p0, 1);
return __ret;
}
#else
__ai uint64x1_t vget_high_u64(uint64x2_t __p0) {
uint64x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
uint64x1_t __ret;
__ret = __builtin_shufflevector(__rev0, __rev0, 1);
return __ret;
}
#endif
#ifdef __LITTLE_ENDIAN__
#define vget_lane_u32(__p0, __p1) __extension__ ({ \
uint32x2_t __s0 = __p0; \
uint32_t __ret; \
__ret = (uint32_t) __builtin_neon_vget_lane_i32((int32x2_t)__s0, __p1); \
__ret; \
})
#else
#define vget_lane_u32(__p0, __p1) __extension__ ({ \
uint32x2_t __s0 = __p0; \
uint32x2_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \
uint32_t __ret; \
__ret = (uint32_t) __builtin_neon_vget_lane_i32((int32x2_t)__rev0, __p1); \
__ret; \
})
#define __noswap_vget_lane_u32(__p0, __p1) __extension__ ({ \
uint32x2_t __s0 = __p0; \
uint32_t __ret; \
__ret = (uint32_t) __builtin_neon_vget_lane_i32((int32x2_t)__s0, __p1); \
__ret; \
})
#endif
#ifdef __LITTLE_ENDIAN__
__ai uint64x1_t vget_low_u64(uint64x2_t __p0) {
uint64x1_t __ret;
__ret = __builtin_shufflevector(__p0, __p0, 0);
return __ret;
}
#else
__ai uint64x1_t vget_low_u64(uint64x2_t __p0) {
uint64x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0);
uint64x1_t __ret;
__ret = __builtin_shufflevector(__rev0, __rev0, 0);
return __ret;
}
#endif
#ifdef __LITTLE_ENDIAN__
#define vgetq_lane_u32(__p0, __p1) __extension__ ({ \
uint32x4_t __s0 = __p0; \
uint32_t __ret; \
__ret = (uint32_t) __builtin_neon_vgetq_lane_i32((int32x4_t)__s0, __p1); \
__ret; \
})
#else
#define vgetq_lane_u32(__p0, __p1) __extension__ ({ \
uint32x4_t __s0 = __p0; \
uint32x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \
uint32_t __ret; \
__ret = (uint32_t) __builtin_neon_vgetq_lane_i32((int32x4_t)__rev0, __p1); \
__ret; \
})
#define __noswap_vgetq_lane_u32(__p0, __p1) __extension__ ({ \
uint32x4_t __s0 = __p0; \
uint32_t __ret; \
__ret = (uint32_t) __builtin_neon_vgetq_lane_i32((int32x4_t)__s0, __p1); \
__ret; \
})
#endif
#ifdef __LITTLE_ENDIAN__
#define vld1q_u8(__p0) __extension__ ({ \
uint8x16_t __ret; \
__ret = (uint8x16_t) __builtin_neon_vld1q_v(__p0, 48); \
__ret; \
})
#else
#define vld1q_u8(__p0) __extension__ ({ \
uint8x16_t __ret; \
__ret = (uint8x16_t) __builtin_neon_vld1q_v(__p0, 48); \
__ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
__ret; \
})
#endif
#ifdef __LITTLE_ENDIAN__
__ai uint32x4_t vpaddlq_u16(uint16x8_t __p0) {
uint32x4_t __ret;
__ret = (uint32x4_t) __builtin_neon_vpaddlq_v((int8x16_t)__p0, 50);
return __ret;
}
#else
__ai uint32x4_t vpaddlq_u16(uint16x8_t __p0) {
uint16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0);
uint32x4_t __ret;
__ret = (uint32x4_t) __builtin_neon_vpaddlq_v((int8x16_t)__rev0, 50);
__ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0);
return __ret;
}
#endif
#ifdef __LITTLE_ENDIAN__
__ai uint64x2_t vpaddlq_u32(uint32x4_t __p0) {
uint64x2_t __ret;
__ret = (uint64x2_t) __builtin_neon_vpaddlq_v((int8x16_t)__p0, 51);
return __ret;
}
#else
__ai uint64x2_t vpaddlq_u32(uint32x4_t __p0) {
uint32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0);
uint64x2_t __ret;
__ret = (uint64x2_t) __builtin_neon_vpaddlq_v((int8x16_t)__rev0, 51);
__ret = __builtin_shufflevector(__ret, __ret, 1, 0);
return __ret;
}
#endif
__ai uint64x2_t vreinterpretq_u64_u8(uint8x16_t __p0) {
uint64x2_t __ret;
__ret = (uint64x2_t)(__p0);
return __ret;
}
#ifdef __LITTLE_ENDIAN__
__ai uint8x16_t vrev32q_u8(uint8x16_t __p0) {
uint8x16_t __ret;
__ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
return __ret;
}
#else
__ai uint8x16_t vrev32q_u8(uint8x16_t __p0) {
uint8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
uint8x16_t __ret;
__ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
__ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
return __ret;
}
#endif
#ifdef __LITTLE_ENDIAN__
#define vsetq_lane_u32(__p0, __p1, __p2) __extension__ ({ \
uint32_t __s0 = __p0; \
uint32x4_t __s1 = __p1; \
uint32x4_t __ret; \
__ret = (uint32x4_t) __builtin_neon_vsetq_lane_i32(__s0, (int32x4_t)__s1, __p2); \
__ret; \
})
#else
#define vsetq_lane_u32(__p0, __p1, __p2) __extension__ ({ \
uint32_t __s0 = __p0; \
uint32x4_t __s1 = __p1; \
uint32x4_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
uint32x4_t __ret; \
__ret = (uint32x4_t) __builtin_neon_vsetq_lane_i32(__s0, (int32x4_t)__rev1, __p2); \
__ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \
__ret; \
})
#define __noswap_vsetq_lane_u32(__p0, __p1, __p2) __extension__ ({ \
uint32_t __s0 = __p0; \
uint32x4_t __s1 = __p1; \
uint32x4_t __ret; \
__ret = (uint32x4_t) __builtin_neon_vsetq_lane_i32(__s0, (int32x4_t)__s1, __p2); \
__ret; \
})
#endif
#ifdef __LITTLE_ENDIAN__
#define vst1q_s32(__p0, __p1) __extension__ ({ \
int32x4_t __s1 = __p1; \
__builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 34); \
})
#else
#define vst1q_s32(__p0, __p1) __extension__ ({ \
int32x4_t __s1 = __p1; \
int32x4_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \
__builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 34); \
})
#endif
#ifdef __LITTLE_ENDIAN__
#define vst1q_u8(__p0, __p1) __extension__ ({ \
uint8x16_t __s1 = __p1; \
__builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 48); \
})
#else
#define vst1q_u8(__p0, __p1) __extension__ ({ \
uint8x16_t __s1 = __p1; \
uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
__builtin_neon_vst1q_v(__p0, (int8x16_t)__rev1, 48); \
})
#endif
#endif
May I ask: will you use the gcc or the clang minimal-set arm_neon.h?
If the clang one, does that mean mplcg should support __builtin_neon_vst1q_v(), __builtin_shufflevector(), and so on?
That is a bit different from what we originally had in mind, but close enough.
Alfred, let's stay consistent with gcc, but as a first step support the minimal set that zhangjing proposed.
Let me propose the final list of Maple IR 128-bit vector PrimTypes:
v2i64
v4i32
v8i16
v16i8
v2u64
v4u32
v8u16
v16u8
v2f64
v4f32
We'll also add the following 64-bit vector PrimTypes:
v2i32
v4i16
v8i8
v2u32
v4u16
v8u8
v2f32
This cannot be done. A clang header cannot be used by the gcc compiler and vice versa; each header contains builtins specific to its own compiler. As long as Maple adheres to the same API as gcc/clang, we can have our own arm_neon.h, which we are putting together now and will show shortly. This arm_neon.h will initially contain only the Neon intrinsics listed in the previous comment above. Each Neon intrinsic function may contain a Maple builtin call that is recognized by the Maple frontend and implemented by the Maple code generator.
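As a rough sketch of that shape (the builtin name __builtin_mpl_vdupq_n_s32 below is purely hypothetical, an assumption rather than an actual Maple API), each wrapper in Maple's own arm_neon.h would simply forward to a builtin that the Maple frontend recognizes:
/* Hypothetical sketch only; the builtin name is an assumption, not Maple's real API. */
typedef __attribute__((neon_vector_type(4))) int32_t int32x4_t;

static inline __attribute__((__always_inline__, __nodebug__))
int32x4_t vdupq_n_s32(int32_t value) {
    /* The frontend would turn this into a MapleIR intrinsic call, and mplcg would
       implement it (e.g. by duplicating the scalar into all four lanes). */
    return __builtin_mpl_vdupq_n_s32(value);
}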
The 15 intrinsics extracted earlier from arm_neon.h:
vdupq_n_s32
vdupq_n_u8
vextq_u8
vget_high_u64
vget_lane_u32
vget_low_u64
vgetq_lane_u32
vld1q_u8
vpaddlq_u16
vpaddlq_u32
vreinterpretq_u64_u8
vrev32q_u8
vsetq_lane_u32
vst1q_s32
vst1q_u8
uint32_t i;
int size = (sizeof(StructA) + (B - 1)) & ~(B - 1);
int32x4_t a = vdupq_n_s32(0);
int32x4_t* dst_ptr = (int32x4_t*)Func(mbuf); // Func return StructA *
for (i = 0; i < ((uint32_t)size >> sizeof(uint32_t)); i++) {
    vst1q_s32((int32_t *)(dst_ptr + i), a);
}
# In rte_cuckoo_hash.c, vshlq_u16 is called to produce a uint16x8_t
x = vshlq_u16(vandq_u16(vmat, vdupq_n_u16(0x8000)), shift);
*prim_hash_matches = (uint32_t)(vaddvq_u16(x));
# rte_vect.h uses vpaddlq_u16
static inline uint16_t
vaddvq_u16(uint16x8_t a)
{
    uint32x4_t m = vpaddlq_u16(a);
    uint64x2_t n = vpaddlq_u32(m);
    uint64x1_t o = vget_low_u64(n) + vget_high_u64(n);
    return vget_lane_u32((uint32x2_t)o, 0);
}
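For reference, a plain-C sketch of what that emulation computes (not DPDK code, just the scalar equivalent of the two pairwise widening adds followed by the final reduction):
#include <stdint.h>

static inline uint16_t scalar_addv_u16(const uint16_t a[8]) {
    uint64_t sum = 0;
    for (int i = 0; i < 8; i++) {
        sum += a[i];            /* 8 * 0xFFFF fits easily, no overflow */
    }
    return (uint16_t)sum;       /* same truncation as the final vget_lane_u32 cast */
}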
Actually, apart from the logic in your question 2, all of the other code that uses Neon intrinsics is in open-source DPDK. I have put together a list of the specific files; you can check them directly, in the sections guarded by defined(__ARM_NEON). I will also check again later for anything that was missed.
https://github.com/DPDK/dpdk/blob/main/lib/eal/arm/include/rte_memcpy_32.h
https://github.com/DPDK/dpdk/blob/main/lib/eal/arm/include/rte_vect.h
https://github.com/DPDK/dpdk/blob/main/lib/hash/rte_thash.h
https://github.com/DPDK/dpdk/blob/main/app/test-pmd/macswap_neon.h
https://github.com/DPDK/dpdk/blob/main/lib/hash/rte_cuckoo_hash.c
https://github.com/DPDK/dpdk/blob/main/lib/sched/rte_sched.c
https://github.com/DPDK/dpdk/blob/main/examples/l3fwd/l3fwd_em.c
https://github.com/DPDK/dpdk/blob/main/examples/l3fwd/l3fwd_neon.h
https://github.com/DPDK/dpdk/blob/main/examples/l3fwd/l3fwd_em_hlm_neon.h
The following should also be from the DPDK code; they were missed in the previous tally:
vceqq_u16
vld1q_u16
vshlq_u16
vandq_u16
vdupq_n_u16
vaddvq_u16
vqtbl1q_u8
vandq_s32
vld1q_s32
vld1q_u32
vextq_u16
vgetq_lane_u16
vsetq_lane_u16
A question about the initial smallest subset of Neon intrinsics that needs to be implemented:
vdupq_n_s32
vdupq_n_u8
vextq_u8
vget_high_u64
vget_lane_u32
vget_low_u64
vgetq_lane_u32
vld1q_u8
vpaddlq_u16
vpaddlq_u32
vreinterpretq_u64_u8
vrev32q_u8
vsetq_lane_u32
vst1q_s32
vst1q_u8
Also, a question about mplcg: the scheduler makes use of latency and some "feature attributes" such as kLtFLoad64, kLtLoad2, etc.
Current Neon back-end status and estimates: I have the original 15 Neon intrinsic functions working in mplcg and was planning to push them early next week after cleanup and testing. However, since Maple considers PTY_1u64 to be target dependent and not to be supported, we will need a workaround for it, which may add about a week to the push. After that, it will probably take another week to do the 13 additional intrinsics that were found. There is also the realization that gcc/clang actually support an extension that applies ordinary operators to vector operands, without using an intrinsic for the operation. I do not know whether that is needed for now, but the dpdk code shown above does use:
uint64x1_t o = vget_low_u64(n) + vget_high_u64(n);
That will be done after the 13 new intrinsics, and it will probably take a week as well.
First-phase Neon intrinsic support has been PRed. The following functions are supported (note: vget_high_u64 and vget_low_u64 are not valid; do not use them):
int32x4_t vdupq_n_s32 (int32_t value)
uint8x16_t vdupq_n_u8 (uint8_t value)
uint8x16_t vextq_u8 (uint8x16_t a, uint8x16_t b, const int n)
uint32_t vget_lane_u32 (uint32x2_t v, const int lane)
uint32_t vgetq_lane_u32 (uint32x4_t v, const int lane)
uint8x16_t vld1q_u8 (uint8_t const * ptr)
uint32x4_t vpaddlq_u16 (uint16x8_t a)
uint64x2_t vpaddlq_u32 (uint32x4_t a)
uint64x2_t vreinterpretq_u64_u8 (uint8x16_t a)
uint8x16_t vrev32q_u8 (uint8x16_t vec)
uint32x4_t vsetq_lane_u32 (uint32_t a, uint32x4_t v, const int lane)
void vst1q_s32 (int32_t * ptr, int32x4_t val)
void vst1q_u8 (uint8_t * ptr, uint8x16_t val)
According to the ARM Developer Neon Intrinsics Reference, the second argument of vshlq_u16 is a signed int16x8_t. But neither the original list nor the additional list requested last week provides any way to create an int16x8_t value. At a minimum, wouldn't at least a vld1q_s16() or a vcreate_s16() be needed?
I take it back, an int16x8_t can be initialized by
int16x8_t c = { 1, 1, 1, 1, 2, 2, 2, 2 };
without an intrinsic.
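So, a minimal sketch of how the rte_cuckoo_hash.c pattern shown earlier could be fed without any extra intrinsic (the shift counts and mask value here are made up for illustration; negative counts in vshlq_u16 shift right):
#include <arm_neon.h>

uint16x8_t compact_match_bits(uint16x8_t vmat) {
    const int16x8_t shift = { -15, -14, -13, -12, -11, -10, -9, -8 }; /* plain initializer, no intrinsic */
    uint16x8_t masked = vandq_u16(vmat, vdupq_n_u16(0x8000));         /* keep only the top bit per lane */
    return vshlq_u16(masked, shift);                                  /* negative counts = per-lane right shift */
}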
clang2mpl encountered some Neon intrinsic errors while compiling the new code.
It seems some types and functions are not supported.
// type
uint32x2x2_t
// function
vmlal_u32
vdup_n_u32
vshrq_n_u64
veorq_u64
vld1q_u32
veorq_u32
vmull_u32
vreinterpretq_u32_u64
vshlq_n_u64
vmlal_u32
These were not on the previous list that I had implemented. I will add support for these in the frontend and discuss with Alfred about the backend support.
There are many Neon intrinsics; we support only the ones requested, so this is a new additional list. Supporting new additional intrinsics is fine. But supporting the type uint32x2x2_t across two registers is a very big deal, particularly its calling convention. Could you show an example of how this type is used? Which intrinsic function accesses this type? Is it passed as an argument or returned as a function result?
uint32x4_t const key1 = vld1q_u32 (x + (i * 4));
uint32x4_t const data = veorq_u32 (vreinterpretq_u32_u64(data1), key1);
uint32x2x2_t const s = vzip_u32 (vget_low_u32(data), vget_high_u32(data));
uint64x2_t const hi = vmull_u32 (s.val[1], p);
Thanks! Is that the only kind of use? Are there function arguments of this data type? Are there functions with this data type as the return value? Are there data initializations of this data type?
Sorry. This vector array type cannot be used with data initialization, so please skip the last question.
I found an answer to my questions. Please ignore.
Support of
vmlal_u32
vdup_n_u32
vshrq_n_u64
veorq_u64
vld1q_u32
veorq_u32
vmull_u32
vreinterpretq_u32_u64
vshlq_n_u64
vmlal_u32
added and PRed.
+ /home/z00518955/OpenArkCompiler/output/aarch64-clang-release/bin/clang2mpl --ascii VgetLane.c -- --target=aarch64-linux-elf -Wno-return-type -U__SIZEOF_INT128__
+ /home/z00518955/OpenArkCompiler/output/aarch64-clang-release/bin/maple -O0 VgetLane.mpl
Starting:/home/z00518955/OpenArkCompiler/output/aarch64-clang-release/bin/maple -O0 VgetLane.mpl
Starting maplecomb
Starting parse input
Parse consumed 0s
Processing maplecomb
maplecomb consumed 0s
Starting mplcg
Processing mplcg
Mplcg consumed 0s
+ aarch64-linux-gnu-gcc -std=c89 -o VgetLane.out VgetLane.s
VgetLane.s: Assembler messages:
VgetLane.s:40: Error: operand mismatch -- `umov w0,v0.d[0]'
VgetLane.s:40: Info: did you mean this?
VgetLane.s:40: Info: umov w0, v0.h[0]
VgetLane.s:40: Info: other valid variant(s):
VgetLane.s:40: Info: umov w0, v0.s[0]
VgetLane.s:40: Info: umov x0, v0.d[0]
VgetLane.s:40: Info: umov w0, v0.b[0]
uint32x2x2_t instead of int32x2x2_t -- I don't understand; is int32x2x2_t not recognized?
------
It should be uint32x2x2_t in arm_neon.h, as the reply above mentioned: https://gitee.com/openarkcompiler/OpenArkCompiler/issues/I3ONI7#note_5468818_link
Thank you very much for your work!
Yes. As mentioned, vector array types are not verified yet. We will be completing it soon. Thanks!
Just want to make sure: you mean you need uint32x2x2_t to be used with vzip_u32(), right?
Our current BE cannot handle vector arrays; the existing infrastructure does not support them. I'm working on it.
Just want to make sure: you mean you need uint32x2x2_t to be used with vzip_u32(), right?
---
yes
One issue with vector array types is that something fairly large may still be passed in registers. For example, int64x2x4_t is a 4x128-bit entity, and it is passed/returned in 4 FP/SIMD registers. This differs from the ABI rule that structs larger than 16 bytes are passed/returned in memory. Supporting these 128-bit vectors in arrays of [3] or [4] requires much more calling-convention work in mplcg. If uint32x2x2_t is all you need at the moment, I would like to support only 64-bit registers up to arrays of 2 for now; namely, the next arm_neon.h change will include
int8x8x2_t, int16x4x2_t, int32x2x2_t, uint8x8x2_t, uint16x4x2_t, uint32x2x2_t
that is, 64-bit register types up to arrays of 2.
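A small sketch of the case in question (mirroring the DPDK usage above; under AAPCS64 the two 64-bit halves of a uint32x2x2_t are passed/returned in two consecutive SIMD&FP registers, which is exactly the calling-convention work being discussed):
#include <arm_neon.h>

/* Returning uint32x2x2_t means the result travels in two consecutive D registers. */
static uint32x2x2_t zip_pair(uint32x4_t data) {
    return vzip_u32(vget_low_u32(data), vget_high_u32(data));
}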
Please let me know if this is not acceptable for now. Thanks!
That is OK; in the hlibc code the only vector array type needed is uint32x2x2_t. Thanks for your support.