From 9111bf0b9354c87f4dd897ee9cd1cb35a6431acb Mon Sep 17 00:00:00 2001 From: Alfred Huang Date: Sat, 3 Jul 2021 16:31:11 -0700 Subject: [PATCH 1/2] Enhance arm_neon.h for all datatypes Relevant intrinsics are defined for all datatypes. Revert to using u64/i64 for the single-element vector types. --- Clang2MapleVisitor.cpp | 4 +- sys/include/arm_neon.h | 668 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 627 insertions(+), 45 deletions(-) diff --git a/Clang2MapleVisitor.cpp b/Clang2MapleVisitor.cpp index 1b8cb11..0b69e20 100644 --- a/Clang2MapleVisitor.cpp +++ b/Clang2MapleVisitor.cpp @@ -3602,7 +3602,7 @@ TyIdx Clang2MapleVisitor::type2MplIdx(clang::QualType QT, bool needComplete) { switch (ElemTy->GetPrimType()) { case PTY_i64: if (NumElements == 1) { - TI = GlobalTables::GetTypeTable().GetPrimType(PTY_f64)->GetTypeIndex(); + TI = GlobalTables::GetTypeTable().GetPrimType(PTY_i64)->GetTypeIndex(); } else if (NumElements == 2) { TI = GlobalTables::GetTypeTable().GetPrimType(PTY_v2i64)->GetTypeIndex(); @@ -3650,7 +3650,7 @@ TyIdx Clang2MapleVisitor::type2MplIdx(clang::QualType QT, bool needComplete) { break; case PTY_u64: if (NumElements == 1) { - TI = GlobalTables::GetTypeTable().GetPrimType(PTY_f64)->GetTypeIndex(); + TI = GlobalTables::GetTypeTable().GetPrimType(PTY_u64)->GetTypeIndex(); } else if (NumElements == 2) { TI = GlobalTables::GetTypeTable().GetPrimType(PTY_v2u64)->GetTypeIndex(); diff --git a/sys/include/arm_neon.h b/sys/include/arm_neon.h index ddeeb48..35e0146 100644 --- a/sys/include/arm_neon.h +++ b/sys/include/arm_neon.h @@ -260,15 +260,12 @@ uint16_t __builtin_mpl_vector_sum_v8u16(uint16x8_t); uint8_t __builtin_mpl_vector_sum_v16u8(uint8x16_t); float64_t __builtin_mpl_vector_sum_v2f64(float64x2_t); float32_t __builtin_mpl_vector_sum_v4f32(float32x4_t); -int64_t __builtin_mpl_vector_sum_v1i64(int64x1_t); int32_t __builtin_mpl_vector_sum_v2i32(int32x2_t); int16_t __builtin_mpl_vector_sum_v4i16(int16x4_t); int8_t __builtin_mpl_vector_sum_v8i8(int8x8_t); -uint64_t __builtin_mpl_vector_sum_v1u64(uint64x1_t); uint32_t __builtin_mpl_vector_sum_v2u32(uint32x2_t); uint16_t __builtin_mpl_vector_sum_v4u16(uint16x4_t); uint8_t __builtin_mpl_vector_sum_v8u8(uint8x8_t); -float64_t __builtin_mpl_vector_sum_v1f64(float64x1_t); float32_t __builtin_mpl_vector_sum_v2f32(float32x2_t); // vecTy table_lookup(vecTy tbl, vecTy idx) @@ -340,56 +337,641 @@ void __builtin_mpl_vector_store_v8u8(uint8_t *, uint8x8_t); void __builtin_mpl_vector_store_v1f64(float64_t *, float64x1_t); void __builtin_mpl_vector_store_v2f32(float32_t *, float32x2_t); -// Temporary builtins that should be replaced by standard ops.
-uint16x8_t __builtin_mpl_vector_and_v8u16(uint16x8_t, uint16x8_t); -int32x4_t __builtin_mpl_vector_and_v4i32(int32x4_t, int32x4_t); -uint16x8_t __builtin_mpl_vector_eq_v8u16(uint16x8_t, uint16x8_t); -uint16x8_t __builtin_mpl_vector_shl_v8u16(uint16x8_t, int16x8_t); -uint64x2_t __builtin_mpl_vector_shli_v2u64(uint64x2_t, const int); -uint64x2_t __builtin_mpl_vector_shri_v2u64(uint64x2_t, const int); -uint32x4_t __builtin_mpl_vector_xor_v4u32(uint32x4_t, uint32x4_t); -uint64x2_t __builtin_mpl_vector_xor_v2u64(uint64x2_t, uint64x2_t); +// ************************* +// Supported Neon Intrinsics +// ************************* +// vaddv +#define vaddv_s8(a) __builtin_mpl_vector_sum_v8i8(a) +#define vaddv_s16(a) __builtin_mpl_vector_sum_v4i16(a) +#define vaddv_s32(a) __builtin_mpl_vector_sum_v2i32(a) +#define vaddv_u8(a) __builtin_mpl_vector_sum_v8u8(a) +#define vaddv_u16(a) __builtin_mpl_vector_sum_v4u16(a) +#define vaddv_u32(a) __builtin_mpl_vector_sum_v2u32(a) +#define vaddv_f32(a) __builtin_mpl_vector_sum_v2f32(a) +#define vaddvq_s8(a) __builtin_mpl_vector_sum_v16i8(a) +#define vaddvq_s16(a) __builtin_mpl_vector_sum_v8i16(a) +#define vaddvq_s32(a) __builtin_mpl_vector_sum_v4i32(a) +#define vaddvq_s64(a) __builtin_mpl_vector_sum_v2i64(a) +#define vaddvq_u8(a) __builtin_mpl_vector_sum_v16u8(a) #define vaddvq_u16(a) __builtin_mpl_vector_sum_v8u16(a) -#define vandq_u16(a, b) __builtin_mpl_vector_and_v8u16(a, b) // (a & b) -#define vandq_s32(a, b) __builtin_mpl_vector_and_v4i32(a, b) // (a & b) -#define vdupq_n_s32(value) __builtin_mpl_vector_from_scalar_v4i32(value) -#define vdupq_n_u16(value) __builtin_mpl_vector_from_scalar_v8u16(value) -#define vdupq_n_u8(value) __builtin_mpl_vector_from_scalar_v16u8(value) -#define vdup_n_u32(value) __builtin_mpl_vector_from_scalar_v2u32(value) -#define vceqq_u16(a, b) __builtin_mpl_vector_eq_v8u16(a, b) // (a == b) -#define veorq_u32(a, b) __builtin_mpl_vector_xor_v4u32(a, b) // (a ^ b) -#define veorq_u64(a, b) __builtin_mpl_vector_xor_v2u64(a, b) // (a ^ b) +#define vaddvq_u32(a) __builtin_mpl_vector_sum_v4u32(a) +#define vaddvq_u64(a) __builtin_mpl_vector_sum_v2u64(a) +#define vaddvq_f32(a) __builtin_mpl_vector_sum_v4f32(a) +#define vaddvq_f64(a) __builtin_mpl_vector_sum_v2f64(a) + +// vadd +#define vadd_s8(a, b) (a + b) +#define vadd_s16(a, b) (a + b) +#define vadd_s32(a, b) (a + b) +#define vadd_s64(a, b) (a + b) +#define vadd_u8(a, b) (a + b) +#define vadd_u16(a, b) (a + b) +#define vadd_u32(a, b) (a + b) +#define vadd_u64(a, b) (a + b) +#define vadd_f16(a, b) (a + b) +#define vadd_f32(a, b) (a + b) +#define vadd_f64(a, b) (a + b) +#define vaddq_s8(a, b) (a + b) +#define vaddq_s16(a, b) (a + b) +#define vaddq_s32(a, b) (a + b) +#define vaddq_s64(a, b) (a + b) +#define vaddq_u8(a, b) (a + b) +#define vaddq_u16(a, b) (a + b) +#define vaddq_u32(a, b) (a + b) +#define vaddq_u64(a, b) (a + b) +#define vaddq_f16(a, b) (a + b) +#define vaddq_f32(a, b) (a + b) +#define vaddq_f64(a, b) (a + b) + +// vand +#define vand_s8(a, b) (a & b) +#define vand_s16(a, b) (a & b) +#define vand_s32(a, b) (a & b) +#define vand_s64(a, b) (a & b) +#define vand_u8(a, b) (a & b) +#define vand_u16(a, b) (a & b) +#define vand_u32(a, b) (a & b) +#define vand_u64(a, b) (a & b) +#define vandq_s8(a, b) (a & b) +#define vandq_s16(a, b) (a & b) +#define vandq_s32(a, b) (a & b) +#define vandq_s64(a, b) (a & b) +#define vandq_u8(a, b) (a & b) +#define vandq_u16(a, b) (a & b) +#define vandq_u32(a, b) (a & b) +#define vandq_u64(a, b) (a & b) + +// vdup +#define vdup_n_s8(a)
__builtin_mpl_vector_from_scalar_v8i8(a) +#define vdup_n_s16(a) __builtin_mpl_vector_from_scalar_v4i16(a) +#define vdup_n_s32(a) __builtin_mpl_vector_from_scalar_v2i32(a) +#define vdup_n_s64(a) __builtin_mpl_vector_from_scalar_v1i64(a) +#define vdup_n_u8(a) __builtin_mpl_vector_from_scalar_v8u8(a) +#define vdup_n_u16(a) __builtin_mpl_vector_from_scalar_v4u16(a) +#define vdup_n_u32(a) __builtin_mpl_vector_from_scalar_v2u32(a) +#define vdup_n_u64(a) __builtin_mpl_vector_from_scalar_v1u64(a) +#define vdup_n_f16(a) __builtin_mpl_vector_from_scalar_v4f16(a) +#define vdup_n_f32(a) __builtin_mpl_vector_from_scalar_v2f32(a) +#define vdup_n_f64(a) __builtin_mpl_vector_from_scalar_v1f64(a) +#define vdupq_n_s8(a) __builtin_mpl_vector_from_scalar_v16i8(a) +#define vdupq_n_s16(a) __builtin_mpl_vector_from_scalar_v8i16(a) +#define vdupq_n_s32(a) __builtin_mpl_vector_from_scalar_v4i32(a) +#define vdupq_n_s64(a) __builtin_mpl_vector_from_scalar_v2i64(a) +#define vdupq_n_u8(a) __builtin_mpl_vector_from_scalar_v16u8(a) +#define vdupq_n_u16(a) __builtin_mpl_vector_from_scalar_v8u16(a) +#define vdupq_n_u32(a) __builtin_mpl_vector_from_scalar_v4u32(a) +#define vdupq_n_u64(a) __builtin_mpl_vector_from_scalar_v2u64(a) +#define vdupq_n_f16(a) __builtin_mpl_vector_from_scalar_v8f16(a) +#define vdupq_n_f32(a) __builtin_mpl_vector_from_scalar_v4f32(a) +#define vdupq_n_f64(a) __builtin_mpl_vector_from_scalar_v2f64(a) + +// vceq +#define vceq_s8(a, b) (a == b) +#define vceq_s16(a, b) (a == b) +#define vceq_s32(a, b) (a == b) +#define vceq_s64(a, b) (a == b) +#define vceq_u8(a, b) (a == b) +#define vceq_u16(a, b) (a == b) +#define vceq_u32(a, b) (a == b) +#define vceq_u64(a, b) (a == b) +#define vceq_f16(a, b) (a == b) +#define vceq_f32(a, b) (a == b) +#define vceq_f64(a, b) (a == b) +#define vceqq_s8(a, b) (a == b) +#define vceqq_s16(a, b) (a == b) +#define vceqq_s32(a, b) (a == b) +#define vceqq_s64(a, b) (a == b) +#define vceqq_u8(a, b) (a == b) +#define vceqq_u16(a, b) (a == b) +#define vceqq_u32(a, b) (a == b) +#define vceqq_u64(a, b) (a == b) +#define vceqq_f16(a, b) (a == b) +#define vceqq_f32(a, b) (a == b) +#define vceqq_f64(a, b) (a == b) + +// vcgt +#define vcgt_s8(a, b) (a > b) +#define vcgt_s16(a, b) (a > b) +#define vcgt_s32(a, b) (a > b) +#define vcgt_s64(a, b) (a > b) +#define vcgt_u8(a, b) (a > b) +#define vcgt_u16(a, b) (a > b) +#define vcgt_u32(a, b) (a > b) +#define vcgt_u64(a, b) (a > b) +#define vcgt_f16(a, b) (a > b) +#define vcgt_f32(a, b) (a > b) +#define vcgt_f64(a, b) (a > b) +#define vcgtq_s8(a, b) (a > b) +#define vcgtq_s16(a, b) (a > b) +#define vcgtq_s32(a, b) (a > b) +#define vcgtq_s64(a, b) (a > b) +#define vcgtq_u8(a, b) (a > b) +#define vcgtq_u16(a, b) (a > b) +#define vcgtq_u32(a, b) (a > b) +#define vcgtq_u64(a, b) (a > b) +#define vcgtq_f16(a, b) (a > b) +#define vcgtq_f32(a, b) (a > b) +#define vcgtq_f64(a, b) (a > b) + +// vcge +#define vcge_s8(a, b) (a >= b) +#define vcge_s16(a, b) (a >= b) +#define vcge_s32(a, b) (a >= b) +#define vcge_s64(a, b) (a >= b) +#define vcge_u8(a, b) (a >= b) +#define vcge_u16(a, b) (a >= b) +#define vcge_u32(a, b) (a >= b) +#define vcge_u64(a, b) (a >= b) +#define vcge_f16(a, b) (a >= b) +#define vcge_f32(a, b) (a >= b) +#define vcge_f64(a, b) (a >= b) +#define vcgeq_s8(a, b) (a >= b) +#define vcgeq_s16(a, b) (a >= b) +#define vcgeq_s32(a, b) (a >= b) +#define vcgeq_s64(a, b) (a >= b) +#define vcgeq_u8(a, b) (a >= b) +#define vcgeq_u16(a, b) (a >= b) +#define vcgeq_u32(a, b) (a >= b) +#define vcgeq_u64(a, b) (a >= b) +#define vcgeq_f16(a, b) (a >= b) 
+#define vcgeq_f32(a, b) (a >= b) +#define vcgeq_f64(a, b) (a >= b) + +// vclt +#define vclt_s8(a, b) (a < b) +#define vclt_s16(a, b) (a < b) +#define vclt_s32(a, b) (a < b) +#define vclt_s64(a, b) (a < b) +#define vclt_u8(a, b) (a < b) +#define vclt_u16(a, b) (a < b) +#define vclt_u32(a, b) (a < b) +#define vclt_u64(a, b) (a < b) +#define vclt_f16(a, b) (a < b) +#define vclt_f32(a, b) (a < b) +#define vclt_f64(a, b) (a < b) +#define vcltq_s8(a, b) (a < b) +#define vcltq_s16(a, b) (a < b) +#define vcltq_s32(a, b) (a < b) +#define vcltq_s64(a, b) (a < b) +#define vcltq_u8(a, b) (a < b) +#define vcltq_u16(a, b) (a < b) +#define vcltq_u32(a, b) (a < b) +#define vcltq_u64(a, b) (a < b) +#define vcltq_f16(a, b) (a < b) +#define vcltq_f32(a, b) (a < b) +#define vcltq_f64(a, b) (a < b) + +// vcle +#define vcle_s8(a, b) (a <= b) +#define vcle_s16(a, b) (a <= b) +#define vcle_s32(a, b) (a <= b) +#define vcle_s64(a, b) (a <= b) +#define vcle_u8(a, b) (a <= b) +#define vcle_u16(a, b) (a <= b) +#define vcle_u32(a, b) (a <= b) +#define vcle_u64(a, b) (a <= b) +#define vcle_f16(a, b) (a <= b) +#define vcle_f32(a, b) (a <= b) +#define vcle_f64(a, b) (a <= b) +#define vcleq_s8(a, b) (a <= b) +#define vcleq_s16(a, b) (a <= b) +#define vcleq_s32(a, b) (a <= b) +#define vcleq_s64(a, b) (a <= b) +#define vcleq_u8(a, b) (a <= b) +#define vcleq_u16(a, b) (a <= b) +#define vcleq_u32(a, b) (a <= b) +#define vcleq_u64(a, b) (a <= b) +#define vcleq_f16(a, b) (a <= b) +#define vcleq_f32(a, b) (a <= b) +#define vcleq_f64(a, b) (a <= b) + +// veor +#define veor_s8(a, b) (a ^ b) +#define veor_s16(a, b) (a ^ b) +#define veor_s32(a, b) (a ^ b) +#define veor_s64(a, b) (a ^ b) +#define veor_u8(a, b) (a ^ b) +#define veor_u16(a, b) (a ^ b) +#define veor_u32(a, b) (a ^ b) +#define veor_u64(a, b) (a ^ b) +#define veorq_s8(a, b) (a ^ b) +#define veorq_s16(a, b) (a ^ b) +#define veorq_s32(a, b) (a ^ b) +#define veorq_s64(a, b) (a ^ b) +#define veorq_u8(a, b) (a ^ b) +#define veorq_u16(a, b) (a ^ b) +#define veorq_u32(a, b) (a ^ b) +#define veorq_u64(a, b) (a ^ b) + +// vext +#define vext_s8(a, b, n) __builtin_mpl_vector_merge_v8i8(a, b, n) +#define vext_s16(a, b, n) __builtin_mpl_vector_merge_v4i16(a, b, n) +#define vext_s32(a, b, n) __builtin_mpl_vector_merge_v2i32(a, b, n) +#define vext_s64(a, b, n) __builtin_mpl_vector_merge_v1i64(a, b, n) +#define vext_u8(a, b, n) __builtin_mpl_vector_merge_v8u8(a, b, n) +#define vext_u16(a, b, n) __builtin_mpl_vector_merge_v4u16(a, b, n) +#define vext_u32(a, b, n) __builtin_mpl_vector_merge_v2u32(a, b, n) +#define vext_u64(a, b, n) __builtin_mpl_vector_merge_v1u64(a, b, n) +#define vext_f16(a, b, n) __builtin_mpl_vector_merge_v4f16(a, b, n) +#define vext_f32(a, b, n) __builtin_mpl_vector_merge_v2f32(a, b, n) +#define vext_f64(a, b, n) __builtin_mpl_vector_merge_v1f64(a, b, n) +#define vextq_s8(a, b, n) __builtin_mpl_vector_merge_v16i8(a, b, n) +#define vextq_s16(a, b, n) __builtin_mpl_vector_merge_v8i16(a, b, n) +#define vextq_s32(a, b, n) __builtin_mpl_vector_merge_v4i32(a, b, n) +#define vextq_s64(a, b, n) __builtin_mpl_vector_merge_v2i64(a, b, n) #define vextq_u8(a, b, n) __builtin_mpl_vector_merge_v16u8(a, b, n) #define vextq_u16(a, b, n) __builtin_mpl_vector_merge_v8u16(a, b, n) +#define vextq_u32(a, b, n) __builtin_mpl_vector_merge_v4u32(a, b, n) +#define vextq_u64(a, b, n) __builtin_mpl_vector_merge_v2u64(a, b, n) +#define vextq_f16(a, b, n) __builtin_mpl_vector_merge_v8f16(a, b, n) +#define vextq_f32(a, b, n) __builtin_mpl_vector_merge_v4f32(a, b, n) +#define vextq_f64(a, b, n) 
__builtin_mpl_vector_merge_v2f64(a, b, n) + +// vget_high +#define vget_high_s8(a) __builtin_mpl_vector_get_high_v16i8(a) +#define vget_high_s16(a) __builtin_mpl_vector_get_high_v8i16(a) +#define vget_high_s32(a) __builtin_mpl_vector_get_high_v4i32(a) +#define vget_high_s64(a) __builtin_mpl_vector_get_high_v2i64(a) +#define vget_high_u8(a) __builtin_mpl_vector_get_high_v16u8(a) +#define vget_high_u16(a) __builtin_mpl_vector_get_high_v8u16(a) +#define vget_high_u32(a) __builtin_mpl_vector_get_high_v4u32(a) #define vget_high_u64(a) __builtin_mpl_vector_get_high_v2u64(a) +#define vget_high_f16(a) __builtin_mpl_vector_get_high_v8f16(a) +#define vget_high_f32(a) __builtin_mpl_vector_get_high_v4f32(a) +#define vget_high_f64(a) __builtin_mpl_vector_get_high_v2f64(a) + +// vget_lane +#define vget_lane_s8(a, n) __builtin_mpl_vector_get_element_v8i8(a, n) +#define vget_lane_s16(a, n) __builtin_mpl_vector_get_element_v4i16(a, n) +#define vget_lane_s32(a, n) __builtin_mpl_vector_get_element_v2i32(a, n) +#define vget_lane_s64(a, n) __builtin_mpl_vector_get_element_v1i64(a, n) +#define vget_lane_u8(a, n) __builtin_mpl_vector_get_element_v8u8(a, n) +#define vget_lane_u16(a, n) __builtin_mpl_vector_get_element_v4u16(a, n) +#define vget_lane_u32(a, n) __builtin_mpl_vector_get_element_v2u32(a, n) +#define vget_lane_u64(a, n) __builtin_mpl_vector_get_element_v1u64(a, n) +#define vget_lane_f16(a, n) __builtin_mpl_vector_get_element_v4f16(a, n) +#define vget_lane_f32(a, n) __builtin_mpl_vector_get_element_v2f32(a, n) +#define vget_lane_f64(a, n) __builtin_mpl_vector_get_element_v1f64(a, n) +#define vgetq_lane_s8(a, n) __builtin_mpl_vector_get_element_v16i8(a, n) +#define vgetq_lane_s16(a, n) __builtin_mpl_vector_get_element_v8i16(a, n) +#define vgetq_lane_s32(a, n) __builtin_mpl_vector_get_element_v4i32(a, n) +#define vgetq_lane_s64(a, n) __builtin_mpl_vector_get_element_v2i64(a, n) +#define vgetq_lane_u8(a, n) __builtin_mpl_vector_get_element_v16u8(a, n) +#define vgetq_lane_u16(a, n) __builtin_mpl_vector_get_element_v8u16(a, n) +#define vgetq_lane_u32(a, n) __builtin_mpl_vector_get_element_v4u32(a, n) +#define vgetq_lane_u64(a, n) __builtin_mpl_vector_get_element_v2u64(a, n) +#define vgetq_lane_f16(a, n) __builtin_mpl_vector_get_element_v8f16(a, n) +#define vgetq_lane_f32(a, n) __builtin_mpl_vector_get_element_v4f32(a, n) +#define vgetq_lane_f64(a, n) __builtin_mpl_vector_get_element_v2f64(a, n) + +// vget_low +#define vget_low_s8(a) __builtin_mpl_vector_get_low_v16i8(a) +#define vget_low_s16(a) __builtin_mpl_vector_get_low_v8i16(a) +#define vget_low_s32(a) __builtin_mpl_vector_get_low_v4i32(a) +#define vget_low_s64(a) __builtin_mpl_vector_get_low_v2i64(a) +#define vget_low_u8(a) __builtin_mpl_vector_get_low_v16u8(a) +#define vget_low_u16(a) __builtin_mpl_vector_get_low_v8u16(a) +#define vget_low_u32(a) __builtin_mpl_vector_get_low_v4u32(a) #define vget_low_u64(a) __builtin_mpl_vector_get_low_v2u64(a) -#define vget_lane_u32(vec, lane) \ - __builtin_mpl_vector_get_element_v2u32(vec, lane) -#define vgetq_lane_u32(vec, lane) \ - __builtin_mpl_vector_get_element_v4u32(vec, lane) -#define vgetq_lane_u16(vec, lane) \ - __builtin_mpl_vector_get_element_v8u16(vec, lane) -#define vld1q_u8(ptr) __builtin_mpl_vector_load_v16u8(ptr) -#define vld1q_u16(ptr) __builtin_mpl_vector_load_v8u16(ptr) -#define vld1q_s32(ptr) __builtin_mpl_vector_load_v4i32(ptr) -#define vld1q_u32(ptr) __builtin_mpl_vector_load_v4u32(ptr) -#define vmlal_u32(accum, s1, s2) __builtin_mpl_vector_madd_v2u32(accum, s1, s2) +#define vget_low_f16(a)
__builtin_mpl_vector_get_low_v8f16(a) +#define vget_low_f32(a) __builtin_mpl_vector_get_low_v4f32(a) +#define vget_low_f64(a) __builtin_mpl_vector_get_low_v2f64(a) + +// vld1 +#define vld1_s8(a) __builtin_mpl_vector_load_v8i8(a) +#define vld1_s16(a) __builtin_mpl_vector_load_v4i16(a) +#define vld1_s32(a) __builtin_mpl_vector_load_v2i32(a) +#define vld1_s64(a) __builtin_mpl_vector_load_v1i64(a) +#define vld1_u8(a) __builtin_mpl_vector_load_v8u8(a) +#define vld1_u16(a) __builtin_mpl_vector_load_v4u16(a) +#define vld1_u32(a) __builtin_mpl_vector_load_v2u32(a) +#define vld1_u64(a) __builtin_mpl_vector_load_v1u64(a) +#define vld1_f16(a) __builtin_mpl_vector_load_v4f16(a) +#define vld1_f32(a) __builtin_mpl_vector_load_v2f32(a) +#define vld1_f64(a) __builtin_mpl_vector_load_v1f64(a) +#define vld1q_s8(a) __builtin_mpl_vector_load_v16i8(a) +#define vld1q_s16(a) __builtin_mpl_vector_load_v8i16(a) +#define vld1q_s32(a) __builtin_mpl_vector_load_v4i32(a) +#define vld1q_s64(a) __builtin_mpl_vector_load_v2i64(a) +#define vld1q_u8(a) __builtin_mpl_vector_load_v16u8(a) +#define vld1q_u16(a) __builtin_mpl_vector_load_v8u16(a) +#define vld1q_u32(a) __builtin_mpl_vector_load_v4u32(a) +#define vld1q_u64(a) __builtin_mpl_vector_load_v2u64(a) +#define vld1q_f16(a) __builtin_mpl_vector_load_v8f16(a) +#define vld1q_f32(a) __builtin_mpl_vector_load_v4f32(a) +#define vld1q_f64(a) __builtin_mpl_vector_load_v2f64(a) + +// vmlal +#define vmlal_s8(acc, a, b) __builtin_mpl_vector_madd_v8i8(acc, a, b) +#define vmlal_s16(acc, a, b) __builtin_mpl_vector_madd_v4i16(acc, a, b) +#define vmlal_s32(acc, a, b) __builtin_mpl_vector_madd_v2i32(acc, a, b) +#define vmlal_u8(acc, a, b) __builtin_mpl_vector_madd_v8u8(acc, a, b) +#define vmlal_u16(acc, a, b) __builtin_mpl_vector_madd_v4u16(acc, a, b) +#define vmlal_u32(acc, a, b) __builtin_mpl_vector_madd_v2u32(acc, a, b) + +// vmull +#define vmull_s8(a, b) __builtin_mpl_vector_mul_v8i8(a, b) +#define vmull_s16(a, b) __builtin_mpl_vector_mul_v4i16(a, b) +#define vmull_s32(a, b) __builtin_mpl_vector_mul_v2i32(a, b) +#define vmull_u8(a, b) __builtin_mpl_vector_mul_v8u8(a, b) +#define vmull_u16(a, b) __builtin_mpl_vector_mul_v4u16(a, b) #define vmull_u32(a, b) __builtin_mpl_vector_mul_v2u32(a, b) + +// vor +#define vor_s8(a, b) (a | b) +#define vor_s16(a, b) (a | b) +#define vor_s32(a, b) (a | b) +#define vor_s64(a, b) (a | b) +#define vor_u8(a, b) (a | b) +#define vor_u16(a, b) (a | b) +#define vor_u32(a, b) (a | b) +#define vor_u64(a, b) (a | b) +#define vorq_s8(a, b) (a | b) +#define vorq_s16(a, b) (a | b) +#define vorq_s32(a, b) (a | b) +#define vorq_s64(a, b) (a | b) +#define vorq_u8(a, b) (a | b) +#define vorq_u16(a, b) (a | b) +#define vorq_u32(a, b) (a | b) +#define vorq_u64(a, b) (a | b) + +// vpaddl +#define vpaddl_s8(a) __builtin_mpl_vector_pairwise_add_v8i8(a) +#define vpaddl_s16(a) __builtin_mpl_vector_pairwise_add_v4i16(a) +#define vpaddl_s32(a) __builtin_mpl_vector_pairwise_add_v2i32(a) +#define vpaddl_u8(a) __builtin_mpl_vector_pairwise_add_v8u8(a) +#define vpaddl_u16(a) __builtin_mpl_vector_pairwise_add_v4u16(a) +#define vpaddl_u32(a) __builtin_mpl_vector_pairwise_add_v2u32(a) +#define vpaddlq_s8(a) __builtin_mpl_vector_pairwise_add_v16i8(a) +#define vpaddlq_s16(a) __builtin_mpl_vector_pairwise_add_v8i16(a) +#define vpaddlq_s32(a) __builtin_mpl_vector_pairwise_add_v4i32(a) +#define vpaddlq_u8(a) __builtin_mpl_vector_pairwise_add_v16u8(a) #define vpaddlq_u16(a) __builtin_mpl_vector_pairwise_add_v8u16(a) #define vpaddlq_u32(a) __builtin_mpl_vector_pairwise_add_v4u32(a)
-#define vqtbl1q_u8(t, idx) __builtin_mpl_vector_table_lookup_v16u8(t, idx) + +// vqtbl1 +#define vqtbl1_s8(a, b) __builtin_mpl_vector_table_lookup_v8i8(a, b) +#define vqtbl1_u8(a, b) __builtin_mpl_vector_table_lookup_v8u8(a, b) +#define vqtbl1q_s8(a, b) __builtin_mpl_vector_table_lookup_v16i8(a, b) +#define vqtbl1q_u8(a, b) __builtin_mpl_vector_table_lookup_v16u8(a, b) + +// vreinterpret 8 +#define vreinterpret_s16_s8(a) ((int16x4_t)a) +#define vreinterpret_s32_s8(a) ((int32x2_t)a) +#define vreinterpret_s64_s8(a) ((int64x1_t)a) +#define vreinterpret_u16_u8(a) ((uint16x4_t)a) +#define vreinterpret_u32_u8(a) ((uint32x2_t)a) +#define vreinterpret_u64_u8(a) ((uint64x1_t)a) +#define vreinterpret_f16_s8(a) ((float16x4_t)a) +#define vreinterpret_f32_s8(a) ((float32x2_t)a) +#define vreinterpret_f64_s8(a) ((float64x1_t)a) +#define vreinterpret_f16_u8(a) ((float16x4_t)a) +#define vreinterpret_f32_u8(a) ((float32x2_t)a) +#define vreinterpret_f64_u8(a) ((float64x1_t)a) +#define vreinterpretq_s16_s8(a) ((int16x8_t)a) +#define vreinterpretq_s32_s8(a) ((int32x4_t)a) +#define vreinterpretq_s64_s8(a) ((int64x2_t)a) +#define vreinterpretq_u16_u8(a) ((uint16x8_t)a) +#define vreinterpretq_u32_u8(a) ((uint32x4_t)a) #define vreinterpretq_u64_u8(a) ((uint64x2_t)a) +#define vreinterpretq_f16_s8(a) ((float16x8_t)a) +#define vreinterpretq_f32_s8(a) ((float32x4_t)a) +#define vreinterpretq_f64_s8(a) ((float64x2_t)a) +#define vreinterpretq_f16_u8(a) ((float16x8_t)a) +#define vreinterpretq_f32_u8(a) ((float32x4_t)a) +#define vreinterpretq_f64_u8(a) ((float64x2_t)a) + +// vreinterpret 16 +#define vreinterpret_s8_s16(a) ((int8x8_t)a) +#define vreinterpret_s32_s16(a) ((int32x2_t)a) +#define vreinterpret_s64_s16(a) ((int64x1_t)a) +#define vreinterpret_u8_u16(a) ((uint8x8_t)a) +#define vreinterpret_u32_u16(a) ((uint32x2_t)a) +#define vreinterpret_u64_u16(a) ((uint64x1_t)a) +#define vreinterpret_f16_s16(a) ((float16x4_t)a) +#define vreinterpret_f32_s16(a) ((float32x2_t)a) +#define vreinterpret_f64_s16(a) ((float64x1_t)a) +#define vreinterpret_f16_u16(a) ((float16x4_t)a) +#define vreinterpret_f32_u16(a) ((float32x2_t)a) +#define vreinterpret_f64_u16(a) ((float64x1_t)a) +#define vreinterpretq_s8_s16(a) ((int8x16_t)a) +#define vreinterpretq_s32_s16(a) ((int32x4_t)a) +#define vreinterpretq_s64_s16(a) ((int64x2_t)a) +#define vreinterpretq_u8_u16(a) ((uint8x16_t)a) +#define vreinterpretq_u32_u16(a) ((uint32x4_t)a) +#define vreinterpretq_u64_u16(a) ((uint64x2_t)a) +#define vreinterpretq_f16_s16(a) ((float16x8_t)a) +#define vreinterpretq_f32_s16(a) ((float32x4_t)a) +#define vreinterpretq_f64_s16(a) ((float64x2_t)a) +#define vreinterpretq_f16_u16(a) ((float16x8_t)a) +#define vreinterpretq_f32_u16(a) ((float32x4_t)a) +#define vreinterpretq_f64_u16(a) ((float64x2_t)a) + +// vreinterpret 32 +#define vreinterpret_s8_s32(a) ((int8x8_t)a) +#define vreinterpret_s16_s32(a) ((int16x4_t)a) +#define vreinterpret_s64_s32(a) ((int64x1_t)a) +#define vreinterpret_u8_u32(a) ((uint8x8_t)a) +#define vreinterpret_u16_u32(a) ((uint16x4_t)a) +#define vreinterpret_u64_u32(a) ((uint64x1_t)a) +#define vreinterpret_f16_s32(a) ((float16x4_t)a) +#define vreinterpret_f32_s32(a) ((float32x2_t)a) +#define vreinterpret_f64_s32(a) ((float64x1_t)a) +#define vreinterpret_f16_u32(a) ((float16x4_t)a) +#define vreinterpret_f32_u32(a) ((float32x2_t)a) +#define vreinterpret_f64_u32(a) ((float64x1_t)a) +#define vreinterpretq_s8_s32(a) ((int8x16_t)a) +#define vreinterpretq_s16_s32(a) ((int16x8_t)a) +#define vreinterpretq_s64_s32(a) ((int64x2_t)a) +#define
vreinterpretq_u8_u32(a) ((uint8x16_t)a) +#define vreinterpretq_u16_u32(a) ((uint16x8_t)a) +#define vreinterpretq_u64_u32(a) ((uint64x2_t)a) +#define vreinterpretq_f16_s32(a) ((float16x8_t)a) +#define vreinterpretq_f32_s32(a) ((float32x4_t)a) +#define vreinterpretq_f64_s32(a) ((float64x2_t)a) +#define vreinterpretq_f16_u32(a) ((float16x8_t)a) +#define vreinterpretq_f32_u32(a) ((float32x4_t)a) +#define vreinterpretq_f64_u32(a) ((float64x2_t)a) + +// vreinterpret 64 +#define vreinterpret_s8_s64(a) ((int8x8_t)a) +#define vreinterpret_s16_s64(a) ((int16x4_t)a) +#define vreinterpret_s32_s64(a) ((int32x2_t)a) +#define vreinterpret_u8_u64(a) ((uint8x8_t)a) +#define vreinterpret_u16_u64(a) ((uint16x4_t)a) +#define vreinterpret_u32_u64(a) ((uint32x2_t)a) +#define vreinterpret_f16_s64(a) ((float16x4_t)a) +#define vreinterpret_f32_s64(a) ((float32x2_t)a) +#define vreinterpret_f64_s64(a) ((float64x1_t)a) +#define vreinterpret_f16_u64(a) ((float16x4_t)a) +#define vreinterpret_f32_u64(a) ((float32x2_t)a) +#define vreinterpret_f64_u64(a) ((float64x1_t)a) +#define vreinterpretq_s8_s64(a) ((int8x16_t)a) +#define vreinterpretq_s16_s64(a) ((int16x8_t)a) +#define vreinterpretq_s32_s64(a) ((int32x4_t)a) +#define vreinterpretq_u8_u64(a) ((uint8x16_t)a) +#define vreinterpretq_u16_u64(a) ((uint16x8_t)a) #define vreinterpretq_u32_u64(a) ((uint32x4_t)a) -#define vrev32q_u8(vec) __builtin_mpl_vector_reverse_v16u8(vec) -#define vsetq_lane_u32(value, vec, lane) \ - __builtin_mpl_vector_set_element_v4u32(value, vec, lane) -#define vsetq_lane_u16(value, vec, lane) \ - __builtin_mpl_vector_set_element_v8u16(value, vec, lane) -#define vshlq_u16(a, b) __builtin_mpl_vector_shl_v8u16(a, b) // (a << b) -#define vshlq_n_u64(a, n) __builtin_mpl_vector_shli_v2u64(a, n) // (a << n) -#define vshrq_n_u64(a, n) __builtin_mpl_vector_shri_v2u64(a, n) // (a >> n) -#define vst1q_s32(ptr, val) __builtin_mpl_vector_store_v4i32(ptr, val) -#define vst1q_u8(ptr, val) __builtin_mpl_vector_store_v16u8(ptr, val) +#define vreinterpretq_f16_s64(a) ((float16x8_t)a) +#define vreinterpretq_f32_s64(a) ((float32x4_t)a) +#define vreinterpretq_f64_s64(a) ((float64x2_t)a) +#define vreinterpretq_f16_u64(a) ((float16x8_t)a) +#define vreinterpretq_f32_u64(a) ((float32x4_t)a) +#define vreinterpretq_f64_u64(a) ((float64x2_t)a) + +// vrev32 +#define vrev32_s8(a) __builtin_mpl_vector_reverse_v8i8(a) +#define vrev32_s16(a) __builtin_mpl_vector_reverse_v4i16(a) +#define vrev32_u8(a) __builtin_mpl_vector_reverse_v8u8(a) +#define vrev32_u16(a) __builtin_mpl_vector_reverse_v4u16(a) +#define vrev32q_s8(a) __builtin_mpl_vector_reverse_v16i8(a) +#define vrev32q_s16(a) __builtin_mpl_vector_reverse_v8i16(a) +#define vrev32q_u8(a) __builtin_mpl_vector_reverse_v16u8(a) +#define vrev32q_u16(a) __builtin_mpl_vector_reverse_v8u16(a) + +// vset_lane +#define vset_lane_s8(v, a, n) __builtin_mpl_vector_set_element_v8i8(v, a, n) +#define vset_lane_s16(v, a, n) __builtin_mpl_vector_set_element_v4i16(v, a, n) +#define vset_lane_s32(v, a, n) __builtin_mpl_vector_set_element_v2i32(v, a, n) +#define vset_lane_s64(v, a, n) __builtin_mpl_vector_set_element_v1i64(v, a, n) +#define vset_lane_u8(v, a, n) __builtin_mpl_vector_set_element_v8u8(v, a, n) +#define vset_lane_u16(v, a, n) __builtin_mpl_vector_set_element_v4u16(v, a, n) +#define vset_lane_u32(v, a, n) __builtin_mpl_vector_set_element_v2u32(v, a, n) +#define vset_lane_u64(v, a, n) __builtin_mpl_vector_set_element_v1u64(v, a, n) +#define vset_lane_f16(v, a, n) __builtin_mpl_vector_set_element_v4f16(v, a, n) +#define vset_lane_f32(v,
a, n) __builtin_mpl_vector_set_element_v2f32(v, a, n) +#define vset_lane_f64(v, a, n) __builtin_mpl_vector_set_element_v1f64(v, a, n) +#define vsetq_lane_s8(v, a, n) __builtin_mpl_vector_set_element_v16i8(v, a, n) +#define vsetq_lane_s16(v, a, n) __builtin_mpl_vector_set_element_v8i16(v, a, n) +#define vsetq_lane_s32(v, a, n) __builtin_mpl_vector_set_element_v4i32(v, a, n) +#define vsetq_lane_s64(v, a, n) __builtin_mpl_vector_set_element_v2i64(v, a, n) +#define vsetq_lane_u8(v, a, n) __builtin_mpl_vector_set_element_v16u8(v, a, n) +#define vsetq_lane_u16(v, a, n) __builtin_mpl_vector_set_element_v8u16(v, a, n) +#define vsetq_lane_u32(v, a, n) __builtin_mpl_vector_set_element_v4u32(v, a, n) +#define vsetq_lane_u64(v, a, n) __builtin_mpl_vector_set_element_v2u64(v, a, n) +#define vsetq_lane_f16(v, a, n) __builtin_mpl_vector_set_element_v8f16(v, a, n) +#define vsetq_lane_f32(v, a, n) __builtin_mpl_vector_set_element_v4f32(v, a, n) +#define vsetq_lane_f64(v, a, n) __builtin_mpl_vector_set_element_v2f64(v, a, n) + +// vshl +#define vshl_s8(a, b) (a << b) +#define vshl_s16(a, b) (a << b) +#define vshl_s32(a, b) (a << b) +#define vshl_s64(a, b) (a << b) +#define vshl_u8(a, b) (a << b) +#define vshl_u16(a, b) (a << b) +#define vshl_u32(a, b) (a << b) +#define vshl_u64(a, b) (a << b) +#define vshlq_s8(a, b) (a << b) +#define vshlq_s16(a, b) (a << b) +#define vshlq_s32(a, b) (a << b) +#define vshlq_s64(a, b) (a << b) +#define vshlq_u8(a, b) (a << b) +#define vshlq_u16(a, b) (a << b) +#define vshlq_u32(a, b) (a << b) +#define vshlq_u64(a, b) (a << b) + +// vshl_n +#define vshl_n_s8(a, n) (a << n) +#define vshl_n_s16(a, n) (a << n) +#define vshl_n_s32(a, n) (a << n) +#define vshl_n_s64(a, n) (a << n) +#define vshl_n_u8(a, n) (a << n) +#define vshl_n_u16(a, n) (a << n) +#define vshl_n_u32(a, n) (a << n) +#define vshl_n_u64(a, n) (a << n) +#define vshlq_n_s8(a, n) (a << n) +#define vshlq_n_s16(a, n) (a << n) +#define vshlq_n_s32(a, n) (a << n) +#define vshlq_n_s64(a, n) (a << n) +#define vshlq_n_u8(a, n) (a << n) +#define vshlq_n_u16(a, n) (a << n) +#define vshlq_n_u32(a, n) (a << n) +#define vshlq_n_u64(a, n) (a << n) + +// vshr +#define vshr_s8(a, b) (a >> b) +#define vshr_s16(a, b) (a >> b) +#define vshr_s32(a, b) (a >> b) +#define vshr_s64(a, b) (a >> b) +#define vshr_u8(a, b) (a >> b) +#define vshr_u16(a, b) (a >> b) +#define vshr_u32(a, b) (a >> b) +#define vshr_u64(a, b) (a >> b) +#define vshrq_s8(a, b) (a >> b) +#define vshrq_s16(a, b) (a >> b) +#define vshrq_s32(a, b) (a >> b) +#define vshrq_s64(a, b) (a >> b) +#define vshrq_u8(a, b) (a >> b) +#define vshrq_u16(a, b) (a >> b) +#define vshrq_u32(a, b) (a >> b) +#define vshrq_u64(a, b) (a >> b) + +// vshr_n +#define vshr_n_s8(a, n) (a >> n) +#define vshr_n_s16(a, n) (a >> n) +#define vshr_n_s32(a, n) (a >> n) +#define vshr_n_s64(a, n) (a >> n) +#define vshr_n_u8(a, n) (a >> n) +#define vshr_n_u16(a, n) (a >> n) +#define vshr_n_u32(a, n) (a >> n) +#define vshr_n_u64(a, n) (a >> n) +#define vshrq_n_s8(a, n) (a >> n) +#define vshrq_n_s16(a, n) (a >> n) +#define vshrq_n_s32(a, n) (a >> n) +#define vshrq_n_s64(a, n) (a >> n) +#define vshrq_n_u8(a, n) (a >> n) +#define vshrq_n_u16(a, n) (a >> n) +#define vshrq_n_u32(a, n) (a >> n) +#define vshrq_n_u64(a, n) (a >> n) + +// vst1 +#define vst1_s8(p, v) __builtin_mpl_vector_store_v8i8(p, v) +#define vst1_s16(p, v) __builtin_mpl_vector_store_v4i16(p, v) +#define vst1_s32(p, v) __builtin_mpl_vector_store_v2i32(p, v) +#define vst1_s64(p, v) __builtin_mpl_vector_store_v1i64(p, v) +#define vst1_u8(p, v) 
__builtin_mpl_vector_store_v8u8(p, v) +#define vst1_u16(p, v) __builtin_mpl_vector_store_v4u16(p, v) +#define vst1_u32(p, v) __builtin_mpl_vector_store_v2u32(p, v) +#define vst1_u64(p, v) __builtin_mpl_vector_store_v1u64(p, v) +#define vst1_f16(p, v) __builtin_mpl_vector_store_v4f16(p, v) +#define vst1_f32(p, v) __builtin_mpl_vector_store_v2f32(p, v) +#define vst1_f64(p, v) __builtin_mpl_vector_store_v1f64(p, v) +#define vst1q_s8(p, v) __builtin_mpl_vector_store_v16i8(p, v) +#define vst1q_s16(p, v) __builtin_mpl_vector_store_v8i16(p, v) +#define vst1q_s32(p, v) __builtin_mpl_vector_store_v4i32(p, v) +#define vst1q_s64(p, v) __builtin_mpl_vector_store_v2i64(p, v) +#define vst1q_u8(p, v) __builtin_mpl_vector_store_v16u8(p, v) +#define vst1q_u16(p, v) __builtin_mpl_vector_store_v8u16(p, v) +#define vst1q_u32(p, v) __builtin_mpl_vector_store_v4u32(p, v) +#define vst1q_u64(p, v) __builtin_mpl_vector_store_v2u64(p, v) +#define vst1q_f16(p, v) __builtin_mpl_vector_store_v8f16(p, v) +#define vst1q_f32(p, v) __builtin_mpl_vector_store_v4f32(p, v) +#define vst1q_f64(p, v) __builtin_mpl_vector_store_v2f64(p, v) + +// vsub +#define vsub_s8(a, b) (a - b) +#define vsub_s16(a, b) (a - b) +#define vsub_s32(a, b) (a - b) +#define vsub_s64(a, b) (a - b) +#define vsub_u8(a, b) (a - b) +#define vsub_u16(a, b) (a - b) +#define vsub_u32(a, b) (a - b) +#define vsub_u64(a, b) (a - b) +#define vsub_f16(a, b) (a - b) +#define vsub_f32(a, b) (a - b) +#define vsub_f64(a, b) (a - b) +#define vsubq_s8(a, b) (a - b) +#define vsubq_s16(a, b) (a - b) +#define vsubq_s32(a, b) (a - b) +#define vsubq_s64(a, b) (a - b) +#define vsubq_u8(a, b) (a - b) +#define vsubq_u16(a, b) (a - b) +#define vsubq_u32(a, b) (a - b) +#define vsubq_u64(a, b) (a - b) +#define vsubq_f16(a, b) (a - b) +#define vsubq_f32(a, b) (a - b) +#define vsubq_f64(a, b) (a - b) #endif /* __ARM_NEON_H */ -- Gitee From a91a06d6bde1a1af22a24e0828440a6063d5fbab Mon Sep 17 00:00:00 2001 From: Brice Dobry Date: Sun, 4 Jul 2021 09:39:00 -0400 Subject: [PATCH 2/2] Update tests to use i64/u64 for 1-elem vectors Also removes testing of intrinsics which have been removed. 
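For example (taken from the updated CHECK lines in the diff below), the expected Maple output for a 1-element signed 64-bit vector changes from

  // CHECK-NEXT: var %vec_int64x1 f64
  // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop f64 vector_from_scalar_v1i64 (dread i64 %scalar_int64))

to

  // CHECK-NEXT: var %vec_int64x1 i64
  // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop i64 vector_from_scalar_v1i64 (dread i64 %scalar_int64))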
--- test/vector.c | 84 +++++++++++++++------------------------------------ 1 file changed, 24 insertions(+), 60 deletions(-) diff --git a/test/vector.c b/test/vector.c index 4e668de..8d79841 100644 --- a/test/vector.c +++ b/test/vector.c @@ -50,7 +50,7 @@ void intrinsics() { // CHECK-NEXT: var %vec_int32x4 v4i32 int32x4_t vec_int32x4; // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: var %vec_int64x1 f64 + // CHECK-NEXT: var %vec_int64x1 i64 int64x1_t vec_int64x1; // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: var %vec_int64x2 v2i64 @@ -74,7 +74,7 @@ void intrinsics() { // CHECK-NEXT: var %vec_uint32x4 v4u32 uint32x4_t vec_uint32x4; // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: var %vec_uint64x1 f64 + // CHECK-NEXT: var %vec_uint64x1 u64 uint64x1_t vec_uint64x1; // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: var %vec_uint64x2 v2u64 @@ -124,7 +124,7 @@ void intrinsics() { // CHECK-NEXT: dassign %vec_float32x4 0 (intrinsicop v4f32 vector_from_scalar_v4f32 (dread f32 %scalar_float32)) vec_float32x4 = __builtin_mpl_vector_from_scalar_v4f32(scalar_float32); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop f64 vector_from_scalar_v1i64 (dread i64 %scalar_int64)) + // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop i64 vector_from_scalar_v1i64 (dread i64 %scalar_int64)) vec_int64x1 = __builtin_mpl_vector_from_scalar_v1i64(scalar_int64); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_int32x2 0 (intrinsicop v2i32 vector_from_scalar_v2i32 (dread i32 %scalar_int32)) @@ -136,7 +136,7 @@ void intrinsics() { // CHECK-NEXT: dassign %vec_int8x8 0 (intrinsicop v8i8 vector_from_scalar_v8i8 (dread i32 %scalar_int8)) vec_int8x8 = __builtin_mpl_vector_from_scalar_v8i8(scalar_int8); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_uint64x1 0 (intrinsicop f64 vector_from_scalar_v1u64 (dread u64 %scalar_uint64)) + // CHECK-NEXT: dassign %vec_uint64x1 0 (intrinsicop u64 vector_from_scalar_v1u64 (dread u64 %scalar_uint64)) vec_uint64x1 = __builtin_mpl_vector_from_scalar_v1u64(scalar_uint64); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_uint32x2 0 (intrinsicop v2u32 vector_from_scalar_v2u32 (dread u32 %scalar_uint32)) @@ -223,7 +223,7 @@ void intrinsics() { // CHECK-NEXT: dassign %vec_float32x4 0 (intrinsicop v4f32 vector_merge_v4f32 (dread v4f32 %vec_float32x4, dread v4f32 %vec_float32x4, dread i32 %scalar_int32)) vec_float32x4 = __builtin_mpl_vector_merge_v4f32(vec_float32x4, vec_float32x4, scalar_int32); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop f64 vector_merge_v1i64 (dread f64 %vec_int64x1, dread f64 %vec_int64x1, dread i32 %scalar_int32)) + // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop i64 vector_merge_v1i64 (dread i64 %vec_int64x1, dread i64 %vec_int64x1, dread i32 %scalar_int32)) vec_int64x1 = __builtin_mpl_vector_merge_v1i64(vec_int64x1, vec_int64x1, scalar_int32); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_int32x2 0 (intrinsicop v2i32 vector_merge_v2i32 (dread v2i32 %vec_int32x2, dread v2i32 %vec_int32x2, dread i32 %scalar_int32)) @@ -235,7 +235,7 @@ void intrinsics() { // CHECK-NEXT: dassign %vec_int8x8 0 (intrinsicop v8i8 vector_merge_v8i8 (dread v8i8 %vec_int8x8, dread v8i8 %vec_int8x8, dread i32 %scalar_int32)) vec_int8x8 = __builtin_mpl_vector_merge_v8i8(vec_int8x8, vec_int8x8, 
scalar_int32); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_uint64x1 0 (intrinsicop f64 vector_merge_v1u64 (dread f64 %vec_uint64x1, dread f64 %vec_uint64x1, dread i32 %scalar_int32)) + // CHECK-NEXT: dassign %vec_uint64x1 0 (intrinsicop u64 vector_merge_v1u64 (dread u64 %vec_uint64x1, dread u64 %vec_uint64x1, dread i32 %scalar_int32)) vec_uint64x1 = __builtin_mpl_vector_merge_v1u64(vec_uint64x1, vec_uint64x1, scalar_int32); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_uint32x2 0 (intrinsicop v2u32 vector_merge_v2u32 (dread v2u32 %vec_uint32x2, dread v2u32 %vec_uint32x2, dread i32 %scalar_int32)) @@ -254,7 +254,7 @@ void intrinsics() { vec_float32x2 = __builtin_mpl_vector_merge_v2f32(vec_float32x2, vec_float32x2, scalar_int32); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop f64 vector_get_low_v2i64 (dread v2i64 %vec_int64x2)) + // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop i64 vector_get_low_v2i64 (dread v2i64 %vec_int64x2)) vec_int64x1 = __builtin_mpl_vector_get_low_v2i64(vec_int64x2); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_int32x2 0 (intrinsicop v2i32 vector_get_low_v4i32 (dread v4i32 %vec_int32x4)) @@ -266,7 +266,7 @@ void intrinsics() { // CHECK-NEXT: dassign %vec_int8x8 0 (intrinsicop v8i8 vector_get_low_v16i8 (dread v16i8 %vec_int8x16)) vec_int8x8 = __builtin_mpl_vector_get_low_v16i8(vec_int8x16); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_uint64x1 0 (intrinsicop f64 vector_get_low_v2u64 (dread v2u64 %vec_uint64x2)) + // CHECK-NEXT: dassign %vec_uint64x1 0 (intrinsicop u64 vector_get_low_v2u64 (dread v2u64 %vec_uint64x2)) vec_uint64x1 = __builtin_mpl_vector_get_low_v2u64(vec_uint64x2); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_uint32x2 0 (intrinsicop v2u32 vector_get_low_v4u32 (dread v4u32 %vec_uint32x4)) @@ -285,7 +285,7 @@ void intrinsics() { vec_float32x2 = __builtin_mpl_vector_get_low_v4f32(vec_float32x4); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop f64 vector_get_high_v2i64 (dread v2i64 %vec_int64x2)) + // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop i64 vector_get_high_v2i64 (dread v2i64 %vec_int64x2)) vec_int64x1 = __builtin_mpl_vector_get_high_v2i64(vec_int64x2); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_int32x2 0 (intrinsicop v2i32 vector_get_high_v4i32 (dread v4i32 %vec_int32x4)) @@ -297,7 +297,7 @@ void intrinsics() { // CHECK-NEXT: dassign %vec_int8x8 0 (intrinsicop v8i8 vector_get_high_v16i8 (dread v16i8 %vec_int8x16)) vec_int8x8 = __builtin_mpl_vector_get_high_v16i8(vec_int8x16); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_uint64x1 0 (intrinsicop f64 vector_get_high_v2u64 (dread v2u64 %vec_uint64x2)) + // CHECK-NEXT: dassign %vec_uint64x1 0 (intrinsicop u64 vector_get_high_v2u64 (dread v2u64 %vec_uint64x2)) vec_uint64x1 = __builtin_mpl_vector_get_high_v2u64(vec_uint64x2); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_uint32x2 0 (intrinsicop v2u32 vector_get_high_v4u32 (dread v4u32 %vec_uint32x4)) @@ -346,7 +346,7 @@ void intrinsics() { // CHECK-NEXT: dassign %scalar_float32 0 (intrinsicop f32 vector_get_element_v4f32 (dread v4f32 %vec_float32x4, dread i32 %scalar_int32)) scalar_float32 = __builtin_mpl_vector_get_element_v4f32(vec_float32x4, scalar_int32); // CHECK: LOC 
[[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %scalar_int64 0 (intrinsicop i64 vector_get_element_v1i64 (dread f64 %vec_int64x1, dread i32 %scalar_int32)) + // CHECK-NEXT: dassign %scalar_int64 0 (intrinsicop i64 vector_get_element_v1i64 (dread i64 %vec_int64x1, dread i32 %scalar_int32)) scalar_int64 = __builtin_mpl_vector_get_element_v1i64(vec_int64x1, scalar_int32); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %scalar_int32 0 (intrinsicop i32 vector_get_element_v2i32 (dread v2i32 %vec_int32x2, dread i32 %scalar_int32)) @@ -358,7 +358,7 @@ void intrinsics() { // CHECK-NEXT: dassign %scalar_int8 0 (intrinsicop i8 vector_get_element_v8i8 (dread v8i8 %vec_int8x8, dread i32 %scalar_int32)) scalar_int8 = __builtin_mpl_vector_get_element_v8i8(vec_int8x8, scalar_int32); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %scalar_uint64 0 (intrinsicop u64 vector_get_element_v1u64 (dread f64 %vec_uint64x1, dread i32 %scalar_int32)) + // CHECK-NEXT: dassign %scalar_uint64 0 (intrinsicop u64 vector_get_element_v1u64 (dread u64 %vec_uint64x1, dread i32 %scalar_int32)) scalar_uint64 = __builtin_mpl_vector_get_element_v1u64(vec_uint64x1, scalar_int32); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %scalar_uint32 0 (intrinsicop u32 vector_get_element_v2u32 (dread v2u32 %vec_uint32x2, dread i32 %scalar_int32)) @@ -407,7 +407,7 @@ void intrinsics() { // CHECK-NEXT: dassign %vec_float32x4 0 (intrinsicop v4f32 vector_set_element_v4f32 (dread f32 %scalar_float32, dread v4f32 %vec_float32x4, dread i32 %scalar_int32)) vec_float32x4 = __builtin_mpl_vector_set_element_v4f32(scalar_float32, vec_float32x4, scalar_int32); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop f64 vector_set_element_v1i64 (dread i64 %scalar_int64, dread f64 %vec_int64x1, dread i32 %scalar_int32)) + // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop i64 vector_set_element_v1i64 (dread i64 %scalar_int64, dread i64 %vec_int64x1, dread i32 %scalar_int32)) vec_int64x1 = __builtin_mpl_vector_set_element_v1i64(scalar_int64, vec_int64x1, scalar_int32); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_int32x2 0 (intrinsicop v2i32 vector_set_element_v2i32 (dread i32 %scalar_int32, dread v2i32 %vec_int32x2, dread i32 %scalar_int32)) @@ -419,7 +419,7 @@ void intrinsics() { // CHECK-NEXT: dassign %vec_int8x8 0 (intrinsicop v8i8 vector_set_element_v8i8 (dread i32 %scalar_int8, dread v8i8 %vec_int8x8, dread i32 %scalar_int32)) vec_int8x8 = __builtin_mpl_vector_set_element_v8i8(scalar_int8, vec_int8x8, scalar_int32); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_uint64x1 0 (intrinsicop f64 vector_set_element_v1u64 (dread u64 %scalar_uint64, dread f64 %vec_uint64x1, dread i32 %scalar_int32)) + // CHECK-NEXT: dassign %vec_uint64x1 0 (intrinsicop u64 vector_set_element_v1u64 (dread u64 %scalar_uint64, dread u64 %vec_uint64x1, dread i32 %scalar_int32)) vec_uint64x1 = __builtin_mpl_vector_set_element_v1u64(scalar_uint64, vec_uint64x1, scalar_int32); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_uint32x2 0 (intrinsicop v2u32 vector_set_element_v2u32 (dread u32 %scalar_uint32, dread v2u32 %vec_uint32x2, dread i32 %scalar_int32)) @@ -456,7 +456,7 @@ void intrinsics() { // CHECK-NEXT: dassign %vec_uint16x8 0 (intrinsicop v8u16 vector_pairwise_add_v16u8 (dread v16u8 %vec_uint8x16)) vec_uint16x8 = 
__builtin_mpl_vector_pairwise_add_v16u8(vec_uint8x16); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop f64 vector_pairwise_add_v2i32 (dread v2i32 %vec_int32x2)) + // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop i64 vector_pairwise_add_v2i32 (dread v2i32 %vec_int32x2)) vec_int64x1 = __builtin_mpl_vector_pairwise_add_v2i32(vec_int32x2); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_int32x2 0 (intrinsicop v2i32 vector_pairwise_add_v4i16 (dread v4i16 %vec_int16x4)) @@ -465,7 +465,7 @@ void intrinsics() { // CHECK-NEXT: dassign %vec_int16x4 0 (intrinsicop v4i16 vector_pairwise_add_v8i8 (dread v8i8 %vec_int8x8)) vec_int16x4 = __builtin_mpl_vector_pairwise_add_v8i8(vec_int8x8); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_uint64x1 0 (intrinsicop f64 vector_pairwise_add_v2u32 (dread v2u32 %vec_uint32x2)) + // CHECK-NEXT: dassign %vec_uint64x1 0 (intrinsicop u64 vector_pairwise_add_v2u32 (dread v2u32 %vec_uint32x2)) vec_uint64x1 = __builtin_mpl_vector_pairwise_add_v2u32(vec_uint32x2); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_uint32x2 0 (intrinsicop v2u32 vector_pairwise_add_v4u16 (dread v4u16 %vec_uint16x4)) @@ -505,7 +505,7 @@ void intrinsics() { // CHECK-NEXT: dassign %vec_float32x4 0 (intrinsicop v4f32 vector_reverse_v4f32 (dread v4f32 %vec_float32x4)) vec_float32x4 = __builtin_mpl_vector_reverse_v4f32(vec_float32x4); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop f64 vector_reverse_v1i64 (dread f64 %vec_int64x1)) + // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop i64 vector_reverse_v1i64 (dread i64 %vec_int64x1)) vec_int64x1 = __builtin_mpl_vector_reverse_v1i64(vec_int64x1); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_int32x2 0 (intrinsicop v2i32 vector_reverse_v2i32 (dread v2i32 %vec_int32x2)) @@ -517,7 +517,7 @@ void intrinsics() { // CHECK-NEXT: dassign %vec_int8x8 0 (intrinsicop v8i8 vector_reverse_v8i8 (dread v8i8 %vec_int8x8)) vec_int8x8 = __builtin_mpl_vector_reverse_v8i8(vec_int8x8); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_uint64x1 0 (intrinsicop f64 vector_reverse_v1u64 (dread f64 %vec_uint64x1)) + // CHECK-NEXT: dassign %vec_uint64x1 0 (intrinsicop u64 vector_reverse_v1u64 (dread u64 %vec_uint64x1)) vec_uint64x1 = __builtin_mpl_vector_reverse_v1u64(vec_uint64x1); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_uint32x2 0 (intrinsicop v2u32 vector_reverse_v2u32 (dread v2u32 %vec_uint32x2)) @@ -566,9 +566,6 @@ void intrinsics() { // CHECK-NEXT: dassign %scalar_float32 0 (intrinsicop f32 vector_sum_v4f32 (dread v4f32 %vec_float32x4)) scalar_float32 = __builtin_mpl_vector_sum_v4f32(vec_float32x4); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %scalar_int64 0 (intrinsicop i64 vector_sum_v1i64 (dread f64 %vec_int64x1)) - scalar_int64 = __builtin_mpl_vector_sum_v1i64(vec_int64x1); - // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %scalar_int32 0 (intrinsicop i32 vector_sum_v2i32 (dread v2i32 %vec_int32x2)) scalar_int32 = __builtin_mpl_vector_sum_v2i32(vec_int32x2); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} @@ -578,9 +575,6 @@ void intrinsics() { // CHECK-NEXT: dassign %scalar_int8 0 (intrinsicop i8 vector_sum_v8i8 (dread v8i8 %vec_int8x8)) scalar_int8 = __builtin_mpl_vector_sum_v8i8(vec_int8x8); // CHECK: LOC [[# 
FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %scalar_uint64 0 (intrinsicop u64 vector_sum_v1u64 (dread f64 %vec_uint64x1)) - scalar_uint64 = __builtin_mpl_vector_sum_v1u64(vec_uint64x1); - // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %scalar_uint32 0 (intrinsicop u32 vector_sum_v2u32 (dread v2u32 %vec_uint32x2)) scalar_uint32 = __builtin_mpl_vector_sum_v2u32(vec_uint32x2); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} @@ -590,9 +584,6 @@ void intrinsics() { // CHECK-NEXT: dassign %scalar_uint8 0 (intrinsicop u8 vector_sum_v8u8 (dread v8u8 %vec_uint8x8)) scalar_uint8 = __builtin_mpl_vector_sum_v8u8(vec_uint8x8); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %scalar_float64 0 (intrinsicop f64 vector_sum_v1f64 (dread f64 %vec_float64x1)) - scalar_float64 = __builtin_mpl_vector_sum_v1f64(vec_float64x1); - // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %scalar_float32 0 (intrinsicop f32 vector_sum_v2f32 (dread v2f32 %vec_float32x2)) scalar_float32 = __builtin_mpl_vector_sum_v2f32(vec_float32x2); @@ -627,7 +618,7 @@ void intrinsics() { // CHECK-NEXT: dassign %vec_float32x4 0 (intrinsicop v4f32 vector_table_lookup_v4f32 (dread v4f32 %vec_float32x4, dread v4f32 %vec_float32x4)) vec_float32x4 = __builtin_mpl_vector_table_lookup_v4f32(vec_float32x4, vec_float32x4); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop f64 vector_table_lookup_v1i64 (dread f64 %vec_int64x1, dread f64 %vec_int64x1)) + // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop i64 vector_table_lookup_v1i64 (dread i64 %vec_int64x1, dread i64 %vec_int64x1)) vec_int64x1 = __builtin_mpl_vector_table_lookup_v1i64(vec_int64x1, vec_int64x1); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_int32x2 0 (intrinsicop v2i32 vector_table_lookup_v2i32 (dread v2i32 %vec_int32x2, dread v2i32 %vec_int32x2)) @@ -639,7 +630,7 @@ void intrinsics() { // CHECK-NEXT: dassign %vec_int8x8 0 (intrinsicop v8i8 vector_table_lookup_v8i8 (dread v8i8 %vec_int8x8, dread v8i8 %vec_int8x8)) vec_int8x8 = __builtin_mpl_vector_table_lookup_v8i8(vec_int8x8, vec_int8x8); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_uint64x1 0 (intrinsicop f64 vector_table_lookup_v1u64 (dread f64 %vec_uint64x1, dread f64 %vec_uint64x1)) + // CHECK-NEXT: dassign %vec_uint64x1 0 (intrinsicop u64 vector_table_lookup_v1u64 (dread u64 %vec_uint64x1, dread u64 %vec_uint64x1)) vec_uint64x1 = __builtin_mpl_vector_table_lookup_v1u64(vec_uint64x1, vec_uint64x1); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_uint32x2 0 (intrinsicop v2u32 vector_table_lookup_v2u32 (dread v2u32 %vec_uint32x2, dread v2u32 %vec_uint32x2)) @@ -688,7 +679,7 @@ void intrinsics() { // CHECK-NEXT: dassign %vec_float32x4 0 (iread v4f32 <* f32> 0 (dread a64 %ptr)) vec_float32x4 = __builtin_mpl_vector_load_v4f32(ptr); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_int64x1 0 (iread f64 <* i64> 0 (dread a64 %ptr)) + // CHECK-NEXT: dassign %vec_int64x1 0 (iread i64 <* i64> 0 (dread a64 %ptr)) vec_int64x1 = __builtin_mpl_vector_load_v1i64(ptr); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_int32x2 0 (iread v2i32 <* i32> 0 (dread a64 %ptr)) @@ -700,7 +691,7 @@ void intrinsics() { // CHECK-NEXT: dassign %vec_int8x8 0 (iread v8i8 <* i8> 0 (dread a64 %ptr)) vec_int8x8 = __builtin_mpl_vector_load_v8i8(ptr); // CHECK: LOC [[# 
FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_uint64x1 0 (iread f64 <* u64> 0 (dread a64 %ptr)) + // CHECK-NEXT: dassign %vec_uint64x1 0 (iread u64 <* u64> 0 (dread a64 %ptr)) vec_uint64x1 = __builtin_mpl_vector_load_v1u64(ptr); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_uint32x2 0 (iread v2u32 <* u32> 0 (dread a64 %ptr)) @@ -749,7 +740,7 @@ void intrinsics() { // CHECK-NEXT: iassign <* f32> 0 (dread a64 %ptr, dread v4f32 %vec_float32x4) __builtin_mpl_vector_store_v4f32(ptr, vec_float32x4); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: iassign <* i64> 0 (dread a64 %ptr, dread f64 %vec_int64x1) + // CHECK-NEXT: iassign <* i64> 0 (dread a64 %ptr, dread i64 %vec_int64x1) __builtin_mpl_vector_store_v1i64(ptr, vec_int64x1); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: iassign <* i32> 0 (dread a64 %ptr, dread v2i32 %vec_int32x2) @@ -761,7 +752,7 @@ void intrinsics() { // CHECK-NEXT: iassign <* i8> 0 (dread a64 %ptr, dread v8i8 %vec_int8x8) __builtin_mpl_vector_store_v8i8(ptr, vec_int8x8); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: iassign <* u64> 0 (dread a64 %ptr, dread f64 %vec_uint64x1) + // CHECK-NEXT: iassign <* u64> 0 (dread a64 %ptr, dread u64 %vec_uint64x1) __builtin_mpl_vector_store_v1u64(ptr, vec_uint64x1); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: iassign <* u32> 0 (dread a64 %ptr, dread v2u32 %vec_uint32x2) @@ -786,31 +777,4 @@ void intrinsics() { // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: retype v2u64 (dread v16u8 %vec_uint8x16) (uint64x2_t)vec_uint8x16; - - // ****** Temporary builtins: These will be replaced with standard ops - - // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_uint16x8 0 (intrinsicop v8u16 vector_and_v8u16 (dread v8u16 %vec_uint16x8, dread v8u16 %vec_uint16x8)) - vec_uint16x8 = __builtin_mpl_vector_and_v8u16(vec_uint16x8, vec_uint16x8); - // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_int32x4 0 (intrinsicop v4i32 vector_and_v4i32 (dread v4i32 %vec_int32x4, dread v4i32 %vec_int32x4)) - vec_int32x4 = __builtin_mpl_vector_and_v4i32(vec_int32x4, vec_int32x4); - // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_uint16x8 0 (intrinsicop v8u16 vector_eq_v8u16 (dread v8u16 %vec_uint16x8, dread v8u16 %vec_uint16x8)) - vec_uint16x8 = __builtin_mpl_vector_eq_v8u16(vec_uint16x8, vec_uint16x8); - // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_uint32x4 0 (intrinsicop v4u32 vector_xor_v4u32 (dread v4u32 %vec_uint32x4, dread v4u32 %vec_uint32x4)) - vec_uint32x4 = __builtin_mpl_vector_xor_v4u32(vec_uint32x4, vec_uint32x4); - // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_uint64x2 0 (intrinsicop v2u64 vector_xor_v2u64 (dread v2u64 %vec_uint64x2, dread v2u64 %vec_uint64x2)) - vec_uint64x2 = __builtin_mpl_vector_xor_v2u64(vec_uint64x2, vec_uint64x2); - // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_uint16x8 0 (intrinsicop v8u16 vector_shl_v8u16 (dread v8u16 %vec_uint16x8, dread v8i16 %vec_int16x8)) - vec_uint16x8 = __builtin_mpl_vector_shl_v8u16(vec_uint16x8, vec_int16x8); - // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_uint64x2 0 (intrinsicop v2u64 vector_shli_v2u64 (dread v2u64 %vec_uint64x2, constval i32 2)) - vec_uint64x2 = __builtin_mpl_vector_shli_v2u64(vec_uint64x2, 2); - // CHECK: LOC [[# 
FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_uint64x2 0 (intrinsicop v2u64 vector_shri_v2u64 (dread v2u64 %vec_uint64x2, constval i32 2)) - vec_uint64x2 = __builtin_mpl_vector_shri_v2u64(vec_uint64x2, 2); } -- Gitee
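Usage sketch (illustrative only, not part of either patch): a minimal example of how C source is expected to call the intrinsic macros added in patch 1; the function and variable names below are hypothetical.

#include <arm_neon.h>

/* Load two int32x4_t vectors, add them element-wise with vaddq_s32 (which
 * expands to (a + b)), and reduce the sum to a scalar with vaddvq_s32
 * (which expands to __builtin_mpl_vector_sum_v4i32). */
int32_t add_and_reduce(const int32_t *a, const int32_t *b) {
  int32x4_t va = vld1q_s32(a);  /* __builtin_mpl_vector_load_v4i32 */
  int32x4_t vb = vld1q_s32(b);
  int32x4_t sum = vaddq_s32(va, vb);
  return vaddvq_s32(sum);
}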