From 9111bf0b9354c87f4dd897ee9cd1cb35a6431acb Mon Sep 17 00:00:00 2001 From: Alfred Huang Date: Sat, 3 Jul 2021 16:31:11 -0700 Subject: [PATCH 1/2] Enhance arm_neon.h for all datatypes Relevant intrinsics are defined for all datatypes. Revert to using u64/i64 for the single-element vector types. --- Clang2MapleVisitor.cpp | 4 +- sys/include/arm_neon.h | 668 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 627 insertions(+), 45 deletions(-) diff --git a/Clang2MapleVisitor.cpp b/Clang2MapleVisitor.cpp index 1b8cb11..0b69e20 100644 --- a/Clang2MapleVisitor.cpp +++ b/Clang2MapleVisitor.cpp @@ -3602,7 +3602,7 @@ TyIdx Clang2MapleVisitor::type2MplIdx(clang::QualType QT, bool needComplete) { switch (ElemTy->GetPrimType()) { case PTY_i64: if (NumElements == 1) { - TI = GlobalTables::GetTypeTable().GetPrimType(PTY_f64)->GetTypeIndex(); + TI = GlobalTables::GetTypeTable().GetPrimType(PTY_i64)->GetTypeIndex(); } else if (NumElements == 2) { TI = GlobalTables::GetTypeTable().GetPrimType(PTY_v2i64)->GetTypeIndex(); @@ -3650,7 +3650,7 @@ TyIdx Clang2MapleVisitor::type2MplIdx(clang::QualType QT, bool needComplete) { break; case PTY_u64: if (NumElements == 1) { - TI = GlobalTables::GetTypeTable().GetPrimType(PTY_f64)->GetTypeIndex(); + TI = GlobalTables::GetTypeTable().GetPrimType(PTY_u64)->GetTypeIndex(); } else if (NumElements == 2) { TI = GlobalTables::GetTypeTable().GetPrimType(PTY_v2u64)->GetTypeIndex(); diff --git a/sys/include/arm_neon.h b/sys/include/arm_neon.h index ddeeb48..35e0146 100644 --- a/sys/include/arm_neon.h +++ b/sys/include/arm_neon.h @@ -260,15 +260,12 @@ uint16_t __builtin_mpl_vector_sum_v8u16(uint16x8_t); uint8_t __builtin_mpl_vector_sum_v16u8(uint8x16_t); float64_t __builtin_mpl_vector_sum_v2f64(float64x2_t); float32_t __builtin_mpl_vector_sum_v4f32(float32x4_t); -int64_t __builtin_mpl_vector_sum_v1i64(int64x1_t); int32_t __builtin_mpl_vector_sum_v2i32(int32x2_t); int16_t __builtin_mpl_vector_sum_v4i16(int16x4_t); int8_t __builtin_mpl_vector_sum_v8i8(int8x8_t); -uint64_t __builtin_mpl_vector_sum_v1u64(uint64x1_t); uint32_t __builtin_mpl_vector_sum_v2u32(uint32x2_t); uint16_t __builtin_mpl_vector_sum_v4u16(uint16x4_t); uint8_t __builtin_mpl_vector_sum_v8u8(uint8x8_t); -float64_t __builtin_mpl_vector_sum_v1f64(float64x1_t); float32_t __builtin_mpl_vector_sum_v2f32(float32x2_t); // vecTy table_lookup(vecTy tbl, vecTy idx) @@ -340,56 +337,641 @@ void __builtin_mpl_vector_store_v8u8(uint8_t *, uint8x8_t); void __builtin_mpl_vector_store_v1f64(float64_t *, float64x1_t); void __builtin_mpl_vector_store_v2f32(float32_t *, float32x2_t); -// Temporary builtins that should be replaced by standard ops.
-uint16x8_t __builtin_mpl_vector_and_v8u16(uint16x8_t, uint16x8_t); -int32x4_t __builtin_mpl_vector_and_v4i32(int32x4_t, int32x4_t); -uint16x8_t __builtin_mpl_vector_eq_v8u16(uint16x8_t, uint16x8_t); -uint16x8_t __builtin_mpl_vector_shl_v8u16(uint16x8_t, int16x8_t); -uint64x2_t __builtin_mpl_vector_shli_v2u64(uint64x2_t, const int); -uint64x2_t __builtin_mpl_vector_shri_v2u64(uint64x2_t, const int); -uint32x4_t __builtin_mpl_vector_xor_v4u32(uint32x4_t, uint32x4_t); -uint64x2_t __builtin_mpl_vector_xor_v2u64(uint64x2_t, uint64x2_t); +// ************************* +// Supported Neon Intrinsics +// ************************* +// vaddv +#define vaddv_s8(a) __builtin_mpl_vector_sum_v8i8(a) +#define vaddv_s16(a) __builtin_mpl_vector_sum_v4i16(a) +#define vaddv_s32(a) __builtin_mpl_vector_sum_v2i32(a) +#define vaddv_u8(a) __builtin_mpl_vector_sum_v8u8(a) +#define vaddv_u16(a) __builtin_mpl_vector_sum_v4u16(a) +#define vaddv_u32(a) __builtin_mpl_vector_sum_v2u32(a) +#define vaddv_f32(a) __builtin_mpl_vector_sum_v2f32(a) +#define vaddvq_s8(a) __builtin_mpl_vector_sum_v16i8(a) +#define vaddvq_s16(a) __builtin_mpl_vector_sum_v8i16(a) +#define vaddvq_s32(a) __builtin_mpl_vector_sum_v4i32(a) +#define vaddvq_s64(a) __builtin_mpl_vector_sum_v2i64(a) +#define vaddvq_u8(a) __builtin_mpl_vector_sum_v16u8(a) #define vaddvq_u16(a) __builtin_mpl_vector_sum_v8u16(a) -#define vandq_u16(a, b) __builtin_mpl_vector_and_v8u16(a, b) // (a & b) -#define vandq_s32(a, b) __builtin_mpl_vector_and_v4i32(a, b) // (a & b) -#define vdupq_n_s32(value) __builtin_mpl_vector_from_scalar_v4i32(value) -#define vdupq_n_u16(value) __builtin_mpl_vector_from_scalar_v8u16(value) -#define vdupq_n_u8(value) __builtin_mpl_vector_from_scalar_v16u8(value) -#define vdup_n_u32(value) __builtin_mpl_vector_from_scalar_v2u32(value) -#define vceqq_u16(a, b) __builtin_mpl_vector_eq_v8u16(a, b) // (a == b) -#define veorq_u32(a, b) __builtin_mpl_vector_xor_v4u32(a, b) // (a ^ b) -#define veorq_u64(a, b) __builtin_mpl_vector_xor_v2u64(a, b) // (a ^ b) +#define vaddvq_u32(a) __builtin_mpl_vector_sum_v4u32(a) +#define vaddvq_u64(a) __builtin_mpl_vector_sum_v2u64(a) +#define vaddvq_f32(a) __builtin_mpl_vector_sum_v4f32(a) +#define vaddvq_f64(a) __builtin_mpl_vector_sum_v2f64(a) + +// vadd +#define vadd_s8(a, b) (a + b) +#define vadd_s16(a, b) (a + b) +#define vadd_s32(a, b) (a + b) +#define vadd_s64(a, b) (a + b) +#define vadd_u8(a, b) (a + b) +#define vadd_u16(a, b) (a + b) +#define vadd_u32(a, b) (a + b) +#define vadd_u64(a, b) (a + b) +#define vadd_f16(a, b) (a + b) +#define vadd_f32(a, b) (a + b) +#define vadd_f64(a, b) (a + b) +#define vaddq_s8(a, b) (a + b) +#define vaddq_s16(a, b) (a + b) +#define vaddq_s32(a, b) (a + b) +#define vaddq_s64(a, b) (a + b) +#define vaddq_u8(a, b) (a + b) +#define vaddq_u16(a, b) (a + b) +#define vaddq_u32(a, b) (a + b) +#define vaddq_u64(a, b) (a + b) +#define vaddq_f16(a, b) (a + b) +#define vaddq_f32(a, b) (a + b) +#define vaddq_f64(a, b) (a + b) + +// vand +#define vand_s8(a, b) (a & b) +#define vand_s16(a, b) (a & b) +#define vand_s32(a, b) (a & b) +#define vand_s64(a, b) (a & b) +#define vand_u8(a, b) (a & b) +#define vand_u16(a, b) (a & b) +#define vand_u32(a, b) (a & b) +#define vand_u64(a, b) (a & b) +#define vandq_s8(a, b) (a & b) +#define vandq_s16(a, b) (a & b) +#define vandq_s32(a, b) (a & b) +#define vandq_s64(a, b) (a & b) +#define vandq_u8(a, b) (a & b) +#define vandq_u16(a, b) (a & b) +#define vandq_u32(a, b) (a & b) +#define vandq_u64(a, b) (a & b) + +// vdup +#define vdup_n_s8(a)
__builtin_mpl_vector_from_scalar_v8i8(a) +#define vdup_n_s16(a) __builtin_mpl_vector_from_scalar_v4i16(a) +#define vdup_n_s32(a) __builtin_mpl_vector_from_scalar_v2i32(a) +#define vdup_n_s64(a) __builtin_mpl_vector_from_scalar_v1i64(a) +#define vdup_n_u8(a) __builtin_mpl_vector_from_scalar_v8u8(a) +#define vdup_n_u16(a) __builtin_mpl_vector_from_scalar_v4u16(a) +#define vdup_n_u32(a) __builtin_mpl_vector_from_scalar_v2u32(a) +#define vdup_n_u64(a) __builtin_mpl_vector_from_scalar_v1u64(a) +#define vdup_n_f16(a) __builtin_mpl_vector_from_scalar_v4f16(a) +#define vdup_n_f32(a) __builtin_mpl_vector_from_scalar_v2f32(a) +#define vdup_n_f64(a) __builtin_mpl_vector_from_scalar_v1f64(a) +#define vdupq_n_s8(a) __builtin_mpl_vector_from_scalar_v16i8(a) +#define vdupq_n_s16(a) __builtin_mpl_vector_from_scalar_v8i16(a) +#define vdupq_n_s32(a) __builtin_mpl_vector_from_scalar_v4i32(a) +#define vdupq_n_s64(a) __builtin_mpl_vector_from_scalar_v2i64(a) +#define vdupq_n_u8(a) __builtin_mpl_vector_from_scalar_v16u8(a) +#define vdupq_n_u16(a) __builtin_mpl_vector_from_scalar_v8u16(a) +#define vdupq_n_u32(a) __builtin_mpl_vector_from_scalar_v4u32(a) +#define vdupq_n_u64(a) __builtin_mpl_vector_from_scalar_v2u64(a) +#define vdupq_n_f16(a) __builtin_mpl_vector_from_scalar_v8f16(a) +#define vdupq_n_f32(a) __builtin_mpl_vector_from_scalar_v4f32(a) +#define vdupq_n_f64(a) __builtin_mpl_vector_from_scalar_v2f64(a) + +// vceq +#define vceq_s8(a, b) (a == b) +#define vceq_s16(a, b) (a == b) +#define vceq_s32(a, b) (a == b) +#define vceq_s64(a, b) (a == b) +#define vceq_u8(a, b) (a == b) +#define vceq_u16(a, b) (a == b) +#define vceq_u32(a, b) (a == b) +#define vceq_u64(a, b) (a == b) +#define vceq_f16(a, b) (a == b) +#define vceq_f32(a, b) (a == b) +#define vceq_f64(a, b) (a == b) +#define vceqq_s8(a, b) (a == b) +#define vceqq_s16(a, b) (a == b) +#define vceqq_s32(a, b) (a == b) +#define vceqq_s64(a, b) (a == b) +#define vceqq_u8(a, b) (a == b) +#define vceqq_u16(a, b) (a == b) +#define vceqq_u32(a, b) (a == b) +#define vceqq_u64(a, b) (a == b) +#define vceqq_f16(a, b) (a == b) +#define vceqq_f32(a, b) (a == b) +#define vceqq_f64(a, b) (a == b) + +// vcgt +#define vcgt_s8(a, b) (a > b) +#define vcgt_s16(a, b) (a > b) +#define vcgt_s32(a, b) (a > b) +#define vcgt_s64(a, b) (a > b) +#define vcgt_u8(a, b) (a > b) +#define vcgt_u16(a, b) (a > b) +#define vcgt_u32(a, b) (a > b) +#define vcgt_u64(a, b) (a > b) +#define vcgt_f16(a, b) (a > b) +#define vcgt_f32(a, b) (a > b) +#define vcgt_f64(a, b) (a > b) +#define vcgtq_s8(a, b) (a > b) +#define vcgtq_s16(a, b) (a > b) +#define vcgtq_s32(a, b) (a > b) +#define vcgtq_s64(a, b) (a > b) +#define vcgtq_u8(a, b) (a > b) +#define vcgtq_u16(a, b) (a > b) +#define vcgtq_u32(a, b) (a > b) +#define vcgtq_u64(a, b) (a > b) +#define vcgtq_f16(a, b) (a > b) +#define vcgtq_f32(a, b) (a > b) +#define vcgtq_f64(a, b) (a > b) + +// vcge +#define vcge_s8(a, b) (a >= b) +#define vcge_s16(a, b) (a >= b) +#define vcge_s32(a, b) (a >= b) +#define vcge_s64(a, b) (a >= b) +#define vcge_u8(a, b) (a >= b) +#define vcge_u16(a, b) (a >= b) +#define vcge_u32(a, b) (a >= b) +#define vcge_u64(a, b) (a >= b) +#define vcge_f16(a, b) (a >= b) +#define vcge_f32(a, b) (a >= b) +#define vcge_f64(a, b) (a >= b) +#define vcgeq_s8(a, b) (a >= b) +#define vcgeq_s16(a, b) (a >= b) +#define vcgeq_s32(a, b) (a >= b) +#define vcgeq_s64(a, b) (a >= b) +#define vcgeq_u8(a, b) (a >= b) +#define vcgeq_u16(a, b) (a >= b) +#define vcgeq_u32(a, b) (a >= b) +#define vcgeq_u64(a, b) (a >= b) +#define vcgeq_f16(a, b) (a >= b) 
+#define vcgeq_f32(a, b) (a >= b) +#define vcgeq_f64(a, b) (a >= b) + +// vclt +#define vclt_s8(a, b) (a < b) +#define vclt_s16(a, b) (a < b) +#define vclt_s32(a, b) (a < b) +#define vclt_s64(a, b) (a < b) +#define vclt_u8(a, b) (a < b) +#define vclt_u16(a, b) (a < b) +#define vclt_u32(a, b) (a < b) +#define vclt_u64(a, b) (a < b) +#define vclt_f16(a, b) (a < b) +#define vclt_f32(a, b) (a < b) +#define vclt_f64(a, b) (a < b) +#define vcltq_s8(a, b) (a < b) +#define vcltq_s16(a, b) (a < b) +#define vcltq_s32(a, b) (a < b) +#define vcltq_s64(a, b) (a < b) +#define vcltq_u8(a, b) (a < b) +#define vcltq_u16(a, b) (a < b) +#define vcltq_u32(a, b) (a < b) +#define vcltq_u64(a, b) (a < b) +#define vcltq_f16(a, b) (a < b) +#define vcltq_f32(a, b) (a < b) +#define vcltq_f64(a, b) (a < b) + +// vcle +#define vcle_s8(a, b) (a <= b) +#define vcle_s16(a, b) (a <= b) +#define vcle_s32(a, b) (a <= b) +#define vcle_s64(a, b) (a <= b) +#define vcle_u8(a, b) (a <= b) +#define vcle_u16(a, b) (a <= b) +#define vcle_u32(a, b) (a <= b) +#define vcle_u64(a, b) (a <= b) +#define vcle_f16(a, b) (a <= b) +#define vcle_f32(a, b) (a <= b) +#define vcle_f64(a, b) (a <= b) +#define vcleq_s8(a, b) (a <= b) +#define vcleq_s16(a, b) (a <= b) +#define vcleq_s32(a, b) (a <= b) +#define vcleq_s64(a, b) (a <= b) +#define vcleq_u8(a, b) (a <= b) +#define vcleq_u16(a, b) (a <= b) +#define vcleq_u32(a, b) (a <= b) +#define vcleq_u64(a, b) (a <= b) +#define vcleq_f16(a, b) (a <= b) +#define vcleq_f32(a, b) (a <= b) +#define vcleq_f64(a, b) (a <= b) + +// veor +#define veor_s8(a, b) (a ^ b) +#define veor_s16(a, b) (a ^ b) +#define veor_s32(a, b) (a ^ b) +#define veor_s64(a, b) (a ^ b) +#define veor_u8(a, b) (a ^ b) +#define veor_u16(a, b) (a ^ b) +#define veor_u32(a, b) (a ^ b) +#define veor_u64(a, b) (a ^ b) +#define veorq_s8(a, b) (a ^ b) +#define veorq_s16(a, b) (a ^ b) +#define veorq_s32(a, b) (a ^ b) +#define veorq_s64(a, b) (a ^ b) +#define veorq_u8(a, b) (a ^ b) +#define veorq_u16(a, b) (a ^ b) +#define veorq_u32(a, b) (a ^ b) +#define veorq_u64(a, b) (a ^ b) + +// vext +#define vext_s8(a, b, n) __builtin_mpl_vector_merge_v8i8(a, b, n) +#define vext_s16(a, b, n) __builtin_mpl_vector_merge_v4i16(a, b, n) +#define vext_s32(a, b, n) __builtin_mpl_vector_merge_v2i32(a, b, n) +#define vext_s64(a, b, n) __builtin_mpl_vector_merge_v1i64(a, b, n) +#define vext_u8(a, b, n) __builtin_mpl_vector_merge_v8u8(a, b, n) +#define vext_u16(a, b, n) __builtin_mpl_vector_merge_v4u16(a, b, n) +#define vext_u32(a, b, n) __builtin_mpl_vector_merge_v2u32(a, b, n) +#define vext_u64(a, b, n) __builtin_mpl_vector_merge_v1u64(a, b, n) +#define vext_f16(a, b, n) __builtin_mpl_vector_merge_v4f16(a, b, n) +#define vext_f32(a, b, n) __builtin_mpl_vector_merge_v2f32(a, b, n) +#define vext_f64(a, b, n) __builtin_mpl_vector_merge_v1f64(a, b, n) +#define vextq_s8(a, b, n) __builtin_mpl_vector_merge_v16i8(a, b, n) +#define vextq_s16(a, b, n) __builtin_mpl_vector_merge_v8i16(a, b, n) +#define vextq_s32(a, b, n) __builtin_mpl_vector_merge_v4i32(a, b, n) +#define vextq_s64(a, b, n) __builtin_mpl_vector_merge_v2i64(a, b, n) #define vextq_u8(a, b, n) __builtin_mpl_vector_merge_v16u8(a, b, n) #define vextq_u16(a, b, n) __builtin_mpl_vector_merge_v8u16(a, b, n) +#define vextq_u32(a, b, n) __builtin_mpl_vector_merge_v4u32(a, b, n) +#define vextq_u64(a, b, n) __builtin_mpl_vector_merge_v2u64(a, b, n) +#define vextq_f16(a, b, n) __builtin_mpl_vector_merge_v8f16(a, b, n) +#define vextq_f32(a, b, n) __builtin_mpl_vector_merge_v4f32(a, b, n) +#define vextq_f64(a, b, n) 
__builtin_mpl_vector_merge_v2f64(a, b, n) + +// vget_high +#define vget_high_s8(a) __builtin_mpl_vector_get_high_v16i8(a) +#define vget_high_s16(a) __builtin_mpl_vector_get_high_v8i16(a) +#define vget_high_s32(a) __builtin_mpl_vector_get_high_v4i32(a) +#define vget_high_s64(a) __builtin_mpl_vector_get_high_v2i64(a) +#define vget_high_u8(a) __builtin_mpl_vector_get_high_v16u8(a) +#define vget_high_u16(a) __builtin_mpl_vector_get_high_v8u16(a) +#define vget_high_u32(a) __builtin_mpl_vector_get_high_v4u32(a) #define vget_high_u64(a) __builtin_mpl_vector_get_high_v2u64(a) +#define vget_high_f16(a) __builtin_mpl_vector_get_high_v8f16(a) +#define vget_high_f32(a) __builtin_mpl_vector_get_high_v4f32(a) +#define vget_high_f64(a) __builtin_mpl_vector_get_high_v2f64(a) + +// vget_lane +#define vget_lane_s8(a, n) __builtin_mpl_vector_get_element_v8i8(a, n) +#define vget_lane_s16(a, n) __builtin_mpl_vector_get_element_v4i16(a, n) +#define vget_lane_s32(a, n) __builtin_mpl_vector_get_element_v2i32(a, n) +#define vget_lane_s64(a, n) __builtin_mpl_vector_get_element_v1i64(a, n) +#define vget_lane_u8(a, n) __builtin_mpl_vector_get_element_v8u8(a, n) +#define vget_lane_u16(a, n) __builtin_mpl_vector_get_element_v4u16(a, n) +#define vget_lane_u32(a, n) __builtin_mpl_vector_get_element_v2u32(a, n) +#define vget_lane_u64(a, n) __builtin_mpl_vector_get_element_v1u64(a, n) +#define vget_lane_f16(a, n) __builtin_mpl_vector_get_element_v4f16(a, n) +#define vget_lane_f32(a, n) __builtin_mpl_vector_get_element_v2f32(a, n) +#define vget_lane_f64(a, n) __builtin_mpl_vector_get_element_v1f64(a, n) +#define vgetq_lane_s8(a, n) __builtin_mpl_vector_get_element_v16i8(a, n) +#define vgetq_lane_s16(a, n) __builtin_mpl_vector_get_element_v8i16(a, n) +#define vgetq_lane_s32(a, n) __builtin_mpl_vector_get_element_v4i32(a, n) +#define vgetq_lane_s64(a, n) __builtin_mpl_vector_get_element_v2i64(a, n) +#define vgetq_lane_u8(a, n) __builtin_mpl_vector_get_element_v16u8(a, n) +#define vgetq_lane_u16(a, n) __builtin_mpl_vector_get_element_v8u16(a, n) +#define vgetq_lane_u32(a, n) __builtin_mpl_vector_get_element_v4u32(a, n) +#define vgetq_lane_u64(a, n) __builtin_mpl_vector_get_element_v2u64(a, n) +#define vgetq_lane_f16(a, n) __builtin_mpl_vector_get_element_v8f16(a, n) +#define vgetq_lane_f32(a, n) __builtin_mpl_vector_get_element_v4f32(a, n) +#define vgetq_lane_f64(a, n) __builtin_mpl_vector_get_element_v2f64(a, n) + +// vget_low +#define vget_low_s8(a) __builtin_mpl_vector_get_low_v16i8(a) +#define vget_low_s16(a) __builtin_mpl_vector_get_low_v8i16(a) +#define vget_low_s32(a) __builtin_mpl_vector_get_low_v4i32(a) +#define vget_low_s64(a) __builtin_mpl_vector_get_low_v2i64(a) +#define vget_low_u8(a) __builtin_mpl_vector_get_low_v16u8(a) +#define vget_low_u16(a) __builtin_mpl_vector_get_low_v8u16(a) +#define vget_low_u32(a) __builtin_mpl_vector_get_low_v4u32(a) #define vget_low_u64(a) __builtin_mpl_vector_get_low_v2u64(a) -#define vget_lane_u32(vec, lane) \ - __builtin_mpl_vector_get_element_v2u32(vec, lane) -#define vgetq_lane_u32(vec, lane) \ - __builtin_mpl_vector_get_element_v4u32(vec, lane) -#define vgetq_lane_u16(vec, lane) \ - __builtin_mpl_vector_get_element_v8u16(vec, lane) -#define vld1q_u8(ptr) __builtin_mpl_vector_load_v16u8(ptr) -#define vld1q_u16(ptr) __builtin_mpl_vector_load_v8u16(ptr) -#define vld1q_s32(ptr) __builtin_mpl_vector_load_v4i32(ptr) -#define vld1q_u32(ptr) __builtin_mpl_vector_load_v4u32(ptr) -#define vmlal_u32(accum, s1, s2) __builtin_mpl_vector_madd_v2u32(accum, s1, s2) +#define vget_low_f16(a)
__builtin_mpl_vector_get_low_v8f16(a) +#define vget_low_f32(a) __builtin_mpl_vector_get_low_v4f32(a) +#define vget_low_f64(a) __builtin_mpl_vector_get_low_v2f64(a) + +// vld1 +#define vld1_s8(a) __builtin_mpl_vector_load_v8i8(a) +#define vld1_s16(a) __builtin_mpl_vector_load_v4i16(a) +#define vld1_s32(a) __builtin_mpl_vector_load_v2i32(a) +#define vld1_s64(a) __builtin_mpl_vector_load_v1i64(a) +#define vld1_u8(a) __builtin_mpl_vector_load_v8u8(a) +#define vld1_u16(a) __builtin_mpl_vector_load_v4u16(a) +#define vld1_u32(a) __builtin_mpl_vector_load_v2u32(a) +#define vld1_u64(a) __builtin_mpl_vector_load_v1u64(a) +#define vld1_f16(a) __builtin_mpl_vector_load_v4f16(a) +#define vld1_f32(a) __builtin_mpl_vector_load_v2f32(a) +#define vld1_f64(a) __builtin_mpl_vector_load_v1f64(a) +#define vld1q_s8(a) __builtin_mpl_vector_load_v16i8(a) +#define vld1q_s16(a) __builtin_mpl_vector_load_v8i16(a) +#define vld1q_s32(a) __builtin_mpl_vector_load_v4i32(a) +#define vld1q_s64(a) __builtin_mpl_vector_load_v2i64(a) +#define vld1q_u8(a) __builtin_mpl_vector_load_v16u8(a) +#define vld1q_u16(a) __builtin_mpl_vector_load_v8u16(a) +#define vld1q_u32(a) __builtin_mpl_vector_load_v4u32(a) +#define vld1q_u64(a) __builtin_mpl_vector_load_v2u64(a) +#define vld1q_f16(a) __builtin_mpl_vector_load_v8f16(a) +#define vld1q_f32(a) __builtin_mpl_vector_load_v4f32(a) +#define vld1q_f64(a) __builtin_mpl_vector_load_v2f64(a) + +// vmlal +#define vmlal_s8(acc, a, b) __builtin_mpl_vector_madd_v8i8(acc, a, b) +#define vmlal_s16(acc, a, b) __builtin_mpl_vector_madd_v4i16(acc, a, b) +#define vmlal_s32(acc, a, b) __builtin_mpl_vector_madd_v2i32(acc, a, b) +#define vmlal_u8(acc, a, b) __builtin_mpl_vector_madd_v8u8(acc, a, b) +#define vmlal_u16(acc, a, b) __builtin_mpl_vector_madd_v4u16(acc, a, b) +#define vmlal_u32(acc, a, b) __builtin_mpl_vector_madd_v2u32(acc, a, b) + +// vmull +#define vmull_s8(a, b) __builtin_mpl_vector_mul_v8i8(a, b) +#define vmull_s16(a, b) __builtin_mpl_vector_mul_v4i16(a, b) +#define vmull_s32(a, b) __builtin_mpl_vector_mul_v2i32(a, b) +#define vmull_u8(a, b) __builtin_mpl_vector_mul_v8u8(a, b) +#define vmull_u16(a, b) __builtin_mpl_vector_mul_v4u16(a, b) #define vmull_u32(a, b) __builtin_mpl_vector_mul_v2u32(a, b) + +// vor +#define vor_s8(a, b) (a | b) +#define vor_s16(a, b) (a | b) +#define vor_s32(a, b) (a | b) +#define vor_s64(a, b) (a | b) +#define vor_u8(a, b) (a | b) +#define vor_u16(a, b) (a | b) +#define vor_u32(a, b) (a | b) +#define vor_u64(a, b) (a | b) +#define vorq_s8(a, b) (a | b) +#define vorq_s16(a, b) (a | b) +#define vorq_s32(a, b) (a | b) +#define vorq_s64(a, b) (a | b) +#define vorq_u8(a, b) (a | b) +#define vorq_u16(a, b) (a | b) +#define vorq_u32(a, b) (a | b) +#define vorq_u64(a, b) (a | b) + +// vpaddl +#define vpaddl_s8(a) __builtin_mpl_vector_pairwise_add_v8i8(a) +#define vpaddl_s16(a) __builtin_mpl_vector_pairwise_add_v4i16(a) +#define vpaddl_s32(a) __builtin_mpl_vector_pairwise_add_v2i32(a) +#define vpaddl_u8(a) __builtin_mpl_vector_pairwise_add_v8u8(a) +#define vpaddl_u16(a) __builtin_mpl_vector_pairwise_add_v4u16(a) +#define vpaddl_u32(a) __builtin_mpl_vector_pairwise_add_v2u32(a) +#define vpaddlq_s8(a) __builtin_mpl_vector_pairwise_add_v16i8(a) +#define vpaddlq_s16(a) __builtin_mpl_vector_pairwise_add_v8i16(a) +#define vpaddlq_s32(a) __builtin_mpl_vector_pairwise_add_v4i32(a) +#define vpaddlq_u8(a) __builtin_mpl_vector_pairwise_add_v16u8(a) #define vpaddlq_u16(a) __builtin_mpl_vector_pairwise_add_v8u16(a) #define vpaddlq_u32(a) __builtin_mpl_vector_pairwise_add_v4u32(a)
-#define vqtbl1q_u8(t, idx) __builtin_mpl_vector_table_lookup_v16u8(t, idx) + +// vqtbl1 +#define vqtbl1_s8(a, b) __builtin_mpl_vector_table_lookup_v8i8(a, b) +#define vqtbl1_u8(a, b) __builtin_mpl_vector_table_lookup_v8u8(a, b) +#define vqtbl1q_s8(a, b) __builtin_mpl_vector_table_lookup_v16i8(a, b) +#define vqtbl1q_u8(a, b) __builtin_mpl_vector_table_lookup_v16u8(a, b) + +// vreinterpret 8 +#define vreinterpret_s16_s8(a) ((int16x4_t)a) +#define vreinterpret_s32_s8(a) ((int32x2_t)a) +#define vreinterpret_s64_s8(a) ((int64x1_t)a) +#define vreinterpret_u16_u8(a) ((uint16x4_t)a) +#define vreinterpret_u32_u8(a) ((uint32x2_t)a) +#define vreinterpret_u64_u8(a) ((uint64x1_t)a) +#define vreinterpret_f16_s8(a) ((float16x4_t)a) +#define vreinterpret_f32_s8(a) ((float32x2_t)a) +#define vreinterpret_f64_s8(a) ((float64x1_t)a) +#define vreinterpret_f16_u8(a) ((float16x4_t)a) +#define vreinterpret_f32_u8(a) ((float32x2_t)a) +#define vreinterpret_f64_u8(a) ((float64x1_t)a) +#define vreinterpretq_s16_s8(a) ((int16x8_t)a) +#define vreinterpretq_s32_s8(a) ((int32x4_t)a) +#define vreinterpretq_s64_s8(a) ((int64x2_t)a) +#define vreinterpretq_u16_u8(a) ((uint16x8_t)a) +#define vreinterpretq_u32_u8(a) ((uint32x4_t)a) #define vreinterpretq_u64_u8(a) ((uint64x2_t)a) +#define vreinterpretq_f16_s8(a) ((float16x8_t)a) +#define vreinterpretq_f32_s8(a) ((float32x4_t)a) +#define vreinterpretq_f64_s8(a) ((float64x2_t)a) +#define vreinterpretq_f16_u8(a) ((float16x8_t)a) +#define vreinterpretq_f32_u8(a) ((float32x4_t)a) +#define vreinterpretq_f64_u8(a) ((float64x2_t)a) + +// vreinterpret 16 +#define vreinterpret_s8_s16(a) ((int8x8_t)a) +#define vreinterpret_s32_s16(a) ((int32x2_t)a) +#define vreinterpret_s64_s16(a) ((int64x1_t)a) +#define vreinterpret_u8_u16(a) ((uint8x8_t)a) +#define vreinterpret_u32_u16(a) ((uint32x2_t)a) +#define vreinterpret_u64_u16(a) ((uint64x1_t)a) +#define vreinterpret_f16_s16(a) ((float16x4_t)a) +#define vreinterpret_f32_s16(a) ((float32x2_t)a) +#define vreinterpret_f64_s16(a) ((float64x1_t)a) +#define vreinterpret_f16_u16(a) ((float16x4_t)a) +#define vreinterpret_f32_u16(a) ((float32x2_t)a) +#define vreinterpret_f64_u16(a) ((float64x1_t)a) +#define vreinterpretq_s8_s16(a) ((int8x16_t)a) +#define vreinterpretq_s32_s16(a) ((int32x4_t)a) +#define vreinterpretq_s64_s16(a) ((int64x2_t)a) +#define vreinterpretq_u8_u16(a) ((uint8x16_t)a) +#define vreinterpretq_u32_u16(a) ((uint32x4_t)a) +#define vreinterpretq_u64_u16(a) ((uint64x2_t)a) +#define vreinterpretq_f16_s16(a) ((float16x8_t)a) +#define vreinterpretq_f32_s16(a) ((float32x4_t)a) +#define vreinterpretq_f64_s16(a) ((float64x2_t)a) +#define vreinterpretq_f16_u16(a) ((float16x8_t)a) +#define vreinterpretq_f32_u16(a) ((float32x4_t)a) +#define vreinterpretq_f64_u16(a) ((float64x2_t)a) + +// vreinterpret 32 +#define vreinterpret_s8_s32(a) ((int8x8_t)a) +#define vreinterpret_s16_s32(a) ((int16x4_t)a) +#define vreinterpret_s64_s32(a) ((int64x1_t)a) +#define vreinterpret_u8_u32(a) ((uint8x8_t)a) +#define vreinterpret_u16_u32(a) ((uint16x4_t)a) +#define vreinterpret_u64_u32(a) ((uint64x1_t)a) +#define vreinterpret_f16_s32(a) ((float16x4_t)a) +#define vreinterpret_f32_s32(a) ((float32x2_t)a) +#define vreinterpret_f64_s32(a) ((float64x1_t)a) +#define vreinterpret_f16_u32(a) ((float16x4_t)a) +#define vreinterpret_f32_u32(a) ((float32x2_t)a) +#define vreinterpret_f64_u32(a) ((float64x1_t)a) +#define vreinterpretq_s8_s32(a) ((int8x16_t)a) +#define vreinterpretq_s16_s32(a) ((int16x8_t)a) +#define vreinterpretq_s64_s32(a) ((int64x2_t)a) +#define
vreinterpretq_u8_u32(a) ((uint8x16_t)a) +#define vreinterpretq_u16_u32(a) ((uint16x8_t)a) +#define vreinterpretq_u64_u32(a) ((uint64x2_t)a) +#define vreinterpretq_f16_s32(a) ((float16x8_t)a) +#define vreinterpretq_f32_s32(a) ((float32x4_t)a) +#define vreinterpretq_f64_s32(a) ((float64x2_t)a) +#define vreinterpretq_f16_u32(a) ((float16x8_t)a) +#define vreinterpretq_f32_u32(a) ((float32x4_t)a) +#define vreinterpretq_f64_u32(a) ((float64x2_t)a) + +// vreinterpret 64 +#define vreinterpret_s8_s64(a) ((int8x8_t)a) +#define vreinterpret_s16_s64(a) ((int16x4_t)a) +#define vreinterpret_s32_s64(a) ((int32x2_t)a) +#define vreinterpret_u8_u64(a) ((uint8x8_t)a) +#define vreinterpret_u16_u64(a) ((uint16x4_t)a) +#define vreinterpret_u32_u64(a) ((uint32x2_t)a) +#define vreinterpret_f16_s64(a) ((float16x4_t)a) +#define vreinterpret_f32_s64(a) ((float32x2_t)a) +#define vreinterpret_f64_s64(a) ((float64x1_t)a) +#define vreinterpret_f16_u64(a) ((float16x4_t)a) +#define vreinterpret_f32_u64(a) ((float32x2_t)a) +#define vreinterpret_f64_u64(a) ((float64x1_t)a) +#define vreinterpretq_s8_s64(a) ((int8x16_t)a) +#define vreinterpretq_s16_s64(a) ((int16x8_t)a) +#define vreinterpretq_s32_s64(a) ((int32x4_t)a) +#define vreinterpretq_u8_u64(a) ((uint8x16_t)a) +#define vreinterpretq_u16_u64(a) ((uint16x8_t)a) #define vreinterpretq_u32_u64(a) ((uint32x4_t)a) -#define vrev32q_u8(vec) __builtin_mpl_vector_reverse_v16u8(vec) -#define vsetq_lane_u32(value, vec, lane) \ - __builtin_mpl_vector_set_element_v4u32(value, vec, lane) -#define vsetq_lane_u16(value, vec, lane) \ - __builtin_mpl_vector_set_element_v8u16(value, vec, lane) -#define vshlq_u16(a, b) __builtin_mpl_vector_shl_v8u16(a, b) // (a << b) -#define vshlq_n_u64(a, n) __builtin_mpl_vector_shli_v2u64(a, n) // (a << n) -#define vshrq_n_u64(a, n) __builtin_mpl_vector_shri_v2u64(a, n) // (a >> n) -#define vst1q_s32(ptr, val) __builtin_mpl_vector_store_v4i32(ptr, val) -#define vst1q_u8(ptr, val) __builtin_mpl_vector_store_v16u8(ptr, val) +#define vreinterpretq_f16_s64(a) ((float16x8_t)a) +#define vreinterpretq_f32_s64(a) ((float32x4_t)a) +#define vreinterpretq_f64_s64(a) ((float64x2_t)a) +#define vreinterpretq_f16_u64(a) ((float16x8_t)a) +#define vreinterpretq_f32_u64(a) ((float32x4_t)a) +#define vreinterpretq_f64_u64(a) ((float64x2_t)a) + +// vrev32 +#define vrev32_s8(a) __builtin_mpl_vector_reverse_v8i8(a) +#define vrev32_s16(a) __builtin_mpl_vector_reverse_v4i16(a) +#define vrev32_u8(a) __builtin_mpl_vector_reverse_v8u8(a) +#define vrev32_u16(a) __builtin_mpl_vector_reverse_v4u16(a) +#define vrev32q_s8(a) __builtin_mpl_vector_reverse_v16i8(a) +#define vrev32q_s16(a) __builtin_mpl_vector_reverse_v8i16(a) +#define vrev32q_u8(a) __builtin_mpl_vector_reverse_v16u8(a) +#define vrev32q_u16(a) __builtin_mpl_vector_reverse_v8u16(a) + +// vset_lane +#define vset_lane_s8(v, a, n) __builtin_mpl_vector_set_element_v8i8(v, a, n) +#define vset_lane_s16(v, a, n) __builtin_mpl_vector_set_element_v4i16(v, a, n) +#define vset_lane_s32(v, a, n) __builtin_mpl_vector_set_element_v2i32(v, a, n) +#define vset_lane_s64(v, a, n) __builtin_mpl_vector_set_element_v1i64(v, a, n) +#define vset_lane_u8(v, a, n) __builtin_mpl_vector_set_element_v8u8(v, a, n) +#define vset_lane_u16(v, a, n) __builtin_mpl_vector_set_element_v4u16(v, a, n) +#define vset_lane_u32(v, a, n) __builtin_mpl_vector_set_element_v2u32(v, a, n) +#define vset_lane_u64(v, a, n) __builtin_mpl_vector_set_element_v1u64(v, a, n) +#define vset_lane_f16(v, a, n) __builtin_mpl_vector_set_element_v4f16(v, a, n) +#define vset_lane_f32(v,
a, n) __builtin_mpl_vector_set_element_v2f32(v, a, n) +#define vset_lane_f64(v, a, n) __builtin_mpl_vector_set_element_v1f64(v, a, n) +#define vsetq_lane_s8(v, a, n) __builtin_mpl_vector_set_element_v16i8(v, a, n) +#define vsetq_lane_s16(v, a, n) __builtin_mpl_vector_set_element_v8i16(v, a, n) +#define vsetq_lane_s32(v, a, n) __builtin_mpl_vector_set_element_v4i32(v, a, n) +#define vsetq_lane_s64(v, a, n) __builtin_mpl_vector_set_element_v2i64(v, a, n) +#define vsetq_lane_u8(v, a, n) __builtin_mpl_vector_set_element_v16u8(v, a, n) +#define vsetq_lane_u16(v, a, n) __builtin_mpl_vector_set_element_v8u16(v, a, n) +#define vsetq_lane_u32(v, a, n) __builtin_mpl_vector_set_element_v4u32(v, a, n) +#define vsetq_lane_u64(v, a, n) __builtin_mpl_vector_set_element_v2u64(v, a, n) +#define vsetq_lane_f16(v, a, n) __builtin_mpl_vector_set_element_v8f16(v, a, n) +#define vsetq_lane_f32(v, a, n) __builtin_mpl_vector_set_element_v4f32(v, a, n) +#define vsetq_lane_f64(v, a, n) __builtin_mpl_vector_set_element_v2f64(v, a, n) + +// vshl +#define vshl_s8(a, b) (a << b) +#define vshl_s16(a, b) (a << b) +#define vshl_s32(a, b) (a << b) +#define vshl_s64(a, b) (a << b) +#define vshl_u8(a, b) (a << b) +#define vshl_u16(a, b) (a << b) +#define vshl_u32(a, b) (a << b) +#define vshl_u64(a, b) (a << b) +#define vshlq_s8(a, b) (a << b) +#define vshlq_s16(a, b) (a << b) +#define vshlq_s32(a, b) (a << b) +#define vshlq_s64(a, b) (a << b) +#define vshlq_u8(a, b) (a << b) +#define vshlq_u16(a, b) (a << b) +#define vshlq_u32(a, b) (a << b) +#define vshlq_u64(a, b) (a << b) + +// vshl_n +#define vshl_n_s8(a, n) (a << n) +#define vshl_n_s16(a, n) (a << n) +#define vshl_n_s32(a, n) (a << n) +#define vshl_n_s64(a, n) (a << n) +#define vshl_n_u8(a, n) (a << n) +#define vshl_n_u16(a, n) (a << n) +#define vshl_n_u32(a, n) (a << n) +#define vshl_n_u64(a, n) (a << n) +#define vshlq_n_s8(a, n) (a << n) +#define vshlq_n_s16(a, n) (a << n) +#define vshlq_n_s32(a, n) (a << n) +#define vshlq_n_s64(a, n) (a << n) +#define vshlq_n_u8(a, n) (a << n) +#define vshlq_n_u16(a, n) (a << n) +#define vshlq_n_u32(a, n) (a << n) +#define vshlq_n_u64(a, n) (a << n) + +// vshr +#define vshr_s8(a, b) (a >> b) +#define vshr_s16(a, b) (a >> b) +#define vshr_s32(a, b) (a >> b) +#define vshr_s64(a, b) (a >> b) +#define vshr_u8(a, b) (a >> b) +#define vshr_u16(a, b) (a >> b) +#define vshr_u32(a, b) (a >> b) +#define vshr_u64(a, b) (a >> b) +#define vshrq_s8(a, b) (a >> b) +#define vshrq_s16(a, b) (a >> b) +#define vshrq_s32(a, b) (a >> b) +#define vshrq_s64(a, b) (a >> b) +#define vshrq_u8(a, b) (a >> b) +#define vshrq_u16(a, b) (a >> b) +#define vshrq_u32(a, b) (a >> b) +#define vshrq_u64(a, b) (a >> b) + +// vshr_n +#define vshr_n_s8(a, n) (a >> n) +#define vshr_n_s16(a, n) (a >> n) +#define vshr_n_s32(a, n) (a >> n) +#define vshr_n_s64(a, n) (a >> n) +#define vshr_n_u8(a, n) (a >> n) +#define vshr_n_u16(a, n) (a >> n) +#define vshr_n_u32(a, n) (a >> n) +#define vshr_n_u64(a, n) (a >> n) +#define vshrq_n_s8(a, n) (a >> n) +#define vshrq_n_s16(a, n) (a >> n) +#define vshrq_n_s32(a, n) (a >> n) +#define vshrq_n_s64(a, n) (a >> n) +#define vshrq_n_u8(a, n) (a >> n) +#define vshrq_n_u16(a, n) (a >> n) +#define vshrq_n_u32(a, n) (a >> n) +#define vshrq_n_u64(a, n) (a >> n) + +// vst1 +#define vst1_s8(p, v) __builtin_mpl_vector_store_v8i8(p, v) +#define vst1_s16(p, v) __builtin_mpl_vector_store_v4i16(p, v) +#define vst1_s32(p, v) __builtin_mpl_vector_store_v2i32(p, v) +#define vst1_s64(p, v) __builtin_mpl_vector_store_v1i64(p, v) +#define vst1_u8(p, v) 
__builtin_mpl_vector_store_v8u8(p, v) +#define vst1_u16(p, v) __builtin_mpl_vector_store_v4u16(p, v) +#define vst1_u32(p, v) __builtin_mpl_vector_store_v2u32(p, v) +#define vst1_u64(p, v) __builtin_mpl_vector_store_v1u64(p, v) +#define vst1_f16(p, v) __builtin_mpl_vector_store_v4f16(p, v) +#define vst1_f32(p, v) __builtin_mpl_vector_store_v2f32(p, v) +#define vst1_f64(p, v) __builtin_mpl_vector_store_v1f64(p, v) +#define vst1q_s8(p, v) __builtin_mpl_vector_store_v16i8(p, v) +#define vst1q_s16(p, v) __builtin_mpl_vector_store_v8i16(p, v) +#define vst1q_s32(p, v) __builtin_mpl_vector_store_v4i32(p, v) +#define vst1q_s64(p, v) __builtin_mpl_vector_store_v2i64(p, v) +#define vst1q_u8(p, v) __builtin_mpl_vector_store_v16u8(p, v) +#define vst1q_u16(p, v) __builtin_mpl_vector_store_v8u16(p, v) +#define vst1q_u32(p, v) __builtin_mpl_vector_store_v4u32(p, v) +#define vst1q_u64(p, v) __builtin_mpl_vector_store_v2u64(p, v) +#define vst1q_f16(p, v) __builtin_mpl_vector_store_v8f16(p, v) +#define vst1q_f32(p, v) __builtin_mpl_vector_store_v4f32(p, v) +#define vst1q_f64(p, v) __builtin_mpl_vector_store_v2f64(p, v) + +// vsub +#define vsub_s8(a, b) (a - b) +#define vsub_s16(a, b) (a - b) +#define vsub_s32(a, b) (a - b) +#define vsub_s64(a, b) (a - b) +#define vsub_u8(a, b) (a - b) +#define vsub_u16(a, b) (a - b) +#define vsub_u32(a, b) (a - b) +#define vsub_u64(a, b) (a - b) +#define vsub_f16(a, b) (a - b) +#define vsub_f32(a, b) (a - b) +#define vsub_f64(a, b) (a - b) +#define vsubq_s8(a, b) (a - b) +#define vsubq_s16(a, b) (a - b) +#define vsubq_s32(a, b) (a - b) +#define vsubq_s64(a, b) (a - b) +#define vsubq_u8(a, b) (a - b) +#define vsubq_u16(a, b) (a - b) +#define vsubq_u32(a, b) (a - b) +#define vsubq_u64(a, b) (a - b) +#define vsubq_f16(a, b) (a - b) +#define vsubq_f32(a, b) (a - b) +#define vsubq_f64(a, b) (a - b) #endif /* __ARM_NEON_H */ -- Gitee From a91a06d6bde1a1af22a24e0828440a6063d5fbab Mon Sep 17 00:00:00 2001 From: Brice Dobry Date: Sun, 4 Jul 2021 09:39:00 -0400 Subject: [PATCH 2/2] Update tests to use i64/u64 for 1-elem vectors Also removes testing of intrinsics which have been removed. 
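For example (taken from the updated CHECK lines in the diff below), the expected Maple output for a 1-element signed 64-bit vector changes from

  // CHECK-NEXT: var %vec_int64x1 f64
  // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop f64 vector_from_scalar_v1i64 (dread i64 %scalar_int64))

to

  // CHECK-NEXT: var %vec_int64x1 i64
  // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop i64 vector_from_scalar_v1i64 (dread i64 %scalar_int64))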
--- test/vector.c | 84 +++++++++++++++------------------------------------ 1 file changed, 24 insertions(+), 60 deletions(-) diff --git a/test/vector.c b/test/vector.c index 4e668de..8d79841 100644 --- a/test/vector.c +++ b/test/vector.c @@ -50,7 +50,7 @@ void intrinsics() { // CHECK-NEXT: var %vec_int32x4 v4i32 int32x4_t vec_int32x4; // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: var %vec_int64x1 f64 + // CHECK-NEXT: var %vec_int64x1 i64 int64x1_t vec_int64x1; // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: var %vec_int64x2 v2i64 @@ -74,7 +74,7 @@ void intrinsics() { // CHECK-NEXT: var %vec_uint32x4 v4u32 uint32x4_t vec_uint32x4; // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: var %vec_uint64x1 f64 + // CHECK-NEXT: var %vec_uint64x1 u64 uint64x1_t vec_uint64x1; // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: var %vec_uint64x2 v2u64 @@ -124,7 +124,7 @@ void intrinsics() { // CHECK-NEXT: dassign %vec_float32x4 0 (intrinsicop v4f32 vector_from_scalar_v4f32 (dread f32 %scalar_float32)) vec_float32x4 = __builtin_mpl_vector_from_scalar_v4f32(scalar_float32); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop f64 vector_from_scalar_v1i64 (dread i64 %scalar_int64)) + // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop i64 vector_from_scalar_v1i64 (dread i64 %scalar_int64)) vec_int64x1 = __builtin_mpl_vector_from_scalar_v1i64(scalar_int64); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_int32x2 0 (intrinsicop v2i32 vector_from_scalar_v2i32 (dread i32 %scalar_int32)) @@ -136,7 +136,7 @@ void intrinsics() { // CHECK-NEXT: dassign %vec_int8x8 0 (intrinsicop v8i8 vector_from_scalar_v8i8 (dread i32 %scalar_int8)) vec_int8x8 = __builtin_mpl_vector_from_scalar_v8i8(scalar_int8); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_uint64x1 0 (intrinsicop f64 vector_from_scalar_v1u64 (dread u64 %scalar_uint64)) + // CHECK-NEXT: dassign %vec_uint64x1 0 (intrinsicop u64 vector_from_scalar_v1u64 (dread u64 %scalar_uint64)) vec_uint64x1 = __builtin_mpl_vector_from_scalar_v1u64(scalar_uint64); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_uint32x2 0 (intrinsicop v2u32 vector_from_scalar_v2u32 (dread u32 %scalar_uint32)) @@ -223,7 +223,7 @@ void intrinsics() { // CHECK-NEXT: dassign %vec_float32x4 0 (intrinsicop v4f32 vector_merge_v4f32 (dread v4f32 %vec_float32x4, dread v4f32 %vec_float32x4, dread i32 %scalar_int32)) vec_float32x4 = __builtin_mpl_vector_merge_v4f32(vec_float32x4, vec_float32x4, scalar_int32); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop f64 vector_merge_v1i64 (dread f64 %vec_int64x1, dread f64 %vec_int64x1, dread i32 %scalar_int32)) + // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop i64 vector_merge_v1i64 (dread i64 %vec_int64x1, dread i64 %vec_int64x1, dread i32 %scalar_int32)) vec_int64x1 = __builtin_mpl_vector_merge_v1i64(vec_int64x1, vec_int64x1, scalar_int32); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_int32x2 0 (intrinsicop v2i32 vector_merge_v2i32 (dread v2i32 %vec_int32x2, dread v2i32 %vec_int32x2, dread i32 %scalar_int32)) @@ -235,7 +235,7 @@ void intrinsics() { // CHECK-NEXT: dassign %vec_int8x8 0 (intrinsicop v8i8 vector_merge_v8i8 (dread v8i8 %vec_int8x8, dread v8i8 %vec_int8x8, dread i32 %scalar_int32)) vec_int8x8 = __builtin_mpl_vector_merge_v8i8(vec_int8x8, vec_int8x8, 
scalar_int32); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_uint64x1 0 (intrinsicop f64 vector_merge_v1u64 (dread f64 %vec_uint64x1, dread f64 %vec_uint64x1, dread i32 %scalar_int32)) + // CHECK-NEXT: dassign %vec_uint64x1 0 (intrinsicop u64 vector_merge_v1u64 (dread u64 %vec_uint64x1, dread u64 %vec_uint64x1, dread i32 %scalar_int32)) vec_uint64x1 = __builtin_mpl_vector_merge_v1u64(vec_uint64x1, vec_uint64x1, scalar_int32); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_uint32x2 0 (intrinsicop v2u32 vector_merge_v2u32 (dread v2u32 %vec_uint32x2, dread v2u32 %vec_uint32x2, dread i32 %scalar_int32)) @@ -254,7 +254,7 @@ void intrinsics() { vec_float32x2 = __builtin_mpl_vector_merge_v2f32(vec_float32x2, vec_float32x2, scalar_int32); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop f64 vector_get_low_v2i64 (dread v2i64 %vec_int64x2)) + // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop i64 vector_get_low_v2i64 (dread v2i64 %vec_int64x2)) vec_int64x1 = __builtin_mpl_vector_get_low_v2i64(vec_int64x2); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_int32x2 0 (intrinsicop v2i32 vector_get_low_v4i32 (dread v4i32 %vec_int32x4)) @@ -266,7 +266,7 @@ void intrinsics() { // CHECK-NEXT: dassign %vec_int8x8 0 (intrinsicop v8i8 vector_get_low_v16i8 (dread v16i8 %vec_int8x16)) vec_int8x8 = __builtin_mpl_vector_get_low_v16i8(vec_int8x16); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_uint64x1 0 (intrinsicop f64 vector_get_low_v2u64 (dread v2u64 %vec_uint64x2)) + // CHECK-NEXT: dassign %vec_uint64x1 0 (intrinsicop u64 vector_get_low_v2u64 (dread v2u64 %vec_uint64x2)) vec_uint64x1 = __builtin_mpl_vector_get_low_v2u64(vec_uint64x2); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_uint32x2 0 (intrinsicop v2u32 vector_get_low_v4u32 (dread v4u32 %vec_uint32x4)) @@ -285,7 +285,7 @@ void intrinsics() { vec_float32x2 = __builtin_mpl_vector_get_low_v4f32(vec_float32x4); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop f64 vector_get_high_v2i64 (dread v2i64 %vec_int64x2)) + // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop i64 vector_get_high_v2i64 (dread v2i64 %vec_int64x2)) vec_int64x1 = __builtin_mpl_vector_get_high_v2i64(vec_int64x2); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_int32x2 0 (intrinsicop v2i32 vector_get_high_v4i32 (dread v4i32 %vec_int32x4)) @@ -297,7 +297,7 @@ void intrinsics() { // CHECK-NEXT: dassign %vec_int8x8 0 (intrinsicop v8i8 vector_get_high_v16i8 (dread v16i8 %vec_int8x16)) vec_int8x8 = __builtin_mpl_vector_get_high_v16i8(vec_int8x16); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_uint64x1 0 (intrinsicop f64 vector_get_high_v2u64 (dread v2u64 %vec_uint64x2)) + // CHECK-NEXT: dassign %vec_uint64x1 0 (intrinsicop u64 vector_get_high_v2u64 (dread v2u64 %vec_uint64x2)) vec_uint64x1 = __builtin_mpl_vector_get_high_v2u64(vec_uint64x2); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_uint32x2 0 (intrinsicop v2u32 vector_get_high_v4u32 (dread v4u32 %vec_uint32x4)) @@ -346,7 +346,7 @@ void intrinsics() { // CHECK-NEXT: dassign %scalar_float32 0 (intrinsicop f32 vector_get_element_v4f32 (dread v4f32 %vec_float32x4, dread i32 %scalar_int32)) scalar_float32 = __builtin_mpl_vector_get_element_v4f32(vec_float32x4, scalar_int32); // CHECK: LOC 
[[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %scalar_int64 0 (intrinsicop i64 vector_get_element_v1i64 (dread f64 %vec_int64x1, dread i32 %scalar_int32)) + // CHECK-NEXT: dassign %scalar_int64 0 (intrinsicop i64 vector_get_element_v1i64 (dread i64 %vec_int64x1, dread i32 %scalar_int32)) scalar_int64 = __builtin_mpl_vector_get_element_v1i64(vec_int64x1, scalar_int32); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %scalar_int32 0 (intrinsicop i32 vector_get_element_v2i32 (dread v2i32 %vec_int32x2, dread i32 %scalar_int32)) @@ -358,7 +358,7 @@ void intrinsics() { // CHECK-NEXT: dassign %scalar_int8 0 (intrinsicop i8 vector_get_element_v8i8 (dread v8i8 %vec_int8x8, dread i32 %scalar_int32)) scalar_int8 = __builtin_mpl_vector_get_element_v8i8(vec_int8x8, scalar_int32); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %scalar_uint64 0 (intrinsicop u64 vector_get_element_v1u64 (dread f64 %vec_uint64x1, dread i32 %scalar_int32)) + // CHECK-NEXT: dassign %scalar_uint64 0 (intrinsicop u64 vector_get_element_v1u64 (dread u64 %vec_uint64x1, dread i32 %scalar_int32)) scalar_uint64 = __builtin_mpl_vector_get_element_v1u64(vec_uint64x1, scalar_int32); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %scalar_uint32 0 (intrinsicop u32 vector_get_element_v2u32 (dread v2u32 %vec_uint32x2, dread i32 %scalar_int32)) @@ -407,7 +407,7 @@ void intrinsics() { // CHECK-NEXT: dassign %vec_float32x4 0 (intrinsicop v4f32 vector_set_element_v4f32 (dread f32 %scalar_float32, dread v4f32 %vec_float32x4, dread i32 %scalar_int32)) vec_float32x4 = __builtin_mpl_vector_set_element_v4f32(scalar_float32, vec_float32x4, scalar_int32); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop f64 vector_set_element_v1i64 (dread i64 %scalar_int64, dread f64 %vec_int64x1, dread i32 %scalar_int32)) + // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop i64 vector_set_element_v1i64 (dread i64 %scalar_int64, dread i64 %vec_int64x1, dread i32 %scalar_int32)) vec_int64x1 = __builtin_mpl_vector_set_element_v1i64(scalar_int64, vec_int64x1, scalar_int32); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_int32x2 0 (intrinsicop v2i32 vector_set_element_v2i32 (dread i32 %scalar_int32, dread v2i32 %vec_int32x2, dread i32 %scalar_int32)) @@ -419,7 +419,7 @@ void intrinsics() { // CHECK-NEXT: dassign %vec_int8x8 0 (intrinsicop v8i8 vector_set_element_v8i8 (dread i32 %scalar_int8, dread v8i8 %vec_int8x8, dread i32 %scalar_int32)) vec_int8x8 = __builtin_mpl_vector_set_element_v8i8(scalar_int8, vec_int8x8, scalar_int32); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_uint64x1 0 (intrinsicop f64 vector_set_element_v1u64 (dread u64 %scalar_uint64, dread f64 %vec_uint64x1, dread i32 %scalar_int32)) + // CHECK-NEXT: dassign %vec_uint64x1 0 (intrinsicop u64 vector_set_element_v1u64 (dread u64 %scalar_uint64, dread u64 %vec_uint64x1, dread i32 %scalar_int32)) vec_uint64x1 = __builtin_mpl_vector_set_element_v1u64(scalar_uint64, vec_uint64x1, scalar_int32); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_uint32x2 0 (intrinsicop v2u32 vector_set_element_v2u32 (dread u32 %scalar_uint32, dread v2u32 %vec_uint32x2, dread i32 %scalar_int32)) @@ -456,7 +456,7 @@ void intrinsics() { // CHECK-NEXT: dassign %vec_uint16x8 0 (intrinsicop v8u16 vector_pairwise_add_v16u8 (dread v16u8 %vec_uint8x16)) vec_uint16x8 = 
__builtin_mpl_vector_pairwise_add_v16u8(vec_uint8x16); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop f64 vector_pairwise_add_v2i32 (dread v2i32 %vec_int32x2)) + // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop i64 vector_pairwise_add_v2i32 (dread v2i32 %vec_int32x2)) vec_int64x1 = __builtin_mpl_vector_pairwise_add_v2i32(vec_int32x2); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_int32x2 0 (intrinsicop v2i32 vector_pairwise_add_v4i16 (dread v4i16 %vec_int16x4)) @@ -465,7 +465,7 @@ void intrinsics() { // CHECK-NEXT: dassign %vec_int16x4 0 (intrinsicop v4i16 vector_pairwise_add_v8i8 (dread v8i8 %vec_int8x8)) vec_int16x4 = __builtin_mpl_vector_pairwise_add_v8i8(vec_int8x8); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_uint64x1 0 (intrinsicop f64 vector_pairwise_add_v2u32 (dread v2u32 %vec_uint32x2)) + // CHECK-NEXT: dassign %vec_uint64x1 0 (intrinsicop u64 vector_pairwise_add_v2u32 (dread v2u32 %vec_uint32x2)) vec_uint64x1 = __builtin_mpl_vector_pairwise_add_v2u32(vec_uint32x2); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_uint32x2 0 (intrinsicop v2u32 vector_pairwise_add_v4u16 (dread v4u16 %vec_uint16x4)) @@ -505,7 +505,7 @@ void intrinsics() { // CHECK-NEXT: dassign %vec_float32x4 0 (intrinsicop v4f32 vector_reverse_v4f32 (dread v4f32 %vec_float32x4)) vec_float32x4 = __builtin_mpl_vector_reverse_v4f32(vec_float32x4); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop f64 vector_reverse_v1i64 (dread f64 %vec_int64x1)) + // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop i64 vector_reverse_v1i64 (dread i64 %vec_int64x1)) vec_int64x1 = __builtin_mpl_vector_reverse_v1i64(vec_int64x1); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_int32x2 0 (intrinsicop v2i32 vector_reverse_v2i32 (dread v2i32 %vec_int32x2)) @@ -517,7 +517,7 @@ void intrinsics() { // CHECK-NEXT: dassign %vec_int8x8 0 (intrinsicop v8i8 vector_reverse_v8i8 (dread v8i8 %vec_int8x8)) vec_int8x8 = __builtin_mpl_vector_reverse_v8i8(vec_int8x8); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_uint64x1 0 (intrinsicop f64 vector_reverse_v1u64 (dread f64 %vec_uint64x1)) + // CHECK-NEXT: dassign %vec_uint64x1 0 (intrinsicop u64 vector_reverse_v1u64 (dread u64 %vec_uint64x1)) vec_uint64x1 = __builtin_mpl_vector_reverse_v1u64(vec_uint64x1); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_uint32x2 0 (intrinsicop v2u32 vector_reverse_v2u32 (dread v2u32 %vec_uint32x2)) @@ -566,9 +566,6 @@ void intrinsics() { // CHECK-NEXT: dassign %scalar_float32 0 (intrinsicop f32 vector_sum_v4f32 (dread v4f32 %vec_float32x4)) scalar_float32 = __builtin_mpl_vector_sum_v4f32(vec_float32x4); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %scalar_int64 0 (intrinsicop i64 vector_sum_v1i64 (dread f64 %vec_int64x1)) - scalar_int64 = __builtin_mpl_vector_sum_v1i64(vec_int64x1); - // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %scalar_int32 0 (intrinsicop i32 vector_sum_v2i32 (dread v2i32 %vec_int32x2)) scalar_int32 = __builtin_mpl_vector_sum_v2i32(vec_int32x2); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} @@ -578,9 +575,6 @@ void intrinsics() { // CHECK-NEXT: dassign %scalar_int8 0 (intrinsicop i8 vector_sum_v8i8 (dread v8i8 %vec_int8x8)) scalar_int8 = __builtin_mpl_vector_sum_v8i8(vec_int8x8); // CHECK: LOC [[# 
FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %scalar_uint64 0 (intrinsicop u64 vector_sum_v1u64 (dread f64 %vec_uint64x1)) - scalar_uint64 = __builtin_mpl_vector_sum_v1u64(vec_uint64x1); - // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %scalar_uint32 0 (intrinsicop u32 vector_sum_v2u32 (dread v2u32 %vec_uint32x2)) scalar_uint32 = __builtin_mpl_vector_sum_v2u32(vec_uint32x2); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} @@ -590,9 +584,6 @@ void intrinsics() { // CHECK-NEXT: dassign %scalar_uint8 0 (intrinsicop u8 vector_sum_v8u8 (dread v8u8 %vec_uint8x8)) scalar_uint8 = __builtin_mpl_vector_sum_v8u8(vec_uint8x8); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %scalar_float64 0 (intrinsicop f64 vector_sum_v1f64 (dread f64 %vec_float64x1)) - scalar_float64 = __builtin_mpl_vector_sum_v1f64(vec_float64x1); - // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %scalar_float32 0 (intrinsicop f32 vector_sum_v2f32 (dread v2f32 %vec_float32x2)) scalar_float32 = __builtin_mpl_vector_sum_v2f32(vec_float32x2); @@ -627,7 +618,7 @@ void intrinsics() { // CHECK-NEXT: dassign %vec_float32x4 0 (intrinsicop v4f32 vector_table_lookup_v4f32 (dread v4f32 %vec_float32x4, dread v4f32 %vec_float32x4)) vec_float32x4 = __builtin_mpl_vector_table_lookup_v4f32(vec_float32x4, vec_float32x4); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop f64 vector_table_lookup_v1i64 (dread f64 %vec_int64x1, dread f64 %vec_int64x1)) + // CHECK-NEXT: dassign %vec_int64x1 0 (intrinsicop i64 vector_table_lookup_v1i64 (dread i64 %vec_int64x1, dread i64 %vec_int64x1)) vec_int64x1 = __builtin_mpl_vector_table_lookup_v1i64(vec_int64x1, vec_int64x1); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_int32x2 0 (intrinsicop v2i32 vector_table_lookup_v2i32 (dread v2i32 %vec_int32x2, dread v2i32 %vec_int32x2)) @@ -639,7 +630,7 @@ void intrinsics() { // CHECK-NEXT: dassign %vec_int8x8 0 (intrinsicop v8i8 vector_table_lookup_v8i8 (dread v8i8 %vec_int8x8, dread v8i8 %vec_int8x8)) vec_int8x8 = __builtin_mpl_vector_table_lookup_v8i8(vec_int8x8, vec_int8x8); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_uint64x1 0 (intrinsicop f64 vector_table_lookup_v1u64 (dread f64 %vec_uint64x1, dread f64 %vec_uint64x1)) + // CHECK-NEXT: dassign %vec_uint64x1 0 (intrinsicop u64 vector_table_lookup_v1u64 (dread u64 %vec_uint64x1, dread u64 %vec_uint64x1)) vec_uint64x1 = __builtin_mpl_vector_table_lookup_v1u64(vec_uint64x1, vec_uint64x1); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_uint32x2 0 (intrinsicop v2u32 vector_table_lookup_v2u32 (dread v2u32 %vec_uint32x2, dread v2u32 %vec_uint32x2)) @@ -688,7 +679,7 @@ void intrinsics() { // CHECK-NEXT: dassign %vec_float32x4 0 (iread v4f32 <* f32> 0 (dread a64 %ptr)) vec_float32x4 = __builtin_mpl_vector_load_v4f32(ptr); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_int64x1 0 (iread f64 <* i64> 0 (dread a64 %ptr)) + // CHECK-NEXT: dassign %vec_int64x1 0 (iread i64 <* i64> 0 (dread a64 %ptr)) vec_int64x1 = __builtin_mpl_vector_load_v1i64(ptr); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_int32x2 0 (iread v2i32 <* i32> 0 (dread a64 %ptr)) @@ -700,7 +691,7 @@ void intrinsics() { // CHECK-NEXT: dassign %vec_int8x8 0 (iread v8i8 <* i8> 0 (dread a64 %ptr)) vec_int8x8 = __builtin_mpl_vector_load_v8i8(ptr); // CHECK: LOC [[# 
FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_uint64x1 0 (iread f64 <* u64> 0 (dread a64 %ptr)) + // CHECK-NEXT: dassign %vec_uint64x1 0 (iread u64 <* u64> 0 (dread a64 %ptr)) vec_uint64x1 = __builtin_mpl_vector_load_v1u64(ptr); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: dassign %vec_uint32x2 0 (iread v2u32 <* u32> 0 (dread a64 %ptr)) @@ -749,7 +740,7 @@ void intrinsics() { // CHECK-NEXT: iassign <* f32> 0 (dread a64 %ptr, dread v4f32 %vec_float32x4) __builtin_mpl_vector_store_v4f32(ptr, vec_float32x4); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: iassign <* i64> 0 (dread a64 %ptr, dread f64 %vec_int64x1) + // CHECK-NEXT: iassign <* i64> 0 (dread a64 %ptr, dread i64 %vec_int64x1) __builtin_mpl_vector_store_v1i64(ptr, vec_int64x1); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: iassign <* i32> 0 (dread a64 %ptr, dread v2i32 %vec_int32x2) @@ -761,7 +752,7 @@ void intrinsics() { // CHECK-NEXT: iassign <* i8> 0 (dread a64 %ptr, dread v8i8 %vec_int8x8) __builtin_mpl_vector_store_v8i8(ptr, vec_int8x8); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: iassign <* u64> 0 (dread a64 %ptr, dread f64 %vec_uint64x1) + // CHECK-NEXT: iassign <* u64> 0 (dread a64 %ptr, dread u64 %vec_uint64x1) __builtin_mpl_vector_store_v1u64(ptr, vec_uint64x1); // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: iassign <* u32> 0 (dread a64 %ptr, dread v2u32 %vec_uint32x2) @@ -786,31 +777,4 @@ void intrinsics() { // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} // CHECK-NEXT: retype v2u64 (dread v16u8 %vec_uint8x16) (uint64x2_t)vec_uint8x16; - - // ****** Temporary builtins: These will be replaced with standard ops - - // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_uint16x8 0 (intrinsicop v8u16 vector_and_v8u16 (dread v8u16 %vec_uint16x8, dread v8u16 %vec_uint16x8)) - vec_uint16x8 = __builtin_mpl_vector_and_v8u16(vec_uint16x8, vec_uint16x8); - // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_int32x4 0 (intrinsicop v4i32 vector_and_v4i32 (dread v4i32 %vec_int32x4, dread v4i32 %vec_int32x4)) - vec_int32x4 = __builtin_mpl_vector_and_v4i32(vec_int32x4, vec_int32x4); - // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_uint16x8 0 (intrinsicop v8u16 vector_eq_v8u16 (dread v8u16 %vec_uint16x8, dread v8u16 %vec_uint16x8)) - vec_uint16x8 = __builtin_mpl_vector_eq_v8u16(vec_uint16x8, vec_uint16x8); - // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_uint32x4 0 (intrinsicop v4u32 vector_xor_v4u32 (dread v4u32 %vec_uint32x4, dread v4u32 %vec_uint32x4)) - vec_uint32x4 = __builtin_mpl_vector_xor_v4u32(vec_uint32x4, vec_uint32x4); - // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_uint64x2 0 (intrinsicop v2u64 vector_xor_v2u64 (dread v2u64 %vec_uint64x2, dread v2u64 %vec_uint64x2)) - vec_uint64x2 = __builtin_mpl_vector_xor_v2u64(vec_uint64x2, vec_uint64x2); - // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_uint16x8 0 (intrinsicop v8u16 vector_shl_v8u16 (dread v8u16 %vec_uint16x8, dread v8i16 %vec_int16x8)) - vec_uint16x8 = __builtin_mpl_vector_shl_v8u16(vec_uint16x8, vec_int16x8); - // CHECK: LOC [[# FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_uint64x2 0 (intrinsicop v2u64 vector_shli_v2u64 (dread v2u64 %vec_uint64x2, constval i32 2)) - vec_uint64x2 = __builtin_mpl_vector_shli_v2u64(vec_uint64x2, 2); - // CHECK: LOC [[# 
FILENUM]] [[# @LINE + 2 ]]{{$}} - // CHECK-NEXT: dassign %vec_uint64x2 0 (intrinsicop v2u64 vector_shri_v2u64 (dread v2u64 %vec_uint64x2, constval i32 2)) - vec_uint64x2 = __builtin_mpl_vector_shri_v2u64(vec_uint64x2, 2); } -- Gitee
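Usage sketch (illustrative only, not part of either patch): a minimal example of how C source is expected to call the intrinsic macros added in patch 1; the function and variable names below are hypothetical.

#include <arm_neon.h>

/* Load two int32x4_t vectors, add them element-wise with vaddq_s32 (which
 * expands to (a + b)), and reduce the sum to a scalar with vaddvq_s32
 * (which expands to __builtin_mpl_vector_sum_v4i32). */
int32_t add_and_reduce(const int32_t *a, const int32_t *b) {
  int32x4_t va = vld1q_s32(a);  /* __builtin_mpl_vector_load_v4i32 */
  int32x4_t vb = vld1q_s32(b);
  int32x4_t sum = vaddq_s32(va, vb);
  return vaddvq_s32(sum);
}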