From a77c52f63c6e5e58b847f5dc883ffbbfc70db380 Mon Sep 17 00:00:00 2001 From: "Hu, Lin1" Date: Fri, 10 Oct 2025 16:38:12 +0800 Subject: [PATCH] Intel: Backport some patches from release/gcc-12 --- ...386-Change-prefetchi-output-template.patch | 62 +++ ...6-Add-non-optimize-prefetchi-intrins.patch | 92 ++++ ...33-i386-Fix-AVX512-intrin-macro-typo.patch | 268 +++++++++++ ...zero_ps-d-instead-of-_mm_avx512_setz.patch | 46 ++ ...t-Bk-to-define_special_memory_constr.patch | 107 +++++ ..._-move_max-store_max-with-vectorizer.patch | 232 +++++++++ INTEL-0037-Fix-testcase-failure.patch | 120 +++++ ...heck-avx-upper-register-for-parallel.patch | 148 ++++++ ...Fix-vfpclassph-non-optimizied-intrin.patch | 134 ++++++ ...as-option-and-reorder-Intel-CPU-marc.patch | 278 +++++++++++ ...-related-to-combine-vpcmpuw-zero_ext.patch | 416 ++++++++++++++++ ...due-to-isa-mismatch-for-the-builtins.patch | 95 ++++ ...43-Fix-ICE-due-to-subreg-us_truncate.patch | 444 ++++++++++++++++++ ...-32-bit-address-to-64-bit-with-optio.patch | 104 ++++ ...ed-operands-2-in-vec_unpacks_hi_v4sf.patch | 37 ++ INTEL-0046-GCC13-GCC12-Fix-testcase.patch | 34 ++ ...047-i386-Remove-CLDEMOTE-for-clients.patch | 54 +++ INTEL-0048-c-signed-__int128_t-PR108099.patch | 59 +++ ...signed-__int128_t-semantics-PR108099.patch | 78 +++ ...-g-.dg-ext-int128-8.C-testcase-PR109.patch | 34 ++ ...gned-typedef-name-extension-PR108099.patch | 172 +++++++ gcc.spec | 48 +- 22 files changed, 3061 insertions(+), 1 deletion(-) create mode 100644 INTEL-0031-i386-Change-prefetchi-output-template.patch create mode 100644 INTEL-0032-i386-Add-non-optimize-prefetchi-intrins.patch create mode 100644 INTEL-0033-i386-Fix-AVX512-intrin-macro-typo.patch create mode 100644 INTEL-0034-i386-Use-_mm_setzero_ps-d-instead-of-_mm_avx512_setz.patch create mode 100644 INTEL-0035-Refine-constraint-Bk-to-define_special_memory_constr.patch create mode 100644 INTEL-0036-Align-ix86_-move_max-store_max-with-vectorizer.patch create mode 100644 INTEL-0037-Fix-testcase-failure.patch create mode 100644 INTEL-0038-Check-avx-upper-register-for-parallel.patch create mode 100644 INTEL-0039-i386-Fix-vfpclassph-non-optimizied-intrin.patch create mode 100644 INTEL-0040-doc-Add-more-alias-option-and-reorder-Intel-CPU-marc.patch create mode 100644 INTEL-0041-Refine-splitters-related-to-combine-vpcmpuw-zero_ext.patch create mode 100644 INTEL-0042-Fix-ICE-due-to-isa-mismatch-for-the-builtins.patch create mode 100644 INTEL-0043-Fix-ICE-due-to-subreg-us_truncate.patch create mode 100644 INTEL-0044-i386-Zero-extend-32-bit-address-to-64-bit-with-optio.patch create mode 100644 INTEL-0045-Fix-uninitialized-operands-2-in-vec_unpacks_hi_v4sf.patch create mode 100644 INTEL-0046-GCC13-GCC12-Fix-testcase.patch create mode 100644 INTEL-0047-i386-Remove-CLDEMOTE-for-clients.patch create mode 100644 INTEL-0048-c-signed-__int128_t-PR108099.patch create mode 100644 INTEL-0049-c-fix-unsigned-__int128_t-semantics-PR108099.patch create mode 100644 INTEL-0050-testsuite-Fix-up-g-.dg-ext-int128-8.C-testcase-PR109.patch create mode 100644 INTEL-0051-c-fix-unsigned-typedef-name-extension-PR108099.patch diff --git a/INTEL-0031-i386-Change-prefetchi-output-template.patch b/INTEL-0031-i386-Change-prefetchi-output-template.patch new file mode 100644 index 0000000..314c30a --- /dev/null +++ b/INTEL-0031-i386-Change-prefetchi-output-template.patch @@ -0,0 +1,62 @@ +From 59e07c6c77dcc92d274ca6156b8916f66035dce8 Mon Sep 17 00:00:00 2001 +From: Haochen Jiang +Date: Mon, 22 Jul 2024 14:06:18 +0800 +Subject: [PATCH 1/2] i386: Change prefetchi output template + +For prefetchi instructions, RIP-relative address is explicitly mentioned +for operand and assembler obeys that rule strictly. This makes +instruction like: + + prefetchit0 bar + +got illegal for assembler, which should be a broad usage for prefetchi. + +Change to %a to explicitly add (%rip) after function label to make it +legal in assembler so that it could pass to linker to get the real address. + +gcc/ChangeLog: + + * config/i386/i386.md (prefetchi): Change to %a. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/prefetchi-1.c: Check (%rip). + +Reference: +https://gcc.gnu.org/git/?p=gcc.git;a=commit;h= +062e46a813799684c6f900815fd22451d6294ae1 +--- + gcc/config/i386/i386.md | 2 +- + gcc/testsuite/gcc.target/i386/prefetchi-1.c | 4 ++-- + 2 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md +index f08c2cfb1..1b733008e 100644 +--- a/gcc/config/i386/i386.md ++++ b/gcc/config/i386/i386.md +@@ -22917,7 +22917,7 @@ + "TARGET_PREFETCHI && TARGET_64BIT" + { + static const char * const patterns[2] = { +- "prefetchit1\t%0", "prefetchit0\t%0" ++ "prefetchit1\t%a0", "prefetchit0\t%a0" + }; + + int locality = INTVAL (operands[1]); +diff --git a/gcc/testsuite/gcc.target/i386/prefetchi-1.c b/gcc/testsuite/gcc.target/i386/prefetchi-1.c +index 80f25e70e..03dfdc55e 100644 +--- a/gcc/testsuite/gcc.target/i386/prefetchi-1.c ++++ b/gcc/testsuite/gcc.target/i386/prefetchi-1.c +@@ -1,7 +1,7 @@ + /* { dg-do compile { target { ! ia32 } } } */ + /* { dg-options "-mprefetchi -O2" } */ +-/* { dg-final { scan-assembler-times "\[ \\t\]+prefetchit0\[ \\t\]+" 2 } } */ +-/* { dg-final { scan-assembler-times "\[ \\t\]+prefetchit1\[ \\t\]+" 2 } } */ ++/* { dg-final { scan-assembler-times "\[ \\t\]+prefetchit0\[ \\t\]+bar\\(%rip\\)" 2 } } */ ++/* { dg-final { scan-assembler-times "\[ \\t\]+prefetchit1\[ \\t\]+bar\\(%rip\\)" 2 } } */ + + #include + +-- +2.31.1 + diff --git a/INTEL-0032-i386-Add-non-optimize-prefetchi-intrins.patch b/INTEL-0032-i386-Add-non-optimize-prefetchi-intrins.patch new file mode 100644 index 0000000..2a450e8 --- /dev/null +++ b/INTEL-0032-i386-Add-non-optimize-prefetchi-intrins.patch @@ -0,0 +1,92 @@ +From c19afda0ee549d294fd5714c63db24bcd4570d03 Mon Sep 17 00:00:00 2001 +From: Haochen Jiang +Date: Thu, 25 Jul 2024 16:16:05 +0800 +Subject: [PATCH 2/2] i386: Add non-optimize prefetchi intrins + +Under -O0, with the "newly" introduced intrins, the variable will be +transformed as mem instead of the origin symbol_ref. The compiler will +then treat the operand as invalid and turn the operation into nop, which +is not expected. Use macro for non-optimize to keep the variable as +symbol_ref just as how prefetch intrin does. + +gcc/ChangeLog: + + * config/i386/prfchiintrin.h + (_m_prefetchit0): Add macro for non-optimized option. + (_m_prefetchit1): Ditto. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/prefetchi-1b.c: New test. + +Reference: +https://gcc.gnu.org/git/?p=gcc.git;a=commit;h= +b4524c4430ba9771265bd9fc31e69a3f35dfe117 +--- + gcc/config/i386/prfchiintrin.h | 9 +++++++ + gcc/testsuite/gcc.target/i386/prefetchi-1b.c | 26 ++++++++++++++++++++ + 2 files changed, 35 insertions(+) + create mode 100644 gcc/testsuite/gcc.target/i386/prefetchi-1b.c + +diff --git a/gcc/config/i386/prfchiintrin.h b/gcc/config/i386/prfchiintrin.h +index 06deef488..1e3d42dc3 100644 +--- a/gcc/config/i386/prfchiintrin.h ++++ b/gcc/config/i386/prfchiintrin.h +@@ -30,6 +30,7 @@ + + #ifdef __x86_64__ + ++#ifdef __OPTIMIZE__ + extern __inline void + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _m_prefetchit0 (void* __P) +@@ -43,6 +44,14 @@ _m_prefetchit1 (void* __P) + { + __builtin_ia32_prefetchi (__P, 2); + } ++#else ++#define _m_prefetchit0(P) \ ++ __builtin_ia32_prefetchi(P, 3) ++ ++#define _m_prefetchit1(P) \ ++ __builtin_ia32_prefetchi(P, 2) ++ ++#endif + + #endif + +diff --git a/gcc/testsuite/gcc.target/i386/prefetchi-1b.c b/gcc/testsuite/gcc.target/i386/prefetchi-1b.c +new file mode 100644 +index 000000000..93139554d +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/prefetchi-1b.c +@@ -0,0 +1,26 @@ ++/* { dg-do compile { target { ! ia32 } } } */ ++/* { dg-options "-mprefetchi -O0" } */ ++/* { dg-final { scan-assembler-times "\[ \\t\]+prefetchit0\[ \\t\]+bar\\(%rip\\)" 1 } } */ ++/* { dg-final { scan-assembler-times "\[ \\t\]+prefetchit1\[ \\t\]+bar\\(%rip\\)" 1 } } */ ++ ++#include ++ ++int ++bar (int a) ++{ ++ return a + 1; ++} ++ ++int ++foo1 (int b) ++{ ++ _m_prefetchit0 (bar); ++ return bar (b) + 1; ++} ++ ++int ++foo2 (int b) ++{ ++ _m_prefetchit1 (bar); ++ return bar (b) + 1; ++} +-- +2.31.1 + diff --git a/INTEL-0033-i386-Fix-AVX512-intrin-macro-typo.patch b/INTEL-0033-i386-Fix-AVX512-intrin-macro-typo.patch new file mode 100644 index 0000000..d2a7072 --- /dev/null +++ b/INTEL-0033-i386-Fix-AVX512-intrin-macro-typo.patch @@ -0,0 +1,268 @@ +From c511b753a24ba48bbe4cdec5cf98e0f33cdb86ad Mon Sep 17 00:00:00 2001 +From: Haochen Jiang +Date: Thu, 25 Jul 2024 16:12:20 +0800 +Subject: [PATCH 01/14] i386: Fix AVX512 intrin macro typo + +There are several typo in AVX512 intrins macro define. Correct them to solve +errors when compiled with -O0. + +gcc/ChangeLog: + + * config/i386/avx512dqintrin.h + (_mm_mask_fpclass_ss_mask): Correct operand order. + (_mm_mask_fpclass_sd_mask): Ditto. + (_mm256_maskz_reduce_round_ss): Use __builtin_ia32_reducess_mask_round + instead of __builtin_ia32_reducesd_mask_round. + (_mm_reduce_round_sd): Use -1 as mask since it is non-mask. + (_mm_reduce_round_ss): Ditto. + * config/i386/avx512vlbwintrin.h + (_mm256_mask_alignr_epi8): Correct operand usage. + (_mm_mask_alignr_epi8): Ditto. + * config/i386/avx512vlintrin.h (_mm_mask_alignr_epi64): Ditto. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/avx512bw-vpalignr-1b.c: New test. + * gcc.target/i386/avx512dq-vfpclasssd-1b.c: Ditto. + * gcc.target/i386/avx512dq-vfpclassss-1b.c: Ditto. + * gcc.target/i386/avx512dq-vreducesd-1b.c: Ditto. + * gcc.target/i386/avx512dq-vreducess-1b.c: Ditto. + * gcc.target/i386/avx512vl-valignq-1b.c: Ditto. + +(cherry picked from commit 16daeb262af4566e665a941368cb15bc2cba3f07) +--- + gcc/config/i386/avx512dqintrin.h | 16 +++++++++------- + gcc/config/i386/avx512vlbwintrin.h | 4 ++-- + gcc/config/i386/avx512vlintrin.h | 2 +- + .../gcc.target/i386/avx512bw-vpalignr-1b.c | 18 ++++++++++++++++++ + .../gcc.target/i386/avx512dq-vfpclasssd-1b.c | 14 ++++++++++++++ + .../gcc.target/i386/avx512dq-vfpclassss-1b.c | 14 ++++++++++++++ + .../gcc.target/i386/avx512dq-vreducesd-1b.c | 16 ++++++++++++++++ + .../gcc.target/i386/avx512dq-vreducess-1b.c | 16 ++++++++++++++++ + .../gcc.target/i386/avx512vl-valignq-1b.c | 15 +++++++++++++++ + 9 files changed, 105 insertions(+), 10 deletions(-) + create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-vpalignr-1b.c + create mode 100644 gcc/testsuite/gcc.target/i386/avx512dq-vfpclasssd-1b.c + create mode 100644 gcc/testsuite/gcc.target/i386/avx512dq-vfpclassss-1b.c + create mode 100644 gcc/testsuite/gcc.target/i386/avx512dq-vreducesd-1b.c + create mode 100644 gcc/testsuite/gcc.target/i386/avx512dq-vreducess-1b.c + create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-valignq-1b.c + +diff --git a/gcc/config/i386/avx512dqintrin.h b/gcc/config/i386/avx512dqintrin.h +index e924250a4ad..4f9451e949b 100644 +--- a/gcc/config/i386/avx512dqintrin.h ++++ b/gcc/config/i386/avx512dqintrin.h +@@ -2800,11 +2800,11 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm) + ((__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) (__m128d) (X), \ + (int) (C), (__mmask8) (-1))) \ + +-#define _mm_mask_fpclass_ss_mask(X, C, U) \ ++#define _mm_mask_fpclass_ss_mask(U, X, C) \ + ((__mmask8) __builtin_ia32_fpclassss_mask ((__v4sf) (__m128) (X), \ + (int) (C), (__mmask8) (U))) + +-#define _mm_mask_fpclass_sd_mask(X, C, U) \ ++#define _mm_mask_fpclass_sd_mask(U, X, C) \ + ((__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) (__m128d) (X), \ + (int) (C), (__mmask8) (U))) + +@@ -2839,8 +2839,9 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm) + (__mmask8)(U))) + + #define _mm_reduce_round_sd(A, B, C, R) \ +- ((__m128d) __builtin_ia32_reducesd_round ((__v2df)(__m128d)(A), \ +- (__v2df)(__m128d)(B), (int)(C), (__mmask8)(U), (int)(R))) ++ ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), \ ++ (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_avx512_setzero_pd (), \ ++ (__mmask8)(-1), (int)(R))) + + #define _mm_mask_reduce_round_sd(W, U, A, B, C, R) \ + ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), \ +@@ -2867,8 +2868,9 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm) + (__mmask8)(U))) + + #define _mm_reduce_round_ss(A, B, C, R) \ +- ((__m128) __builtin_ia32_reducess_round ((__v4sf)(__m128)(A), \ +- (__v4sf)(__m128)(B), (int)(C), (__mmask8)(U), (int)(R))) ++ ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A), \ ++ (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_avx512_setzero_ps (), \ ++ (__mmask8)(-1), (int)(R))) + + #define _mm_mask_reduce_round_ss(W, U, A, B, C, R) \ + ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A), \ +@@ -2876,7 +2878,7 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm) + (__mmask8)(U), (int)(R))) + + #define _mm_maskz_reduce_round_ss(U, A, B, C, R) \ +- ((__m128) __builtin_ia32_reducesd_mask_round ((__v4sf)(__m128)(A), \ ++ ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), \ + (__mmask8)(U), (int)(R))) + +diff --git a/gcc/config/i386/avx512vlbwintrin.h b/gcc/config/i386/avx512vlbwintrin.h +index 192d54e743f..c918ed520c5 100644 +--- a/gcc/config/i386/avx512vlbwintrin.h ++++ b/gcc/config/i386/avx512vlbwintrin.h +@@ -1839,7 +1839,7 @@ _mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, int __B) + #define _mm256_mask_alignr_epi8(W, U, X, Y, N) \ + ((__m256i) __builtin_ia32_palignr256_mask ((__v4di)(__m256i)(X), \ + (__v4di)(__m256i)(Y), (int)((N) * 8), \ +- (__v4di)(__m256i)(X), (__mmask32)(U))) ++ (__v4di)(__m256i)(W), (__mmask32)(U))) + + #define _mm256_mask_srli_epi16(W, U, A, B) \ + ((__m256i) __builtin_ia32_psrlwi256_mask ((__v16hi)(__m256i)(A), \ +@@ -1922,7 +1922,7 @@ _mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, int __B) + #define _mm_mask_alignr_epi8(W, U, X, Y, N) \ + ((__m128i) __builtin_ia32_palignr128_mask ((__v2di)(__m128i)(X), \ + (__v2di)(__m128i)(Y), (int)((N) * 8), \ +- (__v2di)(__m128i)(X), (__mmask16)(U))) ++ (__v2di)(__m128i)(W), (__mmask16)(U))) + + #define _mm_maskz_alignr_epi8(U, X, Y, N) \ + ((__m128i) __builtin_ia32_palignr128_mask ((__v2di)(__m128i)(X), \ +diff --git a/gcc/config/i386/avx512vlintrin.h b/gcc/config/i386/avx512vlintrin.h +index 26b286eae6b..c6f3f35a009 100644 +--- a/gcc/config/i386/avx512vlintrin.h ++++ b/gcc/config/i386/avx512vlintrin.h +@@ -13609,7 +13609,7 @@ _mm256_permutex_pd (__m256d __X, const int __M) + + #define _mm_mask_alignr_epi64(W, U, X, Y, C) \ + ((__m128i)__builtin_ia32_alignq128_mask ((__v2di)(__m128i)(X), \ +- (__v2di)(__m128i)(Y), (int)(C), (__v2di)(__m128i)(X), (__mmask8)-1)) ++ (__v2di)(__m128i)(Y), (int)(C), (__v2di)(__m128i)(W), (__mmask8)(U))) + + #define _mm_maskz_alignr_epi64(U, X, Y, C) \ + ((__m128i)__builtin_ia32_alignq128_mask ((__v2di)(__m128i)(X), \ +diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-vpalignr-1b.c b/gcc/testsuite/gcc.target/i386/avx512bw-vpalignr-1b.c +new file mode 100644 +index 00000000000..2b42aa90b91 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/avx512bw-vpalignr-1b.c +@@ -0,0 +1,18 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O0 -mavx512bw -mavx512vl" } */ ++/* { dg-final { scan-assembler-times "vpalignr\[ \\t\]+\[^\{\n\]*%ymm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ ++/* { dg-final { scan-assembler-times "vpalignr\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ ++ ++#include ++ ++volatile __m256i y; ++volatile __m128i x; ++volatile __mmask32 m2; ++volatile __mmask16 m3; ++ ++void extern ++avx512bw_test (void) ++{ ++ y = _mm256_mask_alignr_epi8 (y, m2, y, y, 10); ++ x = _mm_mask_alignr_epi8 (x, m3, x, x, 10); ++} +diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-vfpclasssd-1b.c b/gcc/testsuite/gcc.target/i386/avx512dq-vfpclasssd-1b.c +new file mode 100644 +index 00000000000..8c7f96fb7a7 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/avx512dq-vfpclasssd-1b.c +@@ -0,0 +1,14 @@ ++/* { dg-do compile } */ ++/* { dg-options "-mavx512dq -O0" } */ ++/* { dg-final { scan-assembler-times "vfpclasssd\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n^k\]*%k\[0-7\]\{%k\[0-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ ++ ++#include ++ ++volatile __m128d x128; ++volatile __mmask8 m8; ++ ++void extern ++avx512dq_test (void) ++{ ++ m8 = _mm_mask_fpclass_sd_mask (m8, x128, 13); ++} +diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-vfpclassss-1b.c b/gcc/testsuite/gcc.target/i386/avx512dq-vfpclassss-1b.c +new file mode 100644 +index 00000000000..3196fd60d64 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/avx512dq-vfpclassss-1b.c +@@ -0,0 +1,14 @@ ++/* { dg-do compile } */ ++/* { dg-options "-mavx512dq -O0" } */ ++/* { dg-final { scan-assembler-times "vfpclassss\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\[^\n^k\]*%k\[0-7\]\{%k\[0-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ ++ ++#include ++ ++volatile __m128 x128; ++volatile __mmask8 m8; ++ ++void extern ++avx512dq_test (void) ++{ ++ m8 = _mm_mask_fpclass_ss_mask (m8, x128, 13); ++} +diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-vreducesd-1b.c b/gcc/testsuite/gcc.target/i386/avx512dq-vreducesd-1b.c +new file mode 100644 +index 00000000000..9ae8259d373 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/avx512dq-vreducesd-1b.c +@@ -0,0 +1,16 @@ ++/* { dg-do compile } */ ++/* { dg-options "-mavx512dq -O0" } */ ++/* { dg-final { scan-assembler-times "vreducesd\[ \\t\]+\[^\{\n\]*\{sae\}\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ ++ ++#include ++ ++#define IMM 123 ++ ++volatile __m128d x1, x2, xx1, xx2; ++volatile __mmask8 m; ++ ++void extern ++avx512dq_test (void) ++{ ++ xx1 = _mm_reduce_round_sd (xx1, xx2, IMM, _MM_FROUND_NO_EXC); ++} +diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-vreducess-1b.c b/gcc/testsuite/gcc.target/i386/avx512dq-vreducess-1b.c +new file mode 100644 +index 00000000000..47bf48fb617 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/avx512dq-vreducess-1b.c +@@ -0,0 +1,16 @@ ++/* { dg-do compile } */ ++/* { dg-options "-mavx512dq -O0" } */ ++/* { dg-final { scan-assembler-times "vreducess\[ \\t\]+\[^\{\n\]*\{sae\}\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\[^\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ ++ ++#include ++ ++#define IMM 123 ++ ++volatile __m128 x1, x2, xx1, xx2; ++volatile __mmask8 m; ++ ++void extern ++avx512dq_test (void) ++{ ++ xx1 = _mm_reduce_round_ss (xx1, xx2, IMM, _MM_FROUND_NO_EXC); ++} +diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-valignq-1b.c b/gcc/testsuite/gcc.target/i386/avx512vl-valignq-1b.c +new file mode 100644 +index 00000000000..0ab16b27733 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/avx512vl-valignq-1b.c +@@ -0,0 +1,15 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O0 -mavx512vl" } */ ++/* { dg-final { scan-assembler-times "valignq\[ \\t\]+\[^\{\n\]*%xmm\[0-9\]+\{%k\[1-7\]\}(?:\n|\[ \\t\]+#)" 1 } } */ ++ ++#include ++ ++volatile __m256i y; ++volatile __m128i x; ++volatile __mmask8 m; ++ ++void extern ++avx512vl_test (void) ++{ ++ x = _mm_mask_alignr_epi64 (x, m, x, x, 1); ++} +-- +2.31.1 + diff --git a/INTEL-0034-i386-Use-_mm_setzero_ps-d-instead-of-_mm_avx512_setz.patch b/INTEL-0034-i386-Use-_mm_setzero_ps-d-instead-of-_mm_avx512_setz.patch new file mode 100644 index 0000000..0863492 --- /dev/null +++ b/INTEL-0034-i386-Use-_mm_setzero_ps-d-instead-of-_mm_avx512_setz.patch @@ -0,0 +1,46 @@ +From 22584572ff9a1c3256da20f5438cacc6102fa2ac Mon Sep 17 00:00:00 2001 +From: Haochen Jiang +Date: Mon, 29 Jul 2024 14:10:49 +0800 +Subject: [PATCH 02/14] i386: Use _mm_setzero_ps/d instead of + _mm_avx512_setzero_ps/d for GCC13/12 + +In GCC13/12, there is no _mm_avx512_setzero_ps/d since it is introduced +in GCC14. + +gcc/ChangeLog: + + * config/i386/avx512dqintrin.h (_mm_reduce_round_sd): Use + _mm_setzero_pd instead of _mm_avx512_setzero_pd. + (_mm_reduce_round_ss): Use _mm_setzero_ps instead of + _mm_avx512_setzero_ps. + +(cherry picked from commit 77ad22e4eaa97bb10068c6170f53caca77c99392) (gcc-12) +--- + gcc/config/i386/avx512dqintrin.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/gcc/config/i386/avx512dqintrin.h b/gcc/config/i386/avx512dqintrin.h +index 4f9451e949b..e8f8efe3be8 100644 +--- a/gcc/config/i386/avx512dqintrin.h ++++ b/gcc/config/i386/avx512dqintrin.h +@@ -2840,7 +2840,7 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm) + + #define _mm_reduce_round_sd(A, B, C, R) \ + ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), \ +- (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_avx512_setzero_pd (), \ ++ (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), \ + (__mmask8)(-1), (int)(R))) + + #define _mm_mask_reduce_round_sd(W, U, A, B, C, R) \ +@@ -2869,7 +2869,7 @@ _mm512_fpclass_ps_mask (__m512 __A, const int __imm) + + #define _mm_reduce_round_ss(A, B, C, R) \ + ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A), \ +- (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_avx512_setzero_ps (), \ ++ (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), \ + (__mmask8)(-1), (int)(R))) + + #define _mm_mask_reduce_round_ss(W, U, A, B, C, R) \ +-- +2.31.1 + diff --git a/INTEL-0035-Refine-constraint-Bk-to-define_special_memory_constr.patch b/INTEL-0035-Refine-constraint-Bk-to-define_special_memory_constr.patch new file mode 100644 index 0000000..024209e --- /dev/null +++ b/INTEL-0035-Refine-constraint-Bk-to-define_special_memory_constr.patch @@ -0,0 +1,107 @@ +From bdc11c30981f8954249aa534c9b5b2ea51efa042 Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Wed, 24 Jul 2024 11:29:23 +0800 +Subject: [PATCH 03/14] Refine constraint "Bk" to + define_special_memory_constraint. + +For below pattern, RA may still allocate r162 as v/k register, try to +reload for address with leaq __libc_tsd_CTYPE_B@gottpoff(%rip), %rsi +which result a linker error. + +(set (reg:DI 162) + (mem/u/c:DI + (const:DI (unspec:DI + [(symbol_ref:DI ("a") [flags 0x60] )] + UNSPEC_GOTNTPOFF)) + +Quote from H.J for why linker issue an error. +>What do these do: +> +> leaq __libc_tsd_CTYPE_B@gottpoff(%rip), %rax +> vmovq (%rax), %xmm0 +> +>From x86-64 TLS psABI: +> +>The assembler generates for the x@gottpoff(%rip) expressions a R X86 +>64 GOTTPOFF relocation for the symbol x which requests the linker to +>generate a GOT entry with a R X86 64 TPOFF64 relocation. The offset of +>the GOT entry relative to the end of the instruction is then used in +>the instruction. The R X86 64 TPOFF64 relocation is pro- cessed at +>program startup time by the dynamic linker by looking up the symbol x +>in the modules loaded at that point. The offset is written in the GOT +>entry and later loaded by the addq instruction. +> +>The above code sequence looks wrong to me. + +gcc/ChangeLog: + + PR target/116043 + * config/i386/constraints.md (Bk): Refine to + define_special_memory_constraint. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/pr116043.c: New test. + +(cherry picked from commit bc1fda00d5f20e2f3e77a50b2822562b6e0040b2) +--- + gcc/config/i386/constraints.md | 2 +- + gcc/testsuite/gcc.target/i386/pr116043.c | 33 ++++++++++++++++++++++++ + 2 files changed, 34 insertions(+), 1 deletion(-) + create mode 100644 gcc/testsuite/gcc.target/i386/pr116043.c + +diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md +index 7361687632f..e4b66340589 100644 +--- a/gcc/config/i386/constraints.md ++++ b/gcc/config/i386/constraints.md +@@ -187,7 +187,7 @@ + (and (match_operand 0 "memory_operand") + (match_test "constant_address_p (XEXP (op, 0))"))) + +-(define_memory_constraint "Bk" ++(define_special_memory_constraint "Bk" + "@internal TLS address that allows insn using non-integer registers." + (and (match_operand 0 "memory_operand") + (not (match_test "ix86_gpr_tls_address_pattern_p (op)")))) +diff --git a/gcc/testsuite/gcc.target/i386/pr116043.c b/gcc/testsuite/gcc.target/i386/pr116043.c +new file mode 100644 +index 00000000000..76553496c10 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr116043.c +@@ -0,0 +1,33 @@ ++/* { dg-do compile } */ ++/* { dg-options "-mavx512bf16 -O3" } */ ++/* { dg-final { scan-assembler-not {(?n)lea.*@gottpoff} } } */ ++ ++extern __thread int a, c, i, j, k, l; ++int *b; ++struct d { ++ int e; ++} f, g; ++char *h; ++ ++void m(struct d *n) { ++ b = &k; ++ for (; n->e; b++, n--) { ++ i = b && a; ++ if (i) ++ j = c; ++ } ++} ++ ++char *o(struct d *n) { ++ for (; n->e;) ++ return h; ++} ++ ++int q() { ++ if (l) ++ return 1; ++ int p = *o(&g); ++ m(&f); ++ m(&g); ++ l = p; ++} +-- +2.31.1 + diff --git a/INTEL-0036-Align-ix86_-move_max-store_max-with-vectorizer.patch b/INTEL-0036-Align-ix86_-move_max-store_max-with-vectorizer.patch new file mode 100644 index 0000000..36876a3 --- /dev/null +++ b/INTEL-0036-Align-ix86_-move_max-store_max-with-vectorizer.patch @@ -0,0 +1,232 @@ +From 002e45c7f46a0f8dd2b5381cd1ee1341f8987fca Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Thu, 15 Aug 2024 12:54:07 +0800 +Subject: [PATCH 04/14] Align ix86_{move_max,store_max} with vectorizer. + +When none of mprefer-vector-width, avx256_optimal/avx128_optimal, +avx256_store_by_pieces/avx512_store_by_pieces is specified, GCC will +set ix86_{move_max,store_max} as max available vector length except +for AVX part. + + if (TARGET_AVX512F_P (opts->x_ix86_isa_flags) + && TARGET_EVEX512_P (opts->x_ix86_isa_flags2)) + opts->x_ix86_move_max = PVW_AVX512; + else + opts->x_ix86_move_max = PVW_AVX128; + +So for -mavx2, vectorizer will choose 256-bit for vectorization, but +128-bit is used for struct copy, there could be a potential STLF issue +due to this "misalign". + +The patch fixes that. + +gcc/ChangeLog: + + * config/i386/i386-options.cc (ix86_option_override_internal): + set ix86_{move_max,store_max} to PVW_AVX256 when TARGET_AVX + instead of PVW_AVX128. + +gcc/testsuite/ChangeLog: + * gcc.target/i386/pieces-memcpy-10.c: Add -mprefer-vector-width=128. + * gcc.target/i386/pieces-memcpy-6.c: Ditto. + * gcc.target/i386/pieces-memset-38.c: Ditto. + * gcc.target/i386/pieces-memset-40.c: Ditto. + * gcc.target/i386/pieces-memset-41.c: Ditto. + * gcc.target/i386/pieces-memset-42.c: Ditto. + * gcc.target/i386/pieces-memset-43.c: Ditto. + * gcc.target/i386/pieces-strcpy-2.c: Ditto. + * gcc.target/i386/pieces-memcpy-22.c: New test. + * gcc.target/i386/pieces-memset-51.c: New test. + * gcc.target/i386/pieces-strcpy-3.c: New test. + +(cherry picked from commit aea374238cec1a1e53fb79575d2f998e16926999) +--- + gcc/config/i386/i386-options.cc | 6 ++++++ + gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c | 2 +- + gcc/testsuite/gcc.target/i386/pieces-memcpy-22.c | 12 ++++++++++++ + gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c | 2 +- + gcc/testsuite/gcc.target/i386/pieces-memset-38.c | 2 +- + gcc/testsuite/gcc.target/i386/pieces-memset-40.c | 2 +- + gcc/testsuite/gcc.target/i386/pieces-memset-41.c | 2 +- + gcc/testsuite/gcc.target/i386/pieces-memset-42.c | 2 +- + gcc/testsuite/gcc.target/i386/pieces-memset-43.c | 2 +- + gcc/testsuite/gcc.target/i386/pieces-memset-51.c | 12 ++++++++++++ + gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c | 2 +- + gcc/testsuite/gcc.target/i386/pieces-strcpy-3.c | 15 +++++++++++++++ + 12 files changed, 53 insertions(+), 8 deletions(-) + create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memcpy-22.c + create mode 100644 gcc/testsuite/gcc.target/i386/pieces-memset-51.c + create mode 100644 gcc/testsuite/gcc.target/i386/pieces-strcpy-3.c + +diff --git a/gcc/config/i386/i386-options.cc b/gcc/config/i386/i386-options.cc +index 991661fe4a2..061a1584318 100644 +--- a/gcc/config/i386/i386-options.cc ++++ b/gcc/config/i386/i386-options.cc +@@ -2802,6 +2802,9 @@ ix86_option_override_internal (bool main_args_p, + { + if (TARGET_AVX512F_P (opts->x_ix86_isa_flags)) + opts->x_ix86_move_max = PVW_AVX512; ++ /* Align with vectorizer to avoid potential STLF issue. */ ++ else if (TARGET_AVX_P (opts->x_ix86_isa_flags)) ++ opts->x_ix86_move_max = PVW_AVX256; + else + opts->x_ix86_move_max = PVW_AVX128; + } +@@ -2823,6 +2826,9 @@ ix86_option_override_internal (bool main_args_p, + { + if (TARGET_AVX512F_P (opts->x_ix86_isa_flags)) + opts->x_ix86_store_max = PVW_AVX512; ++ /* Align with vectorizer to avoid potential STLF issue. */ ++ else if (TARGET_AVX_P (opts->x_ix86_isa_flags)) ++ opts->x_ix86_store_max = PVW_AVX256; + else + opts->x_ix86_store_max = PVW_AVX128; + } +diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c +index 5faee21f9b9..53ad0b3be44 100644 +--- a/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c ++++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -mno-avx2 -mavx -mtune=sandybridge" } */ ++/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 -mtune=sandybridge" } */ + + extern char *dst, *src; + +diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-22.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-22.c +new file mode 100644 +index 00000000000..605b3623ffc +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-22.c +@@ -0,0 +1,12 @@ ++/* { dg-do compile { target { ! ia32 } } } */ ++/* { dg-options "-O2 -mno-avx2 -mavx -mtune=generic" } */ ++ ++extern char *dst, *src; ++ ++void ++foo (void) ++{ ++ __builtin_memcpy (dst, src, 33); ++} ++ ++/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 2 } } */ +diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c +index 5f99cc98c47..cfd2a86cf33 100644 +--- a/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c ++++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c +@@ -1,5 +1,5 @@ + /* { dg-do compile { target { ! ia32 } } } */ +-/* { dg-options "-O2 -mno-avx2 -mavx -mtune=sandybridge" } */ ++/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 -mtune=sandybridge" } */ + + extern char *dst, *src; + +diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-38.c b/gcc/testsuite/gcc.target/i386/pieces-memset-38.c +index ed4a24a54fd..ddd194debd5 100644 +--- a/gcc/testsuite/gcc.target/i386/pieces-memset-38.c ++++ b/gcc/testsuite/gcc.target/i386/pieces-memset-38.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -mno-avx512f -mavx2 -mtune=sandybridge" } */ ++/* { dg-options "-O2 -mno-avx512f -mavx2 -mprefer-vector-width=128 -mtune=sandybridge" } */ + + extern char *dst; + +diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-40.c b/gcc/testsuite/gcc.target/i386/pieces-memset-40.c +index 4eda73ead59..9c206465d46 100644 +--- a/gcc/testsuite/gcc.target/i386/pieces-memset-40.c ++++ b/gcc/testsuite/gcc.target/i386/pieces-memset-40.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -mno-avx512f -mavx2 -mtune=sandybridge" } */ ++/* { dg-options "-O2 -mno-avx512f -mavx2 -mprefer-vector-width=128 -mtune=sandybridge" } */ + + extern char *dst; + +diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-41.c b/gcc/testsuite/gcc.target/i386/pieces-memset-41.c +index 93df8101e4d..b0756182e35 100644 +--- a/gcc/testsuite/gcc.target/i386/pieces-memset-41.c ++++ b/gcc/testsuite/gcc.target/i386/pieces-memset-41.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -mno-avx2 -mavx -mtune=sandybridge -mno-stackrealign" } */ ++/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 -mtune=sandybridge -mno-stackrealign" } */ + + extern char *dst; + +diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-42.c b/gcc/testsuite/gcc.target/i386/pieces-memset-42.c +index df0c122aae7..103da699ae5 100644 +--- a/gcc/testsuite/gcc.target/i386/pieces-memset-42.c ++++ b/gcc/testsuite/gcc.target/i386/pieces-memset-42.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -mno-avx2 -mavx -mtune=sandybridge" } */ ++/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 -mtune=sandybridge" } */ + + extern char *dst; + +diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-43.c b/gcc/testsuite/gcc.target/i386/pieces-memset-43.c +index 2f2179c2df9..f1494e17610 100644 +--- a/gcc/testsuite/gcc.target/i386/pieces-memset-43.c ++++ b/gcc/testsuite/gcc.target/i386/pieces-memset-43.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -mno-avx2 -mavx -mtune=sandybridge" } */ ++/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 -mtune=sandybridge" } */ + + extern char *dst; + +diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-51.c b/gcc/testsuite/gcc.target/i386/pieces-memset-51.c +new file mode 100644 +index 00000000000..192ec0d1647 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pieces-memset-51.c +@@ -0,0 +1,12 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -mno-avx2 -mavx -mtune=generic" } */ ++ ++extern char *dst; ++ ++void ++foo (int x) ++{ ++ __builtin_memset (dst, x, 64); ++} ++ ++/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 2 } } */ +diff --git a/gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c b/gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c +index 90446edb4f3..9bb94b7419b 100644 +--- a/gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c ++++ b/gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c +@@ -1,5 +1,5 @@ + /* { dg-do compile { target { ! ia32 } } } */ +-/* { dg-options "-O2 -mno-avx2 -mavx -mtune=sandybridge" } */ ++/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 -mtune=sandybridge" } */ + + extern char *strcpy (char *, const char *); + +diff --git a/gcc/testsuite/gcc.target/i386/pieces-strcpy-3.c b/gcc/testsuite/gcc.target/i386/pieces-strcpy-3.c +new file mode 100644 +index 00000000000..df7571b547f +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pieces-strcpy-3.c +@@ -0,0 +1,15 @@ ++/* { dg-do compile { target { ! ia32 } } } */ ++/* { dg-options "-O2 -mno-avx2 -mavx -mtune=generic" } */ ++ ++extern char *strcpy (char *, const char *); ++ ++void ++foo (char *s) ++{ ++ strcpy (s, ++ "1234567890abcdef123456abcdef5678123456abcdef567abcdef678" ++ "1234567"); ++} ++ ++/* { dg-final { scan-assembler-times "vmovdqa\[ \\t\]+\[^\n\]*%ymm" 2 } } */ ++/* { dg-final { scan-assembler-times "vmovdqu\[ \\t\]+\[^\n\]*%ymm" 2 } } */ +-- +2.31.1 + diff --git a/INTEL-0037-Fix-testcase-failure.patch b/INTEL-0037-Fix-testcase-failure.patch new file mode 100644 index 0000000..a1d3ed6 --- /dev/null +++ b/INTEL-0037-Fix-testcase-failure.patch @@ -0,0 +1,120 @@ +From c5c6183ab3132d40fb0f10c57c26c6ef4f69bfda Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Thu, 22 Aug 2024 14:31:40 +0800 +Subject: [PATCH 05/14] Fix testcase failure. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/pieces-memcpy-10.c: Use -mmove-max=256 and + -mstore-max=256. + * gcc.target/i386/pieces-memcpy-6.c: Ditto. + * gcc.target/i386/pieces-memset-38.c: Ditto. + * gcc.target/i386/pieces-memset-40.c: Ditto. + * gcc.target/i386/pieces-memset-41.c: Ditto. + * gcc.target/i386/pieces-memset-42.c: Ditto. + * gcc.target/i386/pieces-memset-43.c: Ditto. + * gcc.target/i386/pieces-strcpy-2.c: Ditto. + +(cherry picked from commit ea9c508927ec032c6d67a24df59ffa429e4d3d95) +--- + gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c | 2 +- + gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c | 2 +- + gcc/testsuite/gcc.target/i386/pieces-memset-38.c | 2 +- + gcc/testsuite/gcc.target/i386/pieces-memset-40.c | 2 +- + gcc/testsuite/gcc.target/i386/pieces-memset-41.c | 2 +- + gcc/testsuite/gcc.target/i386/pieces-memset-42.c | 2 +- + gcc/testsuite/gcc.target/i386/pieces-memset-43.c | 2 +- + gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c | 2 +- + 8 files changed, 8 insertions(+), 8 deletions(-) + +diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c +index 53ad0b3be44..78f92ac5197 100644 +--- a/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c ++++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-10.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 -mtune=sandybridge" } */ ++/* { dg-options "-O2 -mno-avx2 -mavx -mmove-max=128 -mstore-max=128 -mtune=sandybridge" } */ + + extern char *dst, *src; + +diff --git a/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c b/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c +index cfd2a86cf33..57b74ae4b23 100644 +--- a/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c ++++ b/gcc/testsuite/gcc.target/i386/pieces-memcpy-6.c +@@ -1,5 +1,5 @@ + /* { dg-do compile { target { ! ia32 } } } */ +-/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 -mtune=sandybridge" } */ ++/* { dg-options "-O2 -mno-avx2 -mavx -mmove-max=128 -mstore-max=128 -mtune=sandybridge" } */ + + extern char *dst, *src; + +diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-38.c b/gcc/testsuite/gcc.target/i386/pieces-memset-38.c +index ddd194debd5..d9443678735 100644 +--- a/gcc/testsuite/gcc.target/i386/pieces-memset-38.c ++++ b/gcc/testsuite/gcc.target/i386/pieces-memset-38.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -mno-avx512f -mavx2 -mprefer-vector-width=128 -mtune=sandybridge" } */ ++/* { dg-options "-O2 -mno-avx512f -mavx2 -mmove-max=128 -mstore-max=128 -mtune=sandybridge" } */ + + extern char *dst; + +diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-40.c b/gcc/testsuite/gcc.target/i386/pieces-memset-40.c +index 9c206465d46..8ad6ad7e494 100644 +--- a/gcc/testsuite/gcc.target/i386/pieces-memset-40.c ++++ b/gcc/testsuite/gcc.target/i386/pieces-memset-40.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -mno-avx512f -mavx2 -mprefer-vector-width=128 -mtune=sandybridge" } */ ++/* { dg-options "-O2 -mno-avx512f -mavx2 -mmove-max=128 -mstore-max=128 -mtune=sandybridge" } */ + + extern char *dst; + +diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-41.c b/gcc/testsuite/gcc.target/i386/pieces-memset-41.c +index b0756182e35..08fd6e9a927 100644 +--- a/gcc/testsuite/gcc.target/i386/pieces-memset-41.c ++++ b/gcc/testsuite/gcc.target/i386/pieces-memset-41.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 -mtune=sandybridge -mno-stackrealign" } */ ++/* { dg-options "-O2 -mno-avx2 -mavx -mmove-max=128 -mstore-max=128 -mtune=sandybridge -mno-stackrealign" } */ + + extern char *dst; + +diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-42.c b/gcc/testsuite/gcc.target/i386/pieces-memset-42.c +index 103da699ae5..6b73bb256af 100644 +--- a/gcc/testsuite/gcc.target/i386/pieces-memset-42.c ++++ b/gcc/testsuite/gcc.target/i386/pieces-memset-42.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 -mtune=sandybridge" } */ ++/* { dg-options "-O2 -mno-avx2 -mavx -mmove-max=128 -mstore-max=128 -mtune=sandybridge" } */ + + extern char *dst; + +diff --git a/gcc/testsuite/gcc.target/i386/pieces-memset-43.c b/gcc/testsuite/gcc.target/i386/pieces-memset-43.c +index f1494e17610..c6c7ff234da 100644 +--- a/gcc/testsuite/gcc.target/i386/pieces-memset-43.c ++++ b/gcc/testsuite/gcc.target/i386/pieces-memset-43.c +@@ -1,5 +1,5 @@ + /* { dg-do compile } */ +-/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 -mtune=sandybridge" } */ ++/* { dg-options "-O2 -mno-avx2 -mavx -mmove-max=128 -mstore-max=128 -mtune=sandybridge" } */ + + extern char *dst; + +diff --git a/gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c b/gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c +index 9bb94b7419b..40ada119625 100644 +--- a/gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c ++++ b/gcc/testsuite/gcc.target/i386/pieces-strcpy-2.c +@@ -1,5 +1,5 @@ + /* { dg-do compile { target { ! ia32 } } } */ +-/* { dg-options "-O2 -mno-avx2 -mavx -mprefer-vector-width=128 -mtune=sandybridge" } */ ++/* { dg-options "-O2 -mno-avx2 -mavx -mmove-max=128 -mstore-max=128 -mtune=sandybridge" } */ + + extern char *strcpy (char *, const char *); + +-- +2.31.1 + diff --git a/INTEL-0038-Check-avx-upper-register-for-parallel.patch b/INTEL-0038-Check-avx-upper-register-for-parallel.patch new file mode 100644 index 0000000..6112e87 --- /dev/null +++ b/INTEL-0038-Check-avx-upper-register-for-parallel.patch @@ -0,0 +1,148 @@ +From 8d7562fe6bd0284dc15cae8f1cd1b59ee940064a Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Thu, 29 Aug 2024 11:39:20 +0800 +Subject: [PATCH 06/14] Check avx upper register for parallel. + +For function arguments/return, when it's BLK mode, it's put in a +parallel with an expr_list, and the expr_list contains the real mode +and registers. +Current ix86_check_avx_upper_register only checked for SSE_REG_P, and +failed to handle that. The patch extend the handle to each subrtx. + +gcc/ChangeLog: + + PR target/116512 + * config/i386/i386.cc (ix86_check_avx_upper_register): Iterate + subrtx to scan for avx upper register. + (ix86_check_avx_upper_stores): Inline old + ix86_check_avx_upper_register. + (ix86_avx_u128_mode_needed): Ditto, and replace + FOR_EACH_SUBRTX with call to new + ix86_check_avx_upper_register. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/pr116512.c: New test. + +(cherry picked from commit ab214ef734bfc3dcffcf79ff9e1dd651c2b40566) +--- + gcc/config/i386/i386.cc | 36 +++++++++++++++--------- + gcc/testsuite/gcc.target/i386/pr116512.c | 26 +++++++++++++++++ + 2 files changed, 49 insertions(+), 13 deletions(-) + create mode 100644 gcc/testsuite/gcc.target/i386/pr116512.c + +diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc +index ade965927ac..e2743e0bd5c 100644 +--- a/gcc/config/i386/i386.cc ++++ b/gcc/config/i386/i386.cc +@@ -14359,9 +14359,19 @@ ix86_dirflag_mode_needed (rtx_insn *insn) + static bool + ix86_check_avx_upper_register (const_rtx exp) + { +- return (SSE_REG_P (exp) +- && !EXT_REX_SSE_REG_P (exp) +- && GET_MODE_BITSIZE (GET_MODE (exp)) > 128); ++ /* construct_container may return a parallel with expr_list ++ which contains the real reg and mode */ ++ subrtx_iterator::array_type array; ++ FOR_EACH_SUBRTX (iter, array, exp, NONCONST) ++ { ++ const_rtx x = *iter; ++ if (SSE_REG_P (x) ++ && !EXT_REX_SSE_REG_P (x) ++ && GET_MODE_BITSIZE (GET_MODE (x)) > 128) ++ return true; ++ } ++ ++ return false; + } + + /* Check if a 256bit or 512bit AVX register is referenced in stores. */ +@@ -14369,7 +14379,9 @@ ix86_check_avx_upper_register (const_rtx exp) + static void + ix86_check_avx_upper_stores (rtx dest, const_rtx, void *data) + { +- if (ix86_check_avx_upper_register (dest)) ++ if (SSE_REG_P (dest) ++ && !EXT_REX_SSE_REG_P (dest) ++ && GET_MODE_BITSIZE (GET_MODE (dest)) > 128) + { + bool *used = (bool *) data; + *used = true; +@@ -14427,14 +14439,14 @@ ix86_avx_u128_mode_needed (rtx_insn *insn) + return AVX_U128_CLEAN; + } + +- subrtx_iterator::array_type array; +- + rtx set = single_set (insn); + if (set) + { + rtx dest = SET_DEST (set); + rtx src = SET_SRC (set); +- if (ix86_check_avx_upper_register (dest)) ++ if (SSE_REG_P (dest) ++ && !EXT_REX_SSE_REG_P (dest) ++ && GET_MODE_BITSIZE (GET_MODE (dest)) > 128) + { + /* This is an YMM/ZMM load. Return AVX_U128_DIRTY if the + source isn't zero. */ +@@ -14445,9 +14457,8 @@ ix86_avx_u128_mode_needed (rtx_insn *insn) + } + else + { +- FOR_EACH_SUBRTX (iter, array, src, NONCONST) +- if (ix86_check_avx_upper_register (*iter)) +- return AVX_U128_DIRTY; ++ if (ix86_check_avx_upper_register (src)) ++ return AVX_U128_DIRTY; + } + + /* This isn't YMM/ZMM load/store. */ +@@ -14458,9 +14469,8 @@ ix86_avx_u128_mode_needed (rtx_insn *insn) + Hardware changes state only when a 256bit register is written to, + but we need to prevent the compiler from moving optimal insertion + point above eventual read from 256bit or 512 bit register. */ +- FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST) +- if (ix86_check_avx_upper_register (*iter)) +- return AVX_U128_DIRTY; ++ if (ix86_check_avx_upper_register (PATTERN (insn))) ++ return AVX_U128_DIRTY; + + return AVX_U128_ANY; + } +diff --git a/gcc/testsuite/gcc.target/i386/pr116512.c b/gcc/testsuite/gcc.target/i386/pr116512.c +new file mode 100644 +index 00000000000..c2bc6c91b64 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr116512.c +@@ -0,0 +1,26 @@ ++/* { dg-do compile } */ ++/* { dg-options "-march=x86-64-v4 -O2" } */ ++/* { dg-final { scan-assembler-not "vzeroupper" { target { ! ia32 } } } } */ ++ ++#include ++ ++struct B { ++ union { ++ __m512 f; ++ __m512i s; ++ }; ++}; ++ ++struct B foo(int n) { ++ struct B res; ++ res.s = _mm512_set1_epi32(n); ++ ++ return res; ++} ++ ++__m512i bar(int n) { ++ struct B res; ++ res.s = _mm512_set1_epi32(n); ++ ++ return res.s; ++} +-- +2.31.1 + diff --git a/INTEL-0039-i386-Fix-vfpclassph-non-optimizied-intrin.patch b/INTEL-0039-i386-Fix-vfpclassph-non-optimizied-intrin.patch new file mode 100644 index 0000000..77a60bb --- /dev/null +++ b/INTEL-0039-i386-Fix-vfpclassph-non-optimizied-intrin.patch @@ -0,0 +1,134 @@ +From 9cb8d824a580c1ea79718300deed14b8ec5cc1e2 Mon Sep 17 00:00:00 2001 +From: Haochen Jiang +Date: Mon, 2 Sep 2024 15:00:22 +0800 +Subject: [PATCH 07/14] i386: Fix vfpclassph non-optimizied intrin + +The intrin for non-optimized got a typo in mask type, which will cause +the high bits of __mmask32 being unexpectedly zeroed. + +The test does not fail under O0 with current 1b since the testcase is +wrong. We need to include avx512-mask-type.h after SIZE is defined, or +it will always be __mmask8. That problem also happened in AVX10.2 testcases. +I will write a seperate patch to fix that. + +gcc/ChangeLog: + + * config/i386/avx512fp16intrin.h + (_mm512_mask_fpclass_ph_mask): Correct mask type to __mmask32. + (_mm512_fpclass_ph_mask): Ditto. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/avx512fp16-vfpclassph-1c.c: New test. + +(cherry picked from commit 6e59b188c4a051d4f2de5220d30681e6963d96c0) (gcc-12) +--- + gcc/config/i386/avx512fp16intrin.h | 4 +- + .../i386/avx512fp16-vfpclassph-1c.c | 77 +++++++++++++++++++ + 2 files changed, 79 insertions(+), 2 deletions(-) + create mode 100644 gcc/testsuite/gcc.target/i386/avx512fp16-vfpclassph-1c.c + +diff --git a/gcc/config/i386/avx512fp16intrin.h b/gcc/config/i386/avx512fp16intrin.h +index b16ccfcb7f1..6330e57ebb8 100644 +--- a/gcc/config/i386/avx512fp16intrin.h ++++ b/gcc/config/i386/avx512fp16intrin.h +@@ -2321,11 +2321,11 @@ _mm512_fpclass_ph_mask (__m512h __A, const int __imm) + #else + #define _mm512_mask_fpclass_ph_mask(u, x, c) \ + ((__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) (__m512h) (x), \ +- (int) (c),(__mmask8)(u))) ++ (int) (c),(__mmask32)(u))) + + #define _mm512_fpclass_ph_mask(x, c) \ + ((__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) (__m512h) (x), \ +- (int) (c),(__mmask8)-1)) ++ (int) (c),(__mmask32)-1)) + #endif /* __OPIMTIZE__ */ + + /* Intrinsics vgetexpph, vgetexpsh. */ +diff --git a/gcc/testsuite/gcc.target/i386/avx512fp16-vfpclassph-1c.c b/gcc/testsuite/gcc.target/i386/avx512fp16-vfpclassph-1c.c +new file mode 100644 +index 00000000000..4739f1228e3 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/avx512fp16-vfpclassph-1c.c +@@ -0,0 +1,77 @@ ++/* { dg-do run } */ ++/* { dg-options "-O0 -mavx512fp16" } */ ++/* { dg-require-effective-target avx512fp16 } */ ++ ++#define AVX512FP16 ++#include "avx512f-helper.h" ++ ++#include ++#include ++#include ++#define SIZE (AVX512F_LEN / 16) ++#include "avx512f-mask-type.h" ++ ++#ifndef __FPCLASSPH__ ++#define __FPCLASSPH__ ++int check_fp_class_hp (_Float16 src, int imm) ++{ ++ int qNaN_res = isnan (src); ++ int sNaN_res = isnan (src); ++ int Pzero_res = (src == 0.0); ++ int Nzero_res = (src == -0.0); ++ int PInf_res = (isinf (src) == 1); ++ int NInf_res = (isinf (src) == -1); ++ int Denorm_res = (fpclassify (src) == FP_SUBNORMAL); ++ int FinNeg_res = __builtin_finite (src) && (src < 0); ++ ++ int result = (((imm & 1) && qNaN_res) ++ || (((imm >> 1) & 1) && Pzero_res) ++ || (((imm >> 2) & 1) && Nzero_res) ++ || (((imm >> 3) & 1) && PInf_res) ++ || (((imm >> 4) & 1) && NInf_res) ++ || (((imm >> 5) & 1) && Denorm_res) ++ || (((imm >> 6) & 1) && FinNeg_res) ++ || (((imm >> 7) & 1) && sNaN_res)); ++ return result; ++} ++#endif ++ ++MASK_TYPE ++CALC (_Float16 *s1, int imm) ++{ ++ int i; ++ MASK_TYPE res = 0; ++ ++ for (i = 0; i < SIZE; i++) ++ if (check_fp_class_hp(s1[i], imm)) ++ res = res | (1 << i); ++ ++ return res; ++} ++ ++void ++TEST (void) ++{ ++ int i; ++ UNION_TYPE (AVX512F_LEN, h) src; ++ MASK_TYPE res1, res2, res_ref = 0; ++ MASK_TYPE mask = MASK_VALUE; ++ ++ src.a[SIZE - 1] = NAN; ++ src.a[SIZE - 2] = 1.0 / 0.0; ++ for (i = 0; i < SIZE - 2; i++) ++ { ++ src.a[i] = -24.43 + 0.6 * i; ++ } ++ ++ res1 = INTRINSIC (_fpclass_ph_mask) (src.x, 0xFF); ++ res2 = INTRINSIC (_mask_fpclass_ph_mask) (mask, src.x, 0xFF); ++ ++ res_ref = CALC (src.a, 0xFF); ++ ++ if (res_ref != res1) ++ abort (); ++ ++ if ((mask & res_ref) != res2) ++ abort (); ++} +-- +2.31.1 + diff --git a/INTEL-0040-doc-Add-more-alias-option-and-reorder-Intel-CPU-marc.patch b/INTEL-0040-doc-Add-more-alias-option-and-reorder-Intel-CPU-marc.patch new file mode 100644 index 0000000..b691700 --- /dev/null +++ b/INTEL-0040-doc-Add-more-alias-option-and-reorder-Intel-CPU-marc.patch @@ -0,0 +1,278 @@ +From 3bdecde26f8edffbd0e981c280e3fb8519709ea4 Mon Sep 17 00:00:00 2001 +From: Haochen Jiang +Date: Wed, 18 Sep 2024 11:20:15 +0800 +Subject: [PATCH 08/14] doc: Add more alias option and reorder Intel CPU -march + documentation + +This patch is backported from GCC15 with some tweaks. + +Since r15-3539, there are requests coming in to add other alias option +documentation. This patch will add all of them, including corei7, corei7-avx, +core-avx-i, core-avx2, atom and slm. + +Also in the patch, I reordered that part of documentation, currently all +the CPUs/products are just all over the place. I regrouped them by +date-to-now products (since the very first CPU to latest Panther Lake), P-core +(since the clients become hybrid cores, starting from Sapphire Rapids) and +E-core (since Bonnell). In GCC14 and eariler GCC, Xeon Phi CPUs are still +there, I put them after E-core CPUs. + +And in the patch, I refined the product names in documentation. + +gcc/ChangeLog: + + * doc/invoke.texi: Add corei7, corei7-avx, core-avx-i, + core-avx2, atom, and slm. Reorder the -march documentation by + splitting them into date-to-now products, P-core, E-core and + Xeon Phi. Refine the product names in documentation. + +(cherry picked from commit 8483527158024d200b3a9e4edecbe188fa22fdaa) +--- + gcc/doc/invoke.texi | 162 +++++++++++++++++++++++--------------------- + 1 file changed, 84 insertions(+), 78 deletions(-) + +diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi +index 109858f7666..90073ac9832 100644 +--- a/gcc/doc/invoke.texi ++++ b/gcc/doc/invoke.texi +@@ -31441,6 +31441,7 @@ Intel Core 2 CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, SSSE3, CX16, + SAHF and FXSR instruction set support. + + @item nehalem ++@itemx corei7 + Intel Nehalem CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, SSSE3, + SSE4.1, SSE4.2, POPCNT, CX16, SAHF and FXSR instruction set support. + +@@ -31449,17 +31450,20 @@ Intel Westmere CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, SSSE3, + SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR and PCLMUL instruction set support. + + @item sandybridge ++@itemx corei7-avx + Intel Sandy Bridge CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, SSSE3, + SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE and PCLMUL instruction set + support. + + @item ivybridge ++@itemx core-avx-i + Intel Ivy Bridge CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, SSSE3, + SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, RDRND + and F16C instruction set support. + + @item haswell +-Intel Haswell CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, ++@itemx core-avx2 ++Intel Haswell CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, SSSE3, + SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, RDRND, + F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE and HLE instruction set support. + +@@ -31475,47 +31479,6 @@ SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, RDRND, + F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, AES, + CLFLUSHOPT, XSAVEC, XSAVES and SGX instruction set support. + +-@item bonnell +-Intel Bonnell CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3 and SSSE3 +-instruction set support. +- +-@item silvermont +-Intel Silvermont CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, +-SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, PCLMUL, PREFETCHW and RDRND +-instruction set support. +- +-@item goldmont +-Intel Goldmont CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, +-SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, PCLMUL, PREFETCHW, RDRND, AES, SHA, +-RDSEED, XSAVE, XSAVEC, XSAVES, XSAVEOPT, CLFLUSHOPT and FSGSBASE instruction +-set support. +- +-@item goldmont-plus +-Intel Goldmont Plus CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, +-SSSE3, SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, PCLMUL, PREFETCHW, RDRND, AES, +-SHA, RDSEED, XSAVE, XSAVEC, XSAVES, XSAVEOPT, CLFLUSHOPT, FSGSBASE, PTWRITE, +-RDPID and SGX instruction set support. +- +-@item tremont +-Intel Tremont CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, +-SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, PCLMUL, PREFETCHW, RDRND, AES, SHA, +-RDSEED, XSAVE, XSAVEC, XSAVES, XSAVEOPT, CLFLUSHOPT, FSGSBASE, PTWRITE, RDPID, +-SGX, CLWB, GFNI-SSE, MOVDIRI, MOVDIR64B, CLDEMOTE and WAITPKG instruction set +-support. +- +-@item knl +-Intel Knight's Landing CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, +-SSSE3, SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, +-RDRND, F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, +-AVX512PF, AVX512ER, AVX512F, AVX512CD and PREFETCHWT1 instruction set support. +- +-@item knm +-Intel Knights Mill CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, +-SSSE3, SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, +-RDRND, F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, +-AVX512PF, AVX512ER, AVX512F, AVX512CD and PREFETCHWT1, AVX5124VNNIW, +-AVX5124FMAPS and AVX512VPOPCNTDQ instruction set support. +- + @item skylake-avx512 + Intel Skylake Server CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, + SSSE3, SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, +@@ -31523,16 +31486,30 @@ RDRND, F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, + AES, CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, CLWB, AVX512VL, AVX512BW, + AVX512DQ and AVX512CD instruction set support. + ++@item cascadelake ++Intel Cascade Lake CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, SSSE3, ++SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, RDRND, ++F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, AES, ++CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, CLWB, AVX512VL, AVX512BW, AVX512DQ, ++AVX512CD and AVX512VNNI instruction set support. ++ + @item cannonlake +-Intel Cannonlake Server CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, ++Intel Cannon Lake Server CPU with 64-bit extensions, MMX, SSE, SSE2, + SSE3, SSSE3, SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, + FSGSBASE, RDRND, F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, + PREFETCHW, AES, CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, AVX512VL, AVX512BW, + AVX512DQ, AVX512CD, PKU, AVX512VBMI, AVX512IFMA and SHA instruction set + support. + ++@item cooperlake ++Intel Cooper Lake CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, SSSE3, ++SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, RDRND, ++F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, AES, ++CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, CLWB, AVX512VL, AVX512BW, AVX512DQ, ++AVX512CD, AVX512VNNI and AVX512BF16 instruction set support. ++ + @item icelake-client +-Intel Icelake Client CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, ++Intel Ice Lake Client CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, + SSSE3, SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, + RDRND, F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, + AES, CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, AVX512VL, AVX512BW, AVX512DQ, +@@ -31540,7 +31517,7 @@ AVX512CD, PKU, AVX512VBMI, AVX512IFMA, SHA, AVX512VNNI, GFNI, VAES, AVX512VBMI2 + , VPCLMULQDQ, AVX512BITALG, RDPID and AVX512VPOPCNTDQ instruction set support. + + @item icelake-server +-Intel Icelake Server CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, ++Intel Ice Lake Server CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, + SSSE3, SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, + RDRND, F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, + AES, CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, AVX512VL, AVX512BW, AVX512DQ, +@@ -31548,55 +31525,84 @@ AVX512CD, PKU, AVX512VBMI, AVX512IFMA, SHA, AVX512VNNI, GFNI, VAES, AVX512VBMI2 + , VPCLMULQDQ, AVX512BITALG, RDPID, AVX512VPOPCNTDQ, PCONFIG, WBNOINVD and CLWB + instruction set support. + +-@item cascadelake +-Intel Cascadelake CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, +-SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, RDRND, +-F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, AES, +-CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, CLWB, AVX512VL, AVX512BW, AVX512DQ, +-AVX512CD and AVX512VNNI instruction set support. +- +-@item cooperlake +-Intel cooperlake CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, ++@item tigerlake ++Intel Tiger Lake CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, SSSE3, + SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, RDRND, + F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, AES, +-CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, CLWB, AVX512VL, AVX512BW, AVX512DQ, +-AVX512CD, AVX512VNNI and AVX512BF16 instruction set support. ++CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, AVX512VL, AVX512BW, AVX512DQ, ++AVX512CD, PKU, AVX512VBMI, AVX512IFMA, SHA, AVX512VNNI, GFNI, VAES, AVX512VBMI2, ++VPCLMULQDQ, AVX512BITALG, RDPID, AVX512VPOPCNTDQ, MOVDIRI, MOVDIR64B, CLWB, ++AVX512VP2INTERSECT and KEYLOCKER instruction set support. + +-@item tigerlake +-Intel Tigerlake CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, ++@item rocketlake ++Intel Rocket Lake CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, SSSE3, + SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, RDRND, + F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, AES, +-CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, AVX512VL, AVX512BW, AVX512DQ, AVX512CD ++CLFLUSHOPT, XSAVEC, XSAVES, AVX512F, AVX512VL, AVX512BW, AVX512DQ, AVX512CD, + PKU, AVX512VBMI, AVX512IFMA, SHA, AVX512VNNI, GFNI, VAES, AVX512VBMI2, +-VPCLMULQDQ, AVX512BITALG, RDPID, AVX512VPOPCNTDQ, MOVDIRI, MOVDIR64B, CLWB, +-AVX512VP2INTERSECT and KEYLOCKER instruction set support. ++VPCLMULQDQ, AVX512BITALG, RDPID and AVX512VPOPCNTDQ instruction set support. ++ ++@item alderlake ++Intel Alder Lake CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, ++SSE4.1, SSE4.2, POPCNT, AES, PREFETCHW, PCLMUL, RDRND, XSAVE, XSAVEC, XSAVES, ++XSAVEOPT, FSGSBASE, PTWRITE, RDPID, SGX, GFNI-SSE, CLWB, MOVDIRI, MOVDIR64B, ++CLDEMOTE, WAITPKG, ADCX, AVX, AVX2, BMI, BMI2, F16C, FMA, LZCNT, PCONFIG, PKU, ++VAES, VPCLMULQDQ, SERIALIZE, HRESET, KL, WIDEKL and AVX-VNNI instruction set ++support. + + @item sapphirerapids +-Intel sapphirerapids CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, +-SSSE3, SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, +-RDRND, F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, +-AES, CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, AVX512VL, AVX512BW, AVX512DQ, ++Intel Sapphire Rapids CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, SSSE3, ++SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, RDRND, ++F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, AES, ++CLFLUSHOPT, XSAVEC, XSAVES, SGX, AVX512F, AVX512VL, AVX512BW, AVX512DQ, + AVX512CD, PKU, AVX512VBMI, AVX512IFMA, SHA, AVX512VNNI, GFNI, VAES, AVX512VBMI2, + VPCLMULQDQ, AVX512BITALG, RDPID, AVX512VPOPCNTDQ, PCONFIG, WBNOINVD, CLWB, + MOVDIRI, MOVDIR64B, ENQCMD, CLDEMOTE, PTWRITE, WAITPKG, SERIALIZE, TSXLDTRK, + UINTR, AMX-BF16, AMX-TILE, AMX-INT8, AVX-VNNI, AVX512-FP16 and AVX512BF16 + instruction set support. + +-@item alderlake +-Intel Alderlake CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, +-SSE4.1, SSE4.2, POPCNT, AES, PREFETCHW, PCLMUL, RDRND, XSAVE, XSAVEC, XSAVES, +-XSAVEOPT, FSGSBASE, PTWRITE, RDPID, SGX, GFNI-SSE, CLWB, MOVDIRI, MOVDIR64B, +-CLDEMOTE, WAITPKG, ADCX, AVX, AVX2, BMI, BMI2, F16C, FMA, LZCNT, PCONFIG, PKU, +-VAES, VPCLMULQDQ, SERIALIZE, HRESET, KL, WIDEKL and AVX-VNNI instruction set ++@item bonnell ++@itemx atom ++Intel Bonnell CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3 and SSSE3 ++instruction set support. ++ ++@item silvermont ++@itemx slm ++Intel Silvermont CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, ++SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, PCLMUL, PREFETCHW and RDRND ++instruction set support. ++ ++@item goldmont ++Intel Goldmont CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, ++SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, PCLMUL, PREFETCHW, RDRND, AES, SHA, ++RDSEED, XSAVE, XSAVEC, XSAVES, XSAVEOPT, CLFLUSHOPT and FSGSBASE instruction ++set support. ++ ++@item goldmont-plus ++Intel Goldmont Plus CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, ++SSSE3, SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, PCLMUL, PREFETCHW, RDRND, AES, ++SHA, RDSEED, XSAVE, XSAVEC, XSAVES, XSAVEOPT, CLFLUSHOPT, FSGSBASE, PTWRITE, ++RDPID and SGX instruction set support. ++ ++@item tremont ++Intel Tremont CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, ++SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, PCLMUL, PREFETCHW, RDRND, AES, SHA, ++RDSEED, XSAVE, XSAVEC, XSAVES, XSAVEOPT, CLFLUSHOPT, FSGSBASE, PTWRITE, RDPID, ++SGX, CLWB, GFNI-SSE, MOVDIRI, MOVDIR64B, CLDEMOTE and WAITPKG instruction set + support. + +-@item rocketlake +-Intel Rocketlake CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3 +-, SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, RDRND, +-F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, AES, +-CLFLUSHOPT, XSAVEC, XSAVES, AVX512F, AVX512VL, AVX512BW, AVX512DQ, AVX512CD +-PKU, AVX512VBMI, AVX512IFMA, SHA, AVX512VNNI, GFNI, VAES, AVX512VBMI2, +-VPCLMULQDQ, AVX512BITALG, RDPID and AVX512VPOPCNTDQ instruction set support. ++@item knl ++Intel Knights Landing CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, ++SSSE3, SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, ++RDRND, F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, ++AVX512PF, AVX512ER, AVX512F, AVX512CD and PREFETCHWT1 instruction set support. ++ ++@item knm ++Intel Knights Mill CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, ++SSSE3, SSE4.1, SSE4.2, POPCNT, CX16, SAHF, FXSR, AVX, XSAVE, PCLMUL, FSGSBASE, ++RDRND, F16C, AVX2, BMI, BMI2, LZCNT, FMA, MOVBE, HLE, RDSEED, ADCX, PREFETCHW, ++AVX512PF, AVX512ER, AVX512F, AVX512CD and PREFETCHWT1, AVX5124VNNIW, ++AVX5124FMAPS and AVX512VPOPCNTDQ instruction set support. + + @item graniterapids + Intel graniterapids CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, +-- +2.31.1 + diff --git a/INTEL-0041-Refine-splitters-related-to-combine-vpcmpuw-zero_ext.patch b/INTEL-0041-Refine-splitters-related-to-combine-vpcmpuw-zero_ext.patch new file mode 100644 index 0000000..ede3e8f --- /dev/null +++ b/INTEL-0041-Refine-splitters-related-to-combine-vpcmpuw-zero_ext.patch @@ -0,0 +1,416 @@ +From 45b9ba8abc0379b5f83e9209325f9c9a31faec8e Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Wed, 16 Oct 2024 13:43:48 +0800 +Subject: [PATCH 09/14] Refine splitters related to "combine vpcmpuw + + zero_extend to vpcmpuw" + +r12-6103-g1a7ce8570997eb combines vpcmpuw + zero_extend to vpcmpuw +with the pre_reload splitter, but the splitter transforms the +zero_extend into a subreg which make reload think the upper part is +garbage, it's not correct. + +The patch adjusts the zero_extend define_insn_and_split to +define_insn to keep zero_extend. + +gcc/ChangeLog: + + PR target/117159 + * config/i386/sse.md + (*_cmp3_zero_extend): + Change from define_insn_and_split to define_insn. + (*_cmp3_zero_extend): + Ditto. + (*_ucmp3_zero_extend): + Ditto. + (*_ucmp3_zero_extend): + Ditto. + (*_cmp3_zero_extend_2): + Split to the zero_extend pattern. + (*_cmp3_zero_extend_2): + Ditto. + (*_ucmp3_zero_extend_2): + Ditto. + (*_ucmp3_zero_extend_2): + Ditto. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/pr117159.c: New test. + * gcc.target/i386/avx512bw-pr103750-1.c: Remove xfail. + * gcc.target/i386/avx512bw-pr103750-2.c: Remove xfail. + +(cherry picked from commit 5259d3927c1c8e3a15b4b844adef59b48c241233) +--- + gcc/config/i386/sse.md | 196 +++++++----------- + .../gcc.target/i386/avx512bw-pr103750-1.c | 3 +- + .../gcc.target/i386/avx512bw-pr103750-2.c | 3 +- + gcc/testsuite/gcc.target/i386/pr117159.c | 42 ++++ + 4 files changed, 124 insertions(+), 120 deletions(-) + create mode 100644 gcc/testsuite/gcc.target/i386/pr117159.c + +diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md +index 23b858ab21c..7d01c00a848 100644 +--- a/gcc/config/i386/sse.md ++++ b/gcc/config/i386/sse.md +@@ -3724,32 +3724,19 @@ + + ;; Since vpcmpd implicitly clear the upper bits of dest, transform + ;; vpcmpd + zero_extend to vpcmpd since the instruction +-(define_insn_and_split "*_cmp3_zero_extend" +- [(set (match_operand:SWI248x 0 "register_operand") ++(define_insn "*_cmp3_zero_extend" ++ [(set (match_operand:SWI248x 0 "register_operand" "=k") + (zero_extend:SWI248x + (unspec: +- [(match_operand:V48H_AVX512VL 1 "nonimmediate_operand") +- (match_operand:V48H_AVX512VL 2 "nonimmediate_operand") ++ [(match_operand:V48H_AVX512VL 1 "nonimmediate_operand" "v") ++ (match_operand:V48H_AVX512VL 2 "nonimmediate_operand" "vm") + (match_operand:SI 3 "const_0_to_7_operand" "n")] + UNSPEC_PCMP)))] + "TARGET_AVX512F + && (!VALID_MASK_AVX512BW_MODE (mode) || TARGET_AVX512BW) +- && ix86_pre_reload_split () + && (GET_MODE_NUNITS (mode) + < GET_MODE_PRECISION (mode))" +- "#" +- "&& 1" +- [(set (match_dup 0) +- (unspec: +- [(match_dup 1) +- (match_dup 2) +- (match_dup 3)] +- UNSPEC_PCMP))] +-{ +- operands[1] = force_reg (mode, operands[1]); +- operands[0] = lowpart_subreg (mode, +- operands[0], mode); +-} ++ "vcmp\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") +@@ -3777,21 +3764,22 @@ + "#" + "&& 1" + [(set (match_dup 0) +- (unspec: +- [(match_dup 1) +- (match_dup 2) +- (match_dup 3)] +- UNSPEC_PCMP)) +- (set (match_dup 4) (match_dup 0))] ++ (zero_extend:SWI248x ++ (unspec: ++ [(match_dup 1) ++ (match_dup 2) ++ (match_dup 3)] ++ UNSPEC_PCMP))) ++ (set (match_dup 4) (match_dup 5))] + { +- operands[1] = force_reg (mode, operands[1]); +- operands[0] = lowpart_subreg (mode, ++ operands[5] = lowpart_subreg (mode, + operands[0], mode); +-} +- [(set_attr "type" "ssecmp") +- (set_attr "length_immediate" "1") +- (set_attr "prefix" "evex") +- (set_attr "mode" "")]) ++ if (SUBREG_P (operands[5])) ++ { ++ SUBREG_PROMOTED_VAR_P (operands[5]) = 1; ++ SUBREG_PROMOTED_SET (operands[5], 1); ++ } ++}) + + (define_insn_and_split "*_cmp3" + [(set (match_operand: 0 "register_operand") +@@ -3826,31 +3814,18 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "")]) + +-(define_insn_and_split "*_cmp3_zero_extend" +- [(set (match_operand:SWI248x 0 "register_operand") ++(define_insn "*_cmp3_zero_extend" ++ [(set (match_operand:SWI248x 0 "register_operand" "=k") + (zero_extend:SWI248x + (unspec: +- [(match_operand:VI12_AVX512VL 1 "nonimmediate_operand") +- (match_operand:VI12_AVX512VL 2 "nonimmediate_operand") +- (match_operand:SI 3 "const_0_to_7_operand")] ++ [(match_operand:VI12_AVX512VL 1 "nonimmediate_operand" "v") ++ (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") ++ (match_operand:SI 3 "const_0_to_7_operand" "n")] + UNSPEC_PCMP)))] + "TARGET_AVX512BW +- && ix86_pre_reload_split () +- && (GET_MODE_NUNITS (mode) +- < GET_MODE_PRECISION (mode))" +- "#" +- "&& 1" +- [(set (match_dup 0) +- (unspec: +- [(match_dup 1) +- (match_dup 2) +- (match_dup 3)] +- UNSPEC_PCMP))] +-{ +- operands[1] = force_reg (mode, operands[1]); +- operands[0] = lowpart_subreg (mode, +- operands[0], mode); +-} ++ && (GET_MODE_NUNITS (mode) ++ < GET_MODE_PRECISION (mode))" ++ "vpcmp\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") +@@ -3877,16 +3852,21 @@ + "#" + "&& 1" + [(set (match_dup 0) +- (unspec: +- [(match_dup 1) +- (match_dup 2) +- (match_dup 3)] +- UNSPEC_PCMP)) +- (set (match_dup 4) (match_dup 0))] ++ (zero_extend:SWI248x ++ (unspec: ++ [(match_dup 1) ++ (match_dup 2) ++ (match_dup 3)] ++ UNSPEC_PCMP))) ++ (set (match_dup 4) (match_dup 5))] + { +- operands[1] = force_reg (mode, operands[1]); +- operands[0] = lowpart_subreg (mode, ++ operands[5] = lowpart_subreg (mode, + operands[0], mode); ++ if (SUBREG_P (operands[5])) ++ { ++ SUBREG_PROMOTED_VAR_P (operands[5]) = 1; ++ SUBREG_PROMOTED_SET (operands[5], 1); ++ } + } + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") +@@ -3945,31 +3925,18 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "")]) + +-(define_insn_and_split "*_ucmp3_zero_extend" +- [(set (match_operand:SWI248x 0 "register_operand") ++(define_insn "*_ucmp3_zero_extend" ++ [(set (match_operand:SWI248x 0 "register_operand" "=k") + (zero_extend:SWI248x + (unspec: +- [(match_operand:VI12_AVX512VL 1 "nonimmediate_operand") +- (match_operand:VI12_AVX512VL 2 "nonimmediate_operand") +- (match_operand:SI 3 "const_0_to_7_operand")] ++ [(match_operand:VI12_AVX512VL 1 "nonimmediate_operand" "v") ++ (match_operand:VI12_AVX512VL 2 "nonimmediate_operand" "vm") ++ (match_operand:SI 3 "const_0_to_7_operand" "n")] + UNSPEC_UNSIGNED_PCMP)))] + "TARGET_AVX512BW +- && ix86_pre_reload_split () + && (GET_MODE_NUNITS (mode) + < GET_MODE_PRECISION (mode))" +- "#" +- "&& 1" +- [(set (match_dup 0) +- (unspec: +- [(match_dup 1) +- (match_dup 2) +- (match_dup 3)] +- UNSPEC_UNSIGNED_PCMP))] +-{ +- operands[1] = force_reg (mode, operands[1]); +- operands[0] = lowpart_subreg (mode, +- operands[0], mode); +-} ++ "vpcmpu\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") +@@ -3997,16 +3964,21 @@ + "#" + "&& 1" + [(set (match_dup 0) +- (unspec: +- [(match_dup 1) +- (match_dup 2) +- (match_dup 3)] +- UNSPEC_UNSIGNED_PCMP)) +- (set (match_dup 4) (match_dup 0))] +-{ +- operands[1] = force_reg (mode, operands[1]); +- operands[0] = lowpart_subreg (mode, ++ (zero_extend:SWI248x ++ (unspec: ++ [(match_dup 1) ++ (match_dup 2) ++ (match_dup 3)] ++ UNSPEC_UNSIGNED_PCMP))) ++ (set (match_dup 4) (match_dup 5))] ++{ ++ operands[5] = lowpart_subreg (mode, + operands[0], mode); ++ if (SUBREG_P (operands[5])) ++ { ++ SUBREG_PROMOTED_VAR_P (operands[5]) = 1; ++ SUBREG_PROMOTED_SET (operands[5], 1); ++ } + } + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") +@@ -4043,32 +4015,19 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "")]) + +-(define_insn_and_split "*_ucmp3_zero_extend" +- [(set (match_operand:SWI248x 0 "register_operand") ++(define_insn "*_ucmp3_zero_extend" ++ [(set (match_operand:SWI248x 0 "register_operand" "=k") + (zero_extend:SWI248x + (unspec: +- [(match_operand:VI48_AVX512VL 1 "nonimmediate_operand") +- (match_operand:VI48_AVX512VL 2 "nonimmediate_operand") +- (match_operand:SI 3 "const_0_to_7_operand")] ++ [(match_operand:VI48_AVX512VL 1 "nonimmediate_operand" "v") ++ (match_operand:VI48_AVX512VL 2 "nonimmediate_operand" "vm") ++ (match_operand:SI 3 "const_0_to_7_operand" "n")] + UNSPEC_UNSIGNED_PCMP)))] + "TARGET_AVX512F + && (!VALID_MASK_AVX512BW_MODE (mode) || TARGET_AVX512BW) +- && ix86_pre_reload_split () + && (GET_MODE_NUNITS (mode) + < GET_MODE_PRECISION (mode))" +- "#" +- "&& 1" +- [(set (match_dup 0) +- (unspec: +- [(match_dup 1) +- (match_dup 2) +- (match_dup 3)] +- UNSPEC_UNSIGNED_PCMP))] +-{ +- operands[1] = force_reg (mode, operands[1]); +- operands[0] = lowpart_subreg (mode, +- operands[0], mode); +-} ++ "vpcmpu\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") + (set_attr "prefix" "evex") +@@ -4096,16 +4055,21 @@ + "#" + "&& 1" + [(set (match_dup 0) +- (unspec: +- [(match_dup 1) +- (match_dup 2) +- (match_dup 3)] +- UNSPEC_UNSIGNED_PCMP)) +- (set (match_dup 4) (match_dup 0))] +-{ +- operands[1] = force_reg (mode, operands[1]); +- operands[0] = lowpart_subreg (mode, ++ (zero_extend:SWI248x ++ (unspec: ++ [(match_dup 1) ++ (match_dup 2) ++ (match_dup 3)] ++ UNSPEC_UNSIGNED_PCMP))) ++ (set (match_dup 4) (match_dup 5))] ++{ ++ operands[5] = lowpart_subreg (mode, + operands[0], mode); ++ if (SUBREG_P (operands[5])) ++ { ++ SUBREG_PROMOTED_VAR_P (operands[5]) = 1; ++ SUBREG_PROMOTED_SET (operands[5], 1); ++ } + } + [(set_attr "type" "ssecmp") + (set_attr "length_immediate" "1") +diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-1.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-1.c +index b1165f069bb..e7d6183232b 100644 +--- a/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-1.c ++++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-1.c +@@ -1,8 +1,7 @@ + /* PR target/103750 */ + /* { dg-do compile } */ + /* { dg-options "-O2 -mavx512bw -mavx512vl" } */ +-/* { dg-final { scan-assembler-not "kmov" { xfail ia32 } } } */ +-/* xfail need to be fixed. */ ++/* { dg-final { scan-assembler-not "kmov" } } */ + + #include + extern __m128i* pi128; +diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c +index 7303f5403ba..3392e193222 100644 +--- a/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c ++++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c +@@ -1,8 +1,7 @@ + /* PR target/103750 */ + /* { dg-do compile } */ + /* { dg-options "-O2 -mavx512dq -mavx512bw -mavx512vl" } */ +-/* { dg-final { scan-assembler-not "kmov" { xfail ia32 } } } */ +-/* xfail need to be fixed. */ ++/* { dg-final { scan-assembler-not "kmov" } } */ + + #include + extern __m128i* pi128; +diff --git a/gcc/testsuite/gcc.target/i386/pr117159.c b/gcc/testsuite/gcc.target/i386/pr117159.c +new file mode 100644 +index 00000000000..b67d682ecef +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr117159.c +@@ -0,0 +1,42 @@ ++/* { dg-do run } */ ++/* { dg-options "-Os -mavx512bw" } */ ++/* { dg-require-effective-target avx512bw } */ ++ ++typedef __attribute__((__vector_size__ (4))) unsigned char W; ++typedef __attribute__((__vector_size__ (64))) int V; ++typedef __attribute__((__vector_size__ (64))) long long Vq; ++ ++W w; ++V v; ++Vq vq; ++ ++static inline W ++foo (short m) ++{ ++ unsigned k = __builtin_ia32_pcmpgtq512_mask ((Vq) { }, vq, m); ++ W r = (W) k + w; ++ return r; ++} ++ ++static inline W ++foo1 (short m) ++{ ++ unsigned k = __builtin_ia32_pcmpgtd512_mask ((V) {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, v, m); ++ W r = (W) k + w; ++ return r; ++} ++ ++int ++main () ++{ ++ if (!__builtin_cpu_supports ("avx512bw")) ++ return 0; ++ W y = foo1 (65535); ++ if (!y[0] || !y[1] || y[2] || y[3]) ++ __builtin_abort(); ++ W x = foo (65535); ++ if (x[0] || x[1] || x[2] || x[3]) ++ __builtin_abort(); ++ ++ return 0; ++} +-- +2.31.1 + diff --git a/INTEL-0042-Fix-ICE-due-to-isa-mismatch-for-the-builtins.patch b/INTEL-0042-Fix-ICE-due-to-isa-mismatch-for-the-builtins.patch new file mode 100644 index 0000000..7488bb8 --- /dev/null +++ b/INTEL-0042-Fix-ICE-due-to-isa-mismatch-for-the-builtins.patch @@ -0,0 +1,95 @@ +From 6e8e4260a895298d27660783aaac45bb2e13941f Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Tue, 22 Oct 2024 01:54:40 -0700 +Subject: [PATCH 10/14] Fix ICE due to isa mismatch for the builtins. + +gcc/ChangeLog: + + PR target/117240 + * config/i386/i386-builtin.def: Add avx/avx512f to vaes + ymm/zmm builtins. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/pr117240_avx.c: New test. + * gcc.target/i386/pr117240_avx512f.c: New test. + +(cherry picked from commit 403e361d5aa620e77c9832578b2409a0fdd79d96) +--- + gcc/config/i386/i386-builtin.def | 24 +++++++++---------- + gcc/testsuite/gcc.target/i386/pr117240_avx.c | 10 ++++++++ + .../gcc.target/i386/pr117240_avx512f.c | 10 ++++++++ + 3 files changed, 32 insertions(+), 12 deletions(-) + create mode 100644 gcc/testsuite/gcc.target/i386/pr117240_avx.c + create mode 100644 gcc/testsuite/gcc.target/i386/pr117240_avx512f.c + +diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def +index d3ab21eeac3..d1713b70e07 100644 +--- a/gcc/config/i386/i386-builtin.def ++++ b/gcc/config/i386/i386-builtin.def +@@ -2751,18 +2751,18 @@ BDESC (0, OPTION_MASK_ISA2_AVX5124VNNIW, CODE_FOR_avx5124vnniw_vp4dpwssds_mask, + BDESC (0, OPTION_MASK_ISA2_RDPID, CODE_FOR_rdpid, "__builtin_ia32_rdpid", IX86_BUILTIN_RDPID, UNKNOWN, (int) UNSIGNED_FTYPE_VOID) + + /* VAES. */ +-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v16qi, "__builtin_ia32_vaesdec_v16qi", IX86_BUILTIN_VAESDEC16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) +-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v32qi, "__builtin_ia32_vaesdec_v32qi", IX86_BUILTIN_VAESDEC32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) +-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v64qi, "__builtin_ia32_vaesdec_v64qi", IX86_BUILTIN_VAESDEC64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) +-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v16qi, "__builtin_ia32_vaesdeclast_v16qi", IX86_BUILTIN_VAESDECLAST16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) +-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v32qi, "__builtin_ia32_vaesdeclast_v32qi", IX86_BUILTIN_VAESDECLAST32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) +-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v64qi, "__builtin_ia32_vaesdeclast_v64qi", IX86_BUILTIN_VAESDECLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) +-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v16qi, "__builtin_ia32_vaesenc_v16qi", IX86_BUILTIN_VAESENC16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) +-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v32qi, "__builtin_ia32_vaesenc_v32qi", IX86_BUILTIN_VAESENC32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) +-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v64qi, "__builtin_ia32_vaesenc_v64qi", IX86_BUILTIN_VAESENC64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) +-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v16qi, "__builtin_ia32_vaesenclast_v16qi", IX86_BUILTIN_VAESENCLAST16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) +-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v32qi, "__builtin_ia32_vaesenclast_v32qi", IX86_BUILTIN_VAESENCLAST32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) +-BDESC (0, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v64qi, "__builtin_ia32_vaesenclast_v64qi", IX86_BUILTIN_VAESENCLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) ++BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v16qi, "__builtin_ia32_vaesdec_v16qi", IX86_BUILTIN_VAESDEC16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) ++BDESC (OPTION_MASK_ISA_AVX, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v32qi, "__builtin_ia32_vaesdec_v32qi", IX86_BUILTIN_VAESDEC32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) ++BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdec_v64qi, "__builtin_ia32_vaesdec_v64qi", IX86_BUILTIN_VAESDEC64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) ++BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v16qi, "__builtin_ia32_vaesdeclast_v16qi", IX86_BUILTIN_VAESDECLAST16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) ++BDESC (OPTION_MASK_ISA_AVX, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v32qi, "__builtin_ia32_vaesdeclast_v32qi", IX86_BUILTIN_VAESDECLAST32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) ++BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesdeclast_v64qi, "__builtin_ia32_vaesdeclast_v64qi", IX86_BUILTIN_VAESDECLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) ++BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v16qi, "__builtin_ia32_vaesenc_v16qi", IX86_BUILTIN_VAESENC16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) ++BDESC (OPTION_MASK_ISA_AVX, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v32qi, "__builtin_ia32_vaesenc_v32qi", IX86_BUILTIN_VAESENC32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) ++BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenc_v64qi, "__builtin_ia32_vaesenc_v64qi", IX86_BUILTIN_VAESENC64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) ++BDESC (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v16qi, "__builtin_ia32_vaesenclast_v16qi", IX86_BUILTIN_VAESENCLAST16, UNKNOWN, (int) V16QI_FTYPE_V16QI_V16QI) ++BDESC (OPTION_MASK_ISA_AVX, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v32qi, "__builtin_ia32_vaesenclast_v32qi", IX86_BUILTIN_VAESENCLAST32, UNKNOWN, (int) V32QI_FTYPE_V32QI_V32QI) ++BDESC (OPTION_MASK_ISA_AVX512F, OPTION_MASK_ISA2_VAES, CODE_FOR_vaesenclast_v64qi, "__builtin_ia32_vaesenclast_v64qi", IX86_BUILTIN_VAESENCLAST64, UNKNOWN, (int) V64QI_FTYPE_V64QI_V64QI) + + /* BF16 */ + BDESC (0, OPTION_MASK_ISA2_AVX512BF16, CODE_FOR_avx512f_cvtne2ps2bf16_v32hi, "__builtin_ia32_cvtne2ps2bf16_v32hi", IX86_BUILTIN_CVTNE2PS2HI16_V32HI, UNKNOWN, (int) V32HI_FTYPE_V16SF_V16SF) +diff --git a/gcc/testsuite/gcc.target/i386/pr117240_avx.c b/gcc/testsuite/gcc.target/i386/pr117240_avx.c +new file mode 100644 +index 00000000000..24a97a9f74c +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr117240_avx.c +@@ -0,0 +1,10 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -mvaes -mno-xsave -Wno-psabi -Wno-implicit-function-declaration" } */ ++ ++typedef __attribute__((__vector_size__(32))) char V; ++ ++V ++foo(V v) ++{ ++ return __builtin_ia32_vaesenc_v32qi(v, v);/* { dg-error "incompatible types when returning" } */ ++} +diff --git a/gcc/testsuite/gcc.target/i386/pr117240_avx512f.c b/gcc/testsuite/gcc.target/i386/pr117240_avx512f.c +new file mode 100644 +index 00000000000..1e7b5a88d7a +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr117240_avx512f.c +@@ -0,0 +1,10 @@ ++/* { dg-do compile } */ ++/* { dg-options "-O2 -mvaes -mno-xsave -Wno-psabi -Wno-implicit-function-declaration" } */ ++ ++typedef __attribute__((__vector_size__(64))) char V; ++ ++V ++foo(V v) ++{ ++ return __builtin_ia32_vaesenc_v64qi(v, v);/* { dg-error "incompatible types when returning" } */ ++} +-- +2.31.1 + diff --git a/INTEL-0043-Fix-ICE-due-to-subreg-us_truncate.patch b/INTEL-0043-Fix-ICE-due-to-subreg-us_truncate.patch new file mode 100644 index 0000000..ca4578d --- /dev/null +++ b/INTEL-0043-Fix-ICE-due-to-subreg-us_truncate.patch @@ -0,0 +1,444 @@ +From b817cad361eac1754101114b9beb7abc1aab3435 Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Tue, 29 Oct 2024 02:09:39 -0700 +Subject: [PATCH 11/14] Fix ICE due to subreg:us_truncate. + +Force_operand issues an ICE when input +is (subreg:DI (us_truncate:V8QI)), it's probably because it's an +invalid rtx, So refine backend patterns for that. + +gcc/ChangeLog: + + PR target/117318 + * config/i386/sse.md (*avx512vl_v2div2qi2_mask_store_1): + Rename to .. + (avx512vl_v2div2qi2_mask_store_1): .. this. + (avx512vl_v2div2qi2_mask_store_2): Change to + define_expand. + (*avx512vl_v4qi2_mask_store_1): Rename to .. + (avx512vl_v4qi2_mask_store_1): .. this. + (avx512vl_v4qi2_mask_store_2): Change to + define_expand. + (*avx512vl_v8qi2_mask_store_1): Rename to .. + (avx512vl_v8qi2_mask_store_1): .. this. + (avx512vl_v8qi2_mask_store_2): Change to + define_expand. + (*avx512vl_v4hi2_mask_store_1): Rename to .. + (avx512vl_v4hi2_mask_store_1): .. this. + (avx512vl_v4hi2_mask_store_2): Change to + define_expand. + (*avx512vl_v2div2hi2_mask_store_1): Rename to .. + (avx512vl_v2div2hi2_mask_store_1): .. this. + (avx512vl_v2div2hi2_mask_store_2): Change to + define_expand. + (*avx512vl_v2div2si2_mask_store_1): Rename to .. + (avx512vl_v2div2si2_mask_store_1): .. this. + (avx512vl_v2div2si2_mask_store_2): Change to + define_expand. + (*avx512f_v8div16qi2_mask_store_1): Rename to .. + (avx512f_v8div16qi2_mask_store_1): .. this. + (avx512f_v8div16qi2_mask_store_2): Change to + define_expand. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/pr117318.c: New test. + +(cherry picked from commit bc0eeccf27a084461a2d5661e23468350acb43da) +--- + gcc/config/i386/sse.md | 268 +++++++++-------------- + gcc/testsuite/gcc.target/i386/pr117318.c | 12 + + 2 files changed, 110 insertions(+), 170 deletions(-) + create mode 100644 gcc/testsuite/gcc.target/i386/pr117318.c + +diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md +index 7d01c00a848..a7d61bf0044 100644 +--- a/gcc/config/i386/sse.md ++++ b/gcc/config/i386/sse.md +@@ -13850,7 +13850,7 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +-(define_insn "*avx512vl_v2div2qi2_mask_store_1" ++(define_insn "avx512vl_v2div2qi2_mask_store_1" + [(set (match_operand:V2QI 0 "memory_operand" "=m") + (vec_merge:V2QI + (any_truncate:V2QI +@@ -13864,28 +13864,19 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +-(define_insn_and_split "avx512vl_v2div2qi2_mask_store_2" +- [(set (match_operand:HI 0 "memory_operand") +- (subreg:HI +- (vec_merge:V2QI +- (any_truncate:V2QI +- (match_operand:V2DI 1 "register_operand")) +- (vec_select:V2QI +- (subreg:V4QI +- (vec_concat:V2HI +- (match_dup 0) +- (const_int 0)) 0) +- (parallel [(const_int 0) (const_int 1)])) +- (match_operand:QI 2 "register_operand")) 0))] +- "TARGET_AVX512VL && ix86_pre_reload_split ()" +- "#" +- "&& 1" +- [(set (match_dup 0) +- (vec_merge:V2QI +- (any_truncate:V2QI (match_dup 1)) +- (match_dup 0) +- (match_dup 2)))] +- "operands[0] = adjust_address_nv (operands[0], V2QImode, 0);") ++(define_expand "avx512vl_v2div2qi2_mask_store_2" ++ [(match_operand:HI 0 "memory_operand") ++ (any_truncate:V2QI ++ (match_operand:V2DI 1 "register_operand")) ++ (match_operand:QI 2 "register_operand")] ++ "TARGET_AVX512VL" ++{ ++ operands[0] = adjust_address_nv (operands[0], V2QImode, 0); ++ emit_insn (gen_avx512vl_v2div2qi2_mask_store_1 (operands[0], ++ operands[1], ++ operands[2])); ++ DONE; ++}) + + (define_insn "*avx512vl_v4qi2_store_1" + [(set (match_operand:V4QI 0 "memory_operand" "=m") +@@ -13954,7 +13945,7 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +-(define_insn "*avx512vl_v4qi2_mask_store_1" ++(define_insn "avx512vl_v4qi2_mask_store_1" + [(set (match_operand:V4QI 0 "memory_operand" "=m") + (vec_merge:V4QI + (any_truncate:V4QI +@@ -13968,29 +13959,19 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +-(define_insn_and_split "avx512vl_v4qi2_mask_store_2" +- [(set (match_operand:SI 0 "memory_operand") +- (subreg:SI +- (vec_merge:V4QI +- (any_truncate:V4QI +- (match_operand:VI4_128_8_256 1 "register_operand")) +- (vec_select:V4QI +- (subreg:V8QI +- (vec_concat:V2SI +- (match_dup 0) +- (const_int 0)) 0) +- (parallel [(const_int 0) (const_int 1) +- (const_int 2) (const_int 3)])) +- (match_operand:QI 2 "register_operand")) 0))] +- "TARGET_AVX512VL && ix86_pre_reload_split ()" +- "#" +- "&& 1" +- [(set (match_dup 0) +- (vec_merge:V4QI +- (any_truncate:V4QI (match_dup 1)) +- (match_dup 0) +- (match_dup 2)))] +- "operands[0] = adjust_address_nv (operands[0], V4QImode, 0);") ++(define_expand "avx512vl_v4qi2_mask_store_2" ++ [(match_operand:SI 0 "memory_operand") ++ (any_truncate:V4QI ++ (match_operand:VI4_128_8_256 1 "register_operand")) ++ (match_operand:QI 2 "register_operand")] ++ "TARGET_AVX512VL" ++{ ++ operands[0] = adjust_address_nv (operands[0], V4QImode, 0); ++ emit_insn (gen_avx512vl_v4qi2_mask_store_1 (operands[0], ++ operands[1], ++ operands[2])); ++ DONE; ++}) + + (define_mode_iterator VI2_128_BW_4_256 + [(V8HI "TARGET_AVX512BW") V8SI]) +@@ -14062,7 +14043,7 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +-(define_insn "*avx512vl_v8qi2_mask_store_1" ++(define_insn "avx512vl_v8qi2_mask_store_1" + [(set (match_operand:V8QI 0 "memory_operand" "=m") + (vec_merge:V8QI + (any_truncate:V8QI +@@ -14076,31 +14057,19 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +-(define_insn_and_split "avx512vl_v8qi2_mask_store_2" +- [(set (match_operand:DI 0 "memory_operand") +- (subreg:DI +- (vec_merge:V8QI +- (any_truncate:V8QI +- (match_operand:VI2_128_BW_4_256 1 "register_operand")) +- (vec_select:V8QI +- (subreg:V16QI +- (vec_concat:V2DI +- (match_dup 0) +- (const_int 0)) 0) +- (parallel [(const_int 0) (const_int 1) +- (const_int 2) (const_int 3) +- (const_int 4) (const_int 5) +- (const_int 6) (const_int 7)])) +- (match_operand:QI 2 "register_operand")) 0))] +- "TARGET_AVX512VL && ix86_pre_reload_split ()" +- "#" +- "&& 1" +- [(set (match_dup 0) +- (vec_merge:V8QI +- (any_truncate:V8QI (match_dup 1)) +- (match_dup 0) +- (match_dup 2)))] +- "operands[0] = adjust_address_nv (operands[0], V8QImode, 0);") ++(define_expand "avx512vl_v8qi2_mask_store_2" ++ [(match_operand:DI 0 "memory_operand") ++ (any_truncate:V8QI ++ (match_operand:VI2_128_BW_4_256 1 "register_operand")) ++ (match_operand:QI 2 "register_operand")] ++ "TARGET_AVX512VL" ++{ ++ operands[0] = adjust_address_nv (operands[0], V8QImode, 0); ++ emit_insn (gen_avx512vl_v8qi2_mask_store_1 (operands[0], ++ operands[1], ++ operands[2])); ++ DONE; ++}) + + (define_mode_iterator PMOV_SRC_MODE_4 [V4DI V2DI V4SI]) + (define_mode_attr pmov_dst_4 +@@ -14222,7 +14191,7 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +-(define_insn "*avx512vl_v4hi2_mask_store_1" ++(define_insn "avx512vl_v4hi2_mask_store_1" + [(set (match_operand:V4HI 0 "memory_operand" "=m") + (vec_merge:V4HI + (any_truncate:V4HI +@@ -14240,30 +14209,19 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +-(define_insn_and_split "avx512vl_v4hi2_mask_store_2" +- [(set (match_operand:DI 0 "memory_operand") +- (subreg:DI +- (vec_merge:V4HI +- (any_truncate:V4HI +- (match_operand:VI4_128_8_256 1 "register_operand")) +- (vec_select:V4HI +- (subreg:V8HI +- (vec_concat:V2DI +- (match_dup 0) +- (const_int 0)) 0) +- (parallel [(const_int 0) (const_int 1) +- (const_int 2) (const_int 3)])) +- (match_operand:QI 2 "register_operand")) 0))] +- "TARGET_AVX512VL && ix86_pre_reload_split ()" +- "#" +- "&& 1" +- [(set (match_dup 0) +- (vec_merge:V4HI +- (any_truncate:V4HI (match_dup 1)) +- (match_dup 0) +- (match_dup 2)))] +- "operands[0] = adjust_address_nv (operands[0], V4HImode, 0);") +- ++(define_expand "avx512vl_v4hi2_mask_store_2" ++ [(match_operand:DI 0 "memory_operand") ++ (any_truncate:V4HI ++ (match_operand:VI4_128_8_256 1 "register_operand")) ++ (match_operand:QI 2 "register_operand")] ++ "TARGET_AVX512VL" ++{ ++ operands[0] = adjust_address_nv (operands[0], V4HImode, 0); ++ emit_insn (gen_avx512vl_v4hi2_mask_store_1 (operands[0], ++ operands[1], ++ operands[2])); ++ DONE; ++}) + + (define_insn "*avx512vl_v2div2hi2_store_1" + [(set (match_operand:V2HI 0 "memory_operand" "=m") +@@ -14324,7 +14282,7 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +-(define_insn "*avx512vl_v2div2hi2_mask_store_1" ++(define_insn "avx512vl_v2div2hi2_mask_store_1" + [(set (match_operand:V2HI 0 "memory_operand" "=m") + (vec_merge:V2HI + (any_truncate:V2HI +@@ -14338,28 +14296,19 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +-(define_insn_and_split "avx512vl_v2div2hi2_mask_store_2" +- [(set (match_operand:SI 0 "memory_operand") +- (subreg:SI +- (vec_merge:V2HI +- (any_truncate:V2HI +- (match_operand:V2DI 1 "register_operand")) +- (vec_select:V2HI +- (subreg:V4HI +- (vec_concat:V2SI +- (match_dup 0) +- (const_int 0)) 0) +- (parallel [(const_int 0) (const_int 1)])) +- (match_operand:QI 2 "register_operand")) 0))] +- "TARGET_AVX512VL && ix86_pre_reload_split ()" +- "#" +- "&& 1" +- [(set (match_dup 0) +- (vec_merge:V2HI +- (any_truncate:V2HI (match_dup 1)) +- (match_dup 0) +- (match_dup 2)))] +- "operands[0] = adjust_address_nv (operands[0], V2HImode, 0);") ++(define_expand "avx512vl_v2div2hi2_mask_store_2" ++ [(match_operand:SI 0 "memory_operand") ++ (any_truncate:V2HI ++ (match_operand:V2DI 1 "register_operand")) ++ (match_operand:QI 2 "register_operand")] ++ "TARGET_AVX512VL" ++{ ++ operands[0] = adjust_address_nv (operands[0], V2HImode, 0); ++ emit_insn (gen_avx512vl_v2div2hi2_mask_store_1 (operands[0], ++ operands[1], ++ operands[2])); ++ DONE; ++}) + + (define_expand "truncv2div2si2" + [(set (match_operand:V2SI 0 "register_operand") +@@ -14467,7 +14416,7 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +-(define_insn "*avx512vl_v2div2si2_mask_store_1" ++(define_insn "avx512vl_v2div2si2_mask_store_1" + [(set (match_operand:V2SI 0 "memory_operand" "=m") + (vec_merge:V2SI + (any_truncate:V2SI +@@ -14481,28 +14430,19 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +-(define_insn_and_split "avx512vl_v2div2si2_mask_store_2" +- [(set (match_operand:DI 0 "memory_operand") +- (subreg:DI +- (vec_merge:V2SI +- (any_truncate:V2SI +- (match_operand:V2DI 1 "register_operand")) +- (vec_select:V2SI +- (subreg:V4SI +- (vec_concat:V2DI +- (match_dup 0) +- (const_int 0)) 0) +- (parallel [(const_int 0) (const_int 1)])) +- (match_operand:QI 2 "register_operand")) 0))] +- "TARGET_AVX512VL && ix86_pre_reload_split ()" +- "#" +- "&& 1" +- [(set (match_dup 0) +- (vec_merge:V2SI +- (any_truncate:V2SI (match_dup 1)) +- (match_dup 0) +- (match_dup 2)))] +- "operands[0] = adjust_address_nv (operands[0], V2SImode, 0);") ++(define_expand "avx512vl_v2div2si2_mask_store_2" ++ [(match_operand:DI 0 "memory_operand") ++ (any_truncate:V2SI ++ (match_operand:V2DI 1 "register_operand")) ++ (match_operand:QI 2 "register_operand")] ++ "TARGET_AVX512VL" ++{ ++ operands[0] = adjust_address_nv (operands[0], V2SImode, 0); ++ emit_insn (gen_avx512vl_v2div2si2_mask_store_1 (operands[0], ++ operands[1], ++ operands[2])); ++ DONE; ++}) + + (define_expand "truncv8div8qi2" + [(set (match_operand:V8QI 0 "register_operand") +@@ -14601,7 +14541,7 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +-(define_insn "*avx512f_v8div16qi2_mask_store_1" ++(define_insn "avx512f_v8div16qi2_mask_store_1" + [(set (match_operand:V8QI 0 "memory_operand" "=m") + (vec_merge:V8QI + (any_truncate:V8QI +@@ -14615,31 +14555,19 @@ + (set_attr "prefix" "evex") + (set_attr "mode" "TI")]) + +-(define_insn_and_split "avx512f_v8div16qi2_mask_store_2" +- [(set (match_operand:DI 0 "memory_operand") +- (subreg:DI +- (vec_merge:V8QI +- (any_truncate:V8QI +- (match_operand:V8DI 1 "register_operand")) +- (vec_select:V8QI +- (subreg:V16QI +- (vec_concat:V2DI +- (match_dup 0) +- (const_int 0)) 0) +- (parallel [(const_int 0) (const_int 1) +- (const_int 2) (const_int 3) +- (const_int 4) (const_int 5) +- (const_int 6) (const_int 7)])) +- (match_operand:QI 2 "register_operand")) 0))] +- "TARGET_AVX512F && ix86_pre_reload_split ()" +- "#" +- "&& 1" +- [(set (match_dup 0) +- (vec_merge:V8QI +- (any_truncate:V8QI (match_dup 1)) +- (match_dup 0) +- (match_dup 2)))] +- "operands[0] = adjust_address_nv (operands[0], V8QImode, 0);") ++(define_expand "avx512f_v8div16qi2_mask_store_2" ++ [(match_operand:DI 0 "memory_operand") ++ (any_truncate:V8QI ++ (match_operand:V8DI 1 "register_operand")) ++ (match_operand:QI 2 "register_operand")] ++ "TARGET_AVX512F" ++{ ++ operands[0] = adjust_address_nv (operands[0], V8QImode, 0); ++ emit_insn (gen_avx512f_v8div16qi2_mask_store_1 (operands[0], ++ operands[1], ++ operands[2])); ++ DONE; ++}) + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; +diff --git a/gcc/testsuite/gcc.target/i386/pr117318.c b/gcc/testsuite/gcc.target/i386/pr117318.c +new file mode 100644 +index 00000000000..3d316ad04cf +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr117318.c +@@ -0,0 +1,12 @@ ++/* { dg-do compile } */ ++/* { dg-options "-mavx512f -O" } */ ++ ++typedef __attribute__((__vector_size__ (64))) long long V; ++unsigned long long x; ++ ++unsigned long long ++foo() ++{ ++ __builtin_ia32_pmovusqb512mem_mask (&x, (V){8000000000000000}, 255); ++ return x; ++} +-- +2.31.1 + diff --git a/INTEL-0044-i386-Zero-extend-32-bit-address-to-64-bit-with-optio.patch b/INTEL-0044-i386-Zero-extend-32-bit-address-to-64-bit-with-optio.patch new file mode 100644 index 0000000..6fd40e3 --- /dev/null +++ b/INTEL-0044-i386-Zero-extend-32-bit-address-to-64-bit-with-optio.patch @@ -0,0 +1,104 @@ +From a23da8527f517e439ab634d9995b44740cbbc05b Mon Sep 17 00:00:00 2001 +From: "Hu, Lin1" +Date: Wed, 6 Nov 2024 15:42:13 +0800 +Subject: [PATCH 12/14] i386: Zero extend 32-bit address to 64-bit with option + -mx32 -maddress-mode=long. [PR 117418] + +-maddress-mode=long let Pmode = DI_mode, so zero extend 32-bit address to +64-bit and uses a 64-bit register as a pointer for avoid raise an ICE. + +gcc/ChangeLog: + + PR target/117418 + * config/i386/i386-expand.cc (ix86_expand_builtin): Convert + pointer's mode according to Pmode. + +gcc/testsuite/ChangeLog: + + PR target/117418 + * gcc.target/i386/pr117418-1.c: New test. + +(cherry picked from commit 2272cd2508f1854c880082f792de15e76ec09a99) +--- + gcc/config/i386/i386-expand.cc | 12 +++++++++++ + gcc/testsuite/gcc.target/i386/pr117418-1.c | 24 ++++++++++++++++++++++ + 2 files changed, 36 insertions(+) + create mode 100644 gcc/testsuite/gcc.target/i386/pr117418-1.c + +diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc +index bc2e6198007..52e32749928 100644 +--- a/gcc/config/i386/i386-expand.cc ++++ b/gcc/config/i386/i386-expand.cc +@@ -12730,6 +12730,9 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget, + op1 = expand_normal (arg1); + op2 = expand_normal (arg2); + ++ if (GET_MODE (op1) != Pmode) ++ op1 = convert_to_mode (Pmode, op1, 1); ++ + if (!address_operand (op2, VOIDmode)) + { + op2 = convert_memory_address (Pmode, op2); +@@ -12765,6 +12768,9 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget, + emit_label (ok_label); + emit_insn (gen_rtx_SET (target, pat)); + ++ if (GET_MODE (op0) != Pmode) ++ op0 = convert_to_mode (Pmode, op0, 1); ++ + for (i = 0; i < 8; i++) + { + op = gen_rtx_MEM (V2DImode, +@@ -12789,6 +12795,9 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget, + if (!REG_P (op0)) + op0 = copy_to_mode_reg (SImode, op0); + ++ if (GET_MODE (op2) != Pmode) ++ op2 = convert_to_mode (Pmode, op2, 1); ++ + op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0)); + emit_move_insn (op, op1); + +@@ -12826,6 +12835,9 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget, + if (!REG_P (op0)) + op0 = copy_to_mode_reg (SImode, op0); + ++ if (GET_MODE (op3) != Pmode) ++ op3 = convert_to_mode (Pmode, op3, 1); ++ + /* Force to use xmm0, xmm1 for keylow, keyhi*/ + op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0)); + emit_move_insn (op, op1); +diff --git a/gcc/testsuite/gcc.target/i386/pr117418-1.c b/gcc/testsuite/gcc.target/i386/pr117418-1.c +new file mode 100644 +index 00000000000..4839b139b79 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr117418-1.c +@@ -0,0 +1,24 @@ ++/* PR target/117418 */ ++/* { dg-do compile { target { ! ia32 } } } */ ++/* { dg-options "-maddress-mode=long -mwidekl -mx32" } */ ++/* { dg-require-effective-target maybe_x32 } */ ++/* { dg-final { scan-assembler-times "aesdec128kl" 1 } } */ ++/* { dg-final { scan-assembler-times "aesdec256kl" 1 } } */ ++/* { dg-final { scan-assembler-times "aesenc128kl" 1 } } */ ++/* { dg-final { scan-assembler-times "aesenc256kl" 1 } } */ ++/* { dg-final { scan-assembler-times "encodekey128" 1 } } */ ++/* { dg-final { scan-assembler-times "encodekey256" 1 } } */ ++ ++typedef __attribute__((__vector_size__(16))) long long V; ++V a; ++ ++void ++foo() ++{ ++ __builtin_ia32_aesdec128kl_u8 (&a, a, &a); ++ __builtin_ia32_aesdec256kl_u8 (&a, a, &a); ++ __builtin_ia32_aesenc128kl_u8 (&a, a, &a); ++ __builtin_ia32_aesenc256kl_u8 (&a, a, &a); ++ __builtin_ia32_encodekey128_u32 (0, a, &a); ++ __builtin_ia32_encodekey256_u32 (0, a, a, &a); ++} +-- +2.31.1 + diff --git a/INTEL-0045-Fix-uninitialized-operands-2-in-vec_unpacks_hi_v4sf.patch b/INTEL-0045-Fix-uninitialized-operands-2-in-vec_unpacks_hi_v4sf.patch new file mode 100644 index 0000000..b7e6f15 --- /dev/null +++ b/INTEL-0045-Fix-uninitialized-operands-2-in-vec_unpacks_hi_v4sf.patch @@ -0,0 +1,37 @@ +From 94ab46d9486464b3158a9fc9bc1c463dd4d62d72 Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Thu, 21 Nov 2024 23:57:38 -0800 +Subject: [PATCH 13/14] Fix uninitialized operands[2] in vec_unpacks_hi_v4sf. + +It could cause weired spill in RA when register pressure is high. + +gcc/ChangeLog: + + PR target/117562 + * config/i386/sse.md (vec_unpacks_hi_v4sf): Initialize + operands[2] with CONST0_RTX. + +(cherry picked from commit ba4cf2e296d8d5950c3d356fa6b6efcad00d0189) +--- + gcc/config/i386/sse.md | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md +index a7d61bf0044..c6a8e301145 100644 +--- a/gcc/config/i386/sse.md ++++ b/gcc/config/i386/sse.md +@@ -9126,7 +9126,10 @@ + (match_dup 2) + (parallel [(const_int 0) (const_int 1)]))))] + "TARGET_SSE2" +- "operands[2] = gen_reg_rtx (V4SFmode);") ++{ ++ operands[2] = gen_reg_rtx (V4SFmode); ++ emit_move_insn (operands[2], CONST0_RTX (V4SFmode)); ++}) + + (define_expand "vec_unpacks_hi_v8sf" + [(set (match_dup 2) +-- +2.31.1 + diff --git a/INTEL-0046-GCC13-GCC12-Fix-testcase.patch b/INTEL-0046-GCC13-GCC12-Fix-testcase.patch new file mode 100644 index 0000000..4d746f2 --- /dev/null +++ b/INTEL-0046-GCC13-GCC12-Fix-testcase.patch @@ -0,0 +1,34 @@ +From 6494fd12311561551bcf8d8529108fba79c45fd7 Mon Sep 17 00:00:00 2001 +From: liuhongt +Date: Tue, 22 Oct 2024 11:24:23 +0800 +Subject: [PATCH 14/14] [GCC13/GCC12] Fix testcase. + +The optimization relies on other patterns which are only available at +GCC14 and obove, so restore the xfail for GCC13/12 branch. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/avx512bw-pr103750-2.c: Add xfail for ia32. + +(cherry picked from commit 8b43518a01cbbbafe042b85a48fa09a32948380a) +--- + gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c +index 3392e193222..7303f5403ba 100644 +--- a/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c ++++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr103750-2.c +@@ -1,7 +1,8 @@ + /* PR target/103750 */ + /* { dg-do compile } */ + /* { dg-options "-O2 -mavx512dq -mavx512bw -mavx512vl" } */ +-/* { dg-final { scan-assembler-not "kmov" } } */ ++/* { dg-final { scan-assembler-not "kmov" { xfail ia32 } } } */ ++/* xfail need to be fixed. */ + + #include + extern __m128i* pi128; +-- +2.31.1 + diff --git a/INTEL-0047-i386-Remove-CLDEMOTE-for-clients.patch b/INTEL-0047-i386-Remove-CLDEMOTE-for-clients.patch new file mode 100644 index 0000000..2c7cc95 --- /dev/null +++ b/INTEL-0047-i386-Remove-CLDEMOTE-for-clients.patch @@ -0,0 +1,54 @@ +From 5eb102d763159435648ba4b80d39ff6f3368e68d Mon Sep 17 00:00:00 2001 +From: Haochen Jiang +Date: Wed, 25 Jun 2025 11:04:11 +0800 +Subject: [PATCH] i386: Remove CLDEMOTE for clients + +CLDEMOTE is not enabled on clients according to SDM. SDM only mentioned +it will be enabled on Xeon and Atom servers, not clients. Remove them +since Alder Lake (where it is introduced). + +gcc/ChangeLog: + + * config/i386/i386.h (PTA_ALDERLAKE): Use PTA_GOLDMONT_PLUS + as base to remove PTA_CLDEMOTE. + * doc/invoke.texi: Update texi file. + +(cherry picked from commit ee04d1554aa87d6155bd96e38c91351871273b51) +--- + gcc/config/i386/i386.h | 3 ++- + gcc/doc/invoke.texi | 5 ++--- + 2 files changed, 4 insertions(+), 4 deletions(-) + +diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h +index 4bcbcc781d6..cef05da3905 100644 +--- a/gcc/config/i386/i386.h ++++ b/gcc/config/i386/i386.h +@@ -2357,7 +2357,8 @@ constexpr wide_int_bitmask PTA_GOLDMONT_PLUS = PTA_GOLDMONT | PTA_RDPID + | PTA_SGX | PTA_PTWRITE; + constexpr wide_int_bitmask PTA_TREMONT = PTA_GOLDMONT_PLUS | PTA_CLWB + | PTA_GFNI | PTA_MOVDIRI | PTA_MOVDIR64B | PTA_CLDEMOTE | PTA_WAITPKG; +-constexpr wide_int_bitmask PTA_ALDERLAKE = PTA_TREMONT | PTA_ADX | PTA_AVX ++constexpr wide_int_bitmask PTA_ALDERLAKE = PTA_GOLDMONT_PLUS | PTA_CLWB ++ | PTA_GFNI | PTA_MOVDIRI | PTA_MOVDIR64B | PTA_WAITPKG | PTA_ADX | PTA_AVX + | PTA_AVX2 | PTA_BMI | PTA_BMI2 | PTA_F16C | PTA_FMA | PTA_LZCNT + | PTA_PCONFIG | PTA_PKU | PTA_VAES | PTA_VPCLMULQDQ | PTA_SERIALIZE + | PTA_HRESET | PTA_KL | PTA_WIDEKL | PTA_AVXVNNI; +diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi +index 3f20cd3b449..f1f0d7057cc 100644 +--- a/gcc/doc/invoke.texi ++++ b/gcc/doc/invoke.texi +@@ -31551,9 +31551,8 @@ VPCLMULQDQ, AVX512BITALG, RDPID and AVX512VPOPCNTDQ instruction set support. + Intel Alder Lake CPU with 64-bit extensions, MOVBE, MMX, SSE, SSE2, SSE3, SSSE3, + SSE4.1, SSE4.2, POPCNT, AES, PREFETCHW, PCLMUL, RDRND, XSAVE, XSAVEC, XSAVES, + XSAVEOPT, FSGSBASE, PTWRITE, RDPID, SGX, GFNI-SSE, CLWB, MOVDIRI, MOVDIR64B, +-CLDEMOTE, WAITPKG, ADCX, AVX, AVX2, BMI, BMI2, F16C, FMA, LZCNT, PCONFIG, PKU, +-VAES, VPCLMULQDQ, SERIALIZE, HRESET, KL, WIDEKL and AVX-VNNI instruction set +-support. ++WAITPKG, ADCX, AVX, AVX2, BMI, BMI2, F16C, FMA, LZCNT, PCONFIG, PKU, VAES, ++VPCLMULQDQ, SERIALIZE, HRESET, KL, WIDEKL and AVX-VNNI instruction set support. + + @item sapphirerapids + Intel Sapphire Rapids CPU with 64-bit extensions, MMX, SSE, SSE2, SSE3, SSSE3, +-- +2.31.1 + diff --git a/INTEL-0048-c-signed-__int128_t-PR108099.patch b/INTEL-0048-c-signed-__int128_t-PR108099.patch new file mode 100644 index 0000000..6cb1993 --- /dev/null +++ b/INTEL-0048-c-signed-__int128_t-PR108099.patch @@ -0,0 +1,59 @@ +From 9b24448ca8804b5bb99ef6eeba942f80663fca89 Mon Sep 17 00:00:00 2001 +From: Jason Merrill +Date: Thu, 9 Mar 2023 17:35:24 -0500 +Subject: [PATCH 1/4] c++: signed __int128_t [PR108099] + +The code for handling signed + typedef was breaking on __int128_t, because +it isn't a proper typedef: it doesn't have DECL_ORIGINAL_TYPE. + + PR c++/108099 + +gcc/cp/ChangeLog: + + * decl.cc (grokdeclarator): Handle non-typedef typedef_decl. + +gcc/testsuite/ChangeLog: + + * g++.dg/ext/int128-7.C: New test. +--- + gcc/cp/decl.cc | 11 ++++++++--- + gcc/testsuite/g++.dg/ext/int128-7.C | 4 ++++ + 2 files changed, 12 insertions(+), 3 deletions(-) + create mode 100644 gcc/testsuite/g++.dg/ext/int128-7.C + +diff --git a/gcc/cp/decl.cc b/gcc/cp/decl.cc +index ab9a009fb7e..cbb9f20b7a4 100644 +--- a/gcc/cp/decl.cc ++++ b/gcc/cp/decl.cc +@@ -12300,10 +12300,15 @@ grokdeclarator (const cp_declarator *declarator, + { + if (typedef_decl) + { +- pedwarn (loc, OPT_Wpedantic, "%qs specified with %qT", +- key, type); ++ pedwarn (loc, OPT_Wpedantic, "%qs specified with %qD", ++ key, typedef_decl); + ok = !flag_pedantic_errors; +- type = DECL_ORIGINAL_TYPE (typedef_decl); ++ if (is_typedef_decl (typedef_decl)) ++ type = DECL_ORIGINAL_TYPE (typedef_decl); ++ else ++ /* PR108099: __int128_t comes from c_common_nodes_and_builtins, ++ and is not built as a typedef. */ ++ type = TREE_TYPE (typedef_decl); + typedef_decl = NULL_TREE; + } + else if (declspecs->decltype_p) +diff --git a/gcc/testsuite/g++.dg/ext/int128-7.C b/gcc/testsuite/g++.dg/ext/int128-7.C +new file mode 100644 +index 00000000000..bf5e8c40a4b +--- /dev/null ++++ b/gcc/testsuite/g++.dg/ext/int128-7.C +@@ -0,0 +1,4 @@ ++// PR c++/108099 ++// { dg-do compile { target { c++11 && int128 } } } ++ ++using i128 = signed __int128_t; // { dg-error "specified with" } +-- +2.31.1 + diff --git a/INTEL-0049-c-fix-unsigned-__int128_t-semantics-PR108099.patch b/INTEL-0049-c-fix-unsigned-__int128_t-semantics-PR108099.patch new file mode 100644 index 0000000..332084c --- /dev/null +++ b/INTEL-0049-c-fix-unsigned-__int128_t-semantics-PR108099.patch @@ -0,0 +1,78 @@ +From 1b6cbcd4ee3d8f4cf9b6e589032d08f0b3a3db42 Mon Sep 17 00:00:00 2001 +From: Jason Merrill +Date: Tue, 18 Apr 2023 17:12:17 -0400 +Subject: [PATCH 2/4] c++: fix 'unsigned __int128_t' semantics [PR108099] + +My earlier patch for 108099 made us accept this non-standard pattern but +messed up the semantics, so that e.g. unsigned __int128_t was not a 128-bit +type. + + PR c++/108099 + +gcc/cp/ChangeLog: + + * decl.cc (grokdeclarator): Keep typedef_decl for __int128_t. + +gcc/testsuite/ChangeLog: + + * g++.dg/ext/int128-8.C: New test. +--- + gcc/cp/decl.cc | 6 ++++-- + gcc/testsuite/g++.dg/ext/int128-8.C | 24 ++++++++++++++++++++++++ + 2 files changed, 28 insertions(+), 2 deletions(-) + create mode 100644 gcc/testsuite/g++.dg/ext/int128-8.C + +diff --git a/gcc/cp/decl.cc b/gcc/cp/decl.cc +index cbb9f20b7a4..f514377dd42 100644 +--- a/gcc/cp/decl.cc ++++ b/gcc/cp/decl.cc +@@ -12304,12 +12304,14 @@ grokdeclarator (const cp_declarator *declarator, + key, typedef_decl); + ok = !flag_pedantic_errors; + if (is_typedef_decl (typedef_decl)) +- type = DECL_ORIGINAL_TYPE (typedef_decl); ++ { ++ type = DECL_ORIGINAL_TYPE (typedef_decl); ++ typedef_decl = NULL_TREE; ++ } + else + /* PR108099: __int128_t comes from c_common_nodes_and_builtins, + and is not built as a typedef. */ + type = TREE_TYPE (typedef_decl); +- typedef_decl = NULL_TREE; + } + else if (declspecs->decltype_p) + error_at (loc, "%qs specified with %", key); +diff --git a/gcc/testsuite/g++.dg/ext/int128-8.C b/gcc/testsuite/g++.dg/ext/int128-8.C +new file mode 100644 +index 00000000000..14bbc49f5c3 +--- /dev/null ++++ b/gcc/testsuite/g++.dg/ext/int128-8.C +@@ -0,0 +1,24 @@ ++// PR c++/108099 ++// { dg-do compile { target c++11 } } ++// { dg-options "" } ++ ++using u128 = unsigned __int128_t; ++using s128 = signed __int128_t; ++template struct integral_constant { ++ static constexpr T value = v; ++}; ++typedef integral_constant false_type; ++typedef integral_constant true_type; ++template ++struct is_same : false_type {}; ++template ++struct is_same : true_type {}; ++static_assert (is_same <__int128, s128>::value, ""); ++static_assert (is_same ::value, ""); ++static_assert (is_same <__int128_t, s128>::value, ""); ++static_assert (is_same ::value, ""); // { dg-bogus "" "" { xfail *-*-* } } ++static_assert (is_same <__uint128_t, u128>::value, ""); // { dg-bogus "" "" { xfail *-*-* } } ++static_assert (sizeof (s128) == sizeof (__int128), ""); ++static_assert (sizeof (u128) == sizeof (unsigned __int128), ""); ++static_assert (s128(-1) < 0, ""); ++static_assert (u128(-1) > 0, ""); +-- +2.31.1 + diff --git a/INTEL-0050-testsuite-Fix-up-g-.dg-ext-int128-8.C-testcase-PR109.patch b/INTEL-0050-testsuite-Fix-up-g-.dg-ext-int128-8.C-testcase-PR109.patch new file mode 100644 index 0000000..ff393dd --- /dev/null +++ b/INTEL-0050-testsuite-Fix-up-g-.dg-ext-int128-8.C-testcase-PR109.patch @@ -0,0 +1,34 @@ +From 947b0828d5cb48939b044d2ed12679fe2890daba Mon Sep 17 00:00:00 2001 +From: Jakub Jelinek +Date: Thu, 20 Apr 2023 09:43:04 +0200 +Subject: [PATCH 3/4] testsuite: Fix up g++.dg/ext/int128-8.C testcase + [PR109560] + +The testcase needs to be restricted to int128 effective targets, +it expectedly fails on i386 and other 32-bit targets. + +2023-04-20 Jakub Jelinek + + PR c++/108099 + PR testsuite/109560 + * g++.dg/ext/int128-8.C: Require int128 effective target. + +(cherry picked from commit bd4a1a547242a924663712ac7a13799433cdf476) +--- + gcc/testsuite/g++.dg/ext/int128-8.C | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/gcc/testsuite/g++.dg/ext/int128-8.C b/gcc/testsuite/g++.dg/ext/int128-8.C +index 14bbc49f5c3..7e909d50873 100644 +--- a/gcc/testsuite/g++.dg/ext/int128-8.C ++++ b/gcc/testsuite/g++.dg/ext/int128-8.C +@@ -1,5 +1,5 @@ + // PR c++/108099 +-// { dg-do compile { target c++11 } } ++// { dg-do compile { target { c++11 && int128 } } } + // { dg-options "" } + + using u128 = unsigned __int128_t; +-- +2.31.1 + diff --git a/INTEL-0051-c-fix-unsigned-typedef-name-extension-PR108099.patch b/INTEL-0051-c-fix-unsigned-typedef-name-extension-PR108099.patch new file mode 100644 index 0000000..0a48f1d --- /dev/null +++ b/INTEL-0051-c-fix-unsigned-typedef-name-extension-PR108099.patch @@ -0,0 +1,172 @@ +From bd9fecc1fb91f9cfab1d7d5869fab9e9a828c6bd Mon Sep 17 00:00:00 2001 +From: Jason Merrill +Date: Tue, 18 Apr 2023 21:32:07 -0400 +Subject: [PATCH 4/4] c++: fix 'unsigned typedef-name' extension [PR108099] + +In the comments for PR108099 Jakub provided some testcases that demonstrated +that even before the regression noted in the patch we were getting the +semantics of this extension wrong: in the unsigned case we weren't producing +the corresponding standard unsigned type but another distinct one of the +same size, and in the signed case we were just dropping it on the floor and +not actually returning a signed type at all. + +The former issue is fixed by using c_common_signed_or_unsigned_type instead +of unsigned_type_for, and the latter issue by adding a (signed_p && +typedef_decl) case. + +This patch introduces a failure on std/ranges/iota/max_size_type.cc due to +the latter issue, since the testcase expects 'signed rep_t' to do something +sensible, and previously we didn't. Now that we do, it exposes a bug in the +__max_diff_type::operator>>= handling of sign extension: when we evaluate +-1000 >> 2 in __max_diff_type we keep the MSB set, but leave the +second-most-significant bit cleared. + + PR c++/108099 + +gcc/cp/ChangeLog: + + * decl.cc (grokdeclarator): Don't clear typedef_decl after 'unsigned + typedef' pedwarn. Use c_common_signed_or_unsigned_type. Also + handle 'signed typedef'. + +gcc/testsuite/ChangeLog: + + * g++.dg/ext/int128-7.C: New test. + * g++.dg/ext/int128-8.C: New test. + * g++.dg/ext/unsigned-typedef2.C: New test. + * g++.dg/ext/unsigned-typedef3.C: New test. +--- + gcc/cp/decl.cc | 18 +++++++------- + gcc/testsuite/g++.dg/ext/int128-8.C | 4 ++-- + gcc/testsuite/g++.dg/ext/unsigned-typedef2.C | 25 ++++++++++++++++++++ + gcc/testsuite/g++.dg/ext/unsigned-typedef3.C | 25 ++++++++++++++++++++ + 4 files changed, 60 insertions(+), 12 deletions(-) + create mode 100644 gcc/testsuite/g++.dg/ext/unsigned-typedef2.C + create mode 100644 gcc/testsuite/g++.dg/ext/unsigned-typedef3.C + +diff --git a/gcc/cp/decl.cc b/gcc/cp/decl.cc +index f514377dd42..5876394247a 100644 +--- a/gcc/cp/decl.cc ++++ b/gcc/cp/decl.cc +@@ -12300,18 +12300,14 @@ grokdeclarator (const cp_declarator *declarator, + { + if (typedef_decl) + { +- pedwarn (loc, OPT_Wpedantic, "%qs specified with %qD", ++ pedwarn (loc, OPT_Wpedantic, ++ "%qs specified with typedef-name %qD", + key, typedef_decl); + ok = !flag_pedantic_errors; ++ /* PR108099: __int128_t comes from c_common_nodes_and_builtins, ++ and is not built as a typedef. */ + if (is_typedef_decl (typedef_decl)) +- { +- type = DECL_ORIGINAL_TYPE (typedef_decl); +- typedef_decl = NULL_TREE; +- } +- else +- /* PR108099: __int128_t comes from c_common_nodes_and_builtins, +- and is not built as a typedef. */ +- type = TREE_TYPE (typedef_decl); ++ type = DECL_ORIGINAL_TYPE (typedef_decl); + } + else if (declspecs->decltype_p) + error_at (loc, "%qs specified with %", key); +@@ -12364,7 +12360,7 @@ grokdeclarator (const cp_declarator *declarator, + else if (type == char_type_node) + type = unsigned_char_type_node; + else if (typedef_decl) +- type = unsigned_type_for (type); ++ type = c_common_unsigned_type (type); + else + type = unsigned_type_node; + } +@@ -12378,6 +12374,8 @@ grokdeclarator (const cp_declarator *declarator, + type = long_integer_type_node; + else if (short_p) + type = short_integer_type_node; ++ else if (signed_p && typedef_decl) ++ type = c_common_signed_type (type); + + if (decl_spec_seq_has_spec_p (declspecs, ds_complex)) + { +diff --git a/gcc/testsuite/g++.dg/ext/int128-8.C b/gcc/testsuite/g++.dg/ext/int128-8.C +index 7e909d50873..07535a9820e 100644 +--- a/gcc/testsuite/g++.dg/ext/int128-8.C ++++ b/gcc/testsuite/g++.dg/ext/int128-8.C +@@ -16,8 +16,8 @@ struct is_same : true_type {}; + static_assert (is_same <__int128, s128>::value, ""); + static_assert (is_same ::value, ""); + static_assert (is_same <__int128_t, s128>::value, ""); +-static_assert (is_same ::value, ""); // { dg-bogus "" "" { xfail *-*-* } } +-static_assert (is_same <__uint128_t, u128>::value, ""); // { dg-bogus "" "" { xfail *-*-* } } ++static_assert (is_same ::value, ""); ++static_assert (is_same <__uint128_t, u128>::value, ""); + static_assert (sizeof (s128) == sizeof (__int128), ""); + static_assert (sizeof (u128) == sizeof (unsigned __int128), ""); + static_assert (s128(-1) < 0, ""); +diff --git a/gcc/testsuite/g++.dg/ext/unsigned-typedef2.C b/gcc/testsuite/g++.dg/ext/unsigned-typedef2.C +new file mode 100644 +index 00000000000..936c0ccb748 +--- /dev/null ++++ b/gcc/testsuite/g++.dg/ext/unsigned-typedef2.C +@@ -0,0 +1,25 @@ ++// PR c++/108099 ++// { dg-do compile { target c++11 } } ++// { dg-options "" } ++ ++typedef long long t64; ++template struct integral_constant { ++ static constexpr T value = v; ++}; ++typedef integral_constant false_type; ++typedef integral_constant true_type; ++template ++struct is_same : false_type {}; ++template ++struct is_same : true_type {}; ++ ++using s64 = signed t64; ++static_assert (is_same ::value, ""); ++static_assert (is_same ::value, ""); ++static_assert (sizeof (s64) == sizeof (long long), ""); ++static_assert (s64(-1) < 0, ""); ++ ++using u64 = unsigned t64; ++static_assert (is_same ::value, ""); ++static_assert (sizeof (u64) == sizeof (unsigned long long), ""); ++static_assert (u64(-1) > 0, ""); +diff --git a/gcc/testsuite/g++.dg/ext/unsigned-typedef3.C b/gcc/testsuite/g++.dg/ext/unsigned-typedef3.C +new file mode 100644 +index 00000000000..bb99ca0ccc9 +--- /dev/null ++++ b/gcc/testsuite/g++.dg/ext/unsigned-typedef3.C +@@ -0,0 +1,25 @@ ++// PR c++/108099 ++// { dg-do compile { target c++11 } } ++// { dg-options "" } ++ ++typedef unsigned long long t64; ++template struct integral_constant { ++ static constexpr T value = v; ++}; ++typedef integral_constant false_type; ++typedef integral_constant true_type; ++template ++struct is_same : false_type {}; ++template ++struct is_same : true_type {}; ++ ++using s64 = signed t64; ++static_assert (is_same ::value, ""); ++static_assert (is_same ::value, ""); ++static_assert (sizeof (s64) == sizeof (long long), ""); ++static_assert (s64(-1) < 0, ""); ++ ++using u64 = unsigned t64; ++static_assert (is_same ::value, ""); ++static_assert (sizeof (u64) == sizeof (unsigned long long), ""); ++static_assert (u64(-1) > 0, ""); +-- +2.31.1 + diff --git a/gcc.spec b/gcc.spec index d5e31ba..08c0bdb 100644 --- a/gcc.spec +++ b/gcc.spec @@ -1,4 +1,4 @@ -%define anolis_release 15 +%define anolis_release 16 %global DATE 20221121 %global gitrev b3f5a0d53b84ed27cf00cfa2b9c3e2c78935c07d @@ -518,6 +518,27 @@ Patch6027: INTEL-0027-i386-Remove-Meteorlake-s-family_model.patch Patch6028: INTEL-0028-x86-Update-model-values-for-Alderlake-Rocketlake-and.patch Patch6029: INTEL-0029-x86-Update-model-values-for-Raptorlake.patch Patch6030: INTEL-0030-Fix-target_clone-arch-graniterapids-d.patch +Patch6031: INTEL-0031-i386-Change-prefetchi-output-template.patch +Patch6032: INTEL-0032-i386-Add-non-optimize-prefetchi-intrins.patch +Patch6033: INTEL-0033-i386-Fix-AVX512-intrin-macro-typo.patch +Patch6034: INTEL-0034-i386-Use-_mm_setzero_ps-d-instead-of-_mm_avx512_setz.patch +Patch6035: INTEL-0035-Refine-constraint-Bk-to-define_special_memory_constr.patch +Patch6036: INTEL-0036-Align-ix86_-move_max-store_max-with-vectorizer.patch +Patch6037: INTEL-0037-Fix-testcase-failure.patch +Patch6038: INTEL-0038-Check-avx-upper-register-for-parallel.patch +Patch6039: INTEL-0039-i386-Fix-vfpclassph-non-optimizied-intrin.patch +Patch6040: INTEL-0040-doc-Add-more-alias-option-and-reorder-Intel-CPU-marc.patch +Patch6041: INTEL-0041-Refine-splitters-related-to-combine-vpcmpuw-zero_ext.patch +Patch6042: INTEL-0042-Fix-ICE-due-to-isa-mismatch-for-the-builtins.patch +Patch6043: INTEL-0043-Fix-ICE-due-to-subreg-us_truncate.patch +Patch6044: INTEL-0044-i386-Zero-extend-32-bit-address-to-64-bit-with-optio.patch +Patch6045: INTEL-0045-Fix-uninitialized-operands-2-in-vec_unpacks_hi_v4sf.patch +Patch6046: INTEL-0046-GCC13-GCC12-Fix-testcase.patch +Patch6047: INTEL-0047-i386-Remove-CLDEMOTE-for-clients.patch +Patch6048: INTEL-0048-c-signed-__int128_t-PR108099.patch +Patch6049: INTEL-0049-c-fix-unsigned-__int128_t-semantics-PR108099.patch +Patch6050: INTEL-0050-testsuite-Fix-up-g-.dg-ext-int128-8.C-testcase-PR109.patch +Patch6051: INTEL-0051-c-fix-unsigned-typedef-name-extension-PR108099.patch # Part 7001 ~ 7999 Patch7001: ZHAOXIN-0001-Backport-zhaoxin-lujiazui-yongfeng-shijidadao-enable.patch @@ -1421,6 +1442,27 @@ The %{name}-doc package contains documentation files for %{name}. %patch6028 -p1 %patch6029 -p1 %patch6030 -p1 +%patch6031 -p1 +%patch6032 -p1 +%patch6033 -p1 +%patch6034 -p1 +%patch6035 -p1 +%patch6036 -p1 +%patch6037 -p1 +%patch6038 -p1 +%patch6039 -p1 +%patch6040 -p1 +%patch6041 -p1 +%patch6042 -p1 +%patch6043 -p1 +%patch6044 -p1 +%patch6045 -p1 +%patch6046 -p1 +%patch6047 -p1 +%patch6048 -p1 +%patch6049 -p1 +%patch6050 -p1 +%patch6051 -p1 %endif %ifarch x86_64 %patch7001 -p1 @@ -2912,6 +2954,10 @@ end %changelog +* Fri Oct 10 2025 Hu Lin 12.3.0-16 +- Intel: Backport some patches from GCC-12, include some + optimization patches and some fixed patches. + * Tue Sep 12 2025 timhu_806d - 12.3.0-15 - Backport Zhaoxin lujiazui,yongfeng,shijidadao enablements from community's trunk -- Gitee