From c5970536c2caa3980bb1fded812ac0dc8ebf3681 Mon Sep 17 00:00:00 2001
From: liyunfei <liyunfei33@huawei.com>
Date: Fri, 18 Apr 2025 14:44:24 +0800
Subject: [PATCH] Add hip12 core definition and cost model

This adds a cost model and core definition for hip12.

Signed-off-by: liyunfei <liyunfei33@huawei.com>
---
 gcc/config/aarch64/aarch64-cores.def     |   1 +
 gcc/config/aarch64/aarch64-cost-tables.h | 108 ++++++++++++
 gcc/config/aarch64/aarch64-tune.md       |   2 +-
 gcc/config/aarch64/aarch64.cc            | 209 +++++++++++++++++++++++
 gcc/doc/invoke.texi                      |   3 +-
 5 files changed, 321 insertions(+), 2 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
index 8f62103979d..97d3c5df9cd 100644
--- a/gcc/config/aarch64/aarch64-cores.def
+++ b/gcc/config/aarch64/aarch64-cores.def
@@ -179,4 +179,5 @@ AARCH64_CORE("hip11", hip11, hip11, V8_5A,  (SVE, SVE2, F16), hip11, 0x48, 0xd22
 AARCH64_CORE("demeter", demeter, cortexa57, V9A, (I8MM, BF16, SVE2_BITPERM, RNG, MEMTAG, PROFILE), neoversev2, 0x41, 0xd4f, -1)
 AARCH64_CORE("neoverse-v2", neoversev2, cortexa57, V9A, (I8MM, BF16, SVE2_BITPERM, RNG, MEMTAG, PROFILE), neoversev2, 0x41, 0xd4f, -1)
 
+AARCH64_CORE("hip12", hip12, hip12, V9_2A, (SVE, SVE2, SVE2_BITPERM, SVE2_AES, SVE2_SM4, SVE2_SHA3, F16, RCPC, BF16, DOTPROD, LSE, SIMD, PAUTH, RDMA, LS64), hip12, 0x4e, 0xd06, -1)
 #undef AARCH64_CORE
diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h
index a39ace9ba87..d59ef10bee4 100644
--- a/gcc/config/aarch64/aarch64-cost-tables.h
+++ b/gcc/config/aarch64/aarch64-cost-tables.h
@@ -665,6 +665,114 @@ const struct cpu_cost_table hip11_extra_costs =
   }
 };
 
+const struct cpu_cost_table hip12_extra_costs =
+{
+  /* ALU */
+  {
+    0,                 /* arith.  */
+    0,                 /* logical.  */
+    0,                 /* shift.  */
+    0,                 /* shift_reg.  */
+    0,                 /* arith_shift.  */
+    0,                 /* arith_shift_reg.  */
+    COSTS_N_INSNS (1), /* log_shift.  */
+    COSTS_N_INSNS (1), /* log_shift_reg.  */
+    COSTS_N_INSNS (1), /* extend.  */
+    0,                 /* extend_arith.  */
+    0,                 /* bfi.  */
+    0,                 /* bfx.  */
+    0,                 /* clz.  */
+    0,                 /* rev.  */
+    0,                 /* non_exec.  */
+    true               /* non_exec_costs_exec.  */
+  },
+
+  {
+    /* MULT SImode */
+    {
+      COSTS_N_INSNS (2),       /* simple.  */
+      0,                       /* flag_setting.  */
+      COSTS_N_INSNS (2),       /* extend.  */
+      COSTS_N_INSNS (2),       /* add.  */
+      COSTS_N_INSNS (2),       /* extend_add.  */
+      COSTS_N_INSNS (5)       /* idiv.  */
+    },
+    /* MULT DImode */
+    {
+      COSTS_N_INSNS (3),       /* simple.  */
+      0,                       /* flag_setting (N/A).  */
+      COSTS_N_INSNS (3),       /* extend.  */
+      COSTS_N_INSNS (3),       /* add.  */
+      COSTS_N_INSNS (3),       /* extend_add.  */
+      COSTS_N_INSNS (7)       /* idiv.  */
+    }
+  },
+  /* LD/ST */
+  {
+    COSTS_N_INSNS (3),         /* load.  */
+    COSTS_N_INSNS (4),         /* load_sign_extend.  */
+    COSTS_N_INSNS (3),         /* ldrd.  */
+    COSTS_N_INSNS (3),         /* ldm_1st.  */
+    1,                         /* ldm_regs_per_insn_1st.  */
+    2,                         /* ldm_regs_per_insn_subsequent.  */
+    COSTS_N_INSNS (5),         /* loadf.  */
+    COSTS_N_INSNS (5),         /* loadd.  */
+    COSTS_N_INSNS (4),         /* load_unaligned.  */
+    0,                         /* store.  */
+    0,                         /* strd.  */
+    0,                         /* stm_1st.  */
+    1,                         /* stm_regs_per_insn_1st.  */
+    2,                         /* stm_regs_per_insn_subsequent.  */
+    0,                         /* storef.  */
+    0,                         /* stored.  */
+    COSTS_N_INSNS (1),         /* store_unaligned.  */
+    COSTS_N_INSNS (5),         /* loadv.  */
+    COSTS_N_INSNS (2)          /* storev.  */
+  },
+  {
+    /* FP SFmode */
+    {
+      COSTS_N_INSNS (5),      /* div.  */
+      COSTS_N_INSNS (2),       /* mult.  */
+      COSTS_N_INSNS (4),       /* mult_addsub.  */
+      COSTS_N_INSNS (3),       /* fma.  */
+      COSTS_N_INSNS (1),       /* addsub.  */
+      COSTS_N_INSNS (1),       /* fpconst.  */
+      0,                       /* neg.  */
+      COSTS_N_INSNS (1),       /* compare.  */
+      COSTS_N_INSNS (2),       /* widen.  */
+      COSTS_N_INSNS (2),       /* narrow.  */
+      COSTS_N_INSNS (2),       /* toint.  */
+      COSTS_N_INSNS (3),       /* fromint.  */
+      COSTS_N_INSNS (2)        /* roundint.  */
+    },
+    /* FP DFmode */
+    {
+      COSTS_N_INSNS (7),      /* div.  */
+      COSTS_N_INSNS (2),       /* mult.  */
+      COSTS_N_INSNS (4),       /* mult_addsub.  */
+      COSTS_N_INSNS (3),       /* fma.  */
+      COSTS_N_INSNS (1),       /* addsub.  */
+      COSTS_N_INSNS (1),       /* fpconst.  */
+      0,                       /* neg.  */
+      COSTS_N_INSNS (1),       /* compare.  */
+      COSTS_N_INSNS (2),       /* widen.  */
+      COSTS_N_INSNS (2),       /* narrow.  */
+      COSTS_N_INSNS (2),       /* toint.  */
+      COSTS_N_INSNS (3),       /* fromint.  */
+      COSTS_N_INSNS (2)        /* roundint.  */
+    }
+  },
+  /* Vector */
+  {
+    COSTS_N_INSNS (1),  /* alu.  */
+    COSTS_N_INSNS (2),  /* mult.  */
+    COSTS_N_INSNS (1),  /* movi.  */
+    COSTS_N_INSNS (1),  /* dup.  */
+    COSTS_N_INSNS (1)   /* extract.  */
+  }
+};
+
 const struct cpu_cost_table a64fx_extra_costs =
 {
   /* ALU */
diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md
index 1cfa3559d21..488e39b7cbb 100644
--- a/gcc/config/aarch64/aarch64-tune.md
+++ b/gcc/config/aarch64/aarch64-tune.md
@@ -1,5 +1,5 @@
 ;; -*- buffer-read-only: t -*-
 ;; Generated automatically by gentune.sh from aarch64-cores.def
 (define_attr "tune"
-	"cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,hip09,hip10a,hip10c,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa710,cortexx2,neoversen2,hip11,demeter,neoversev2"
+	"cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,ampere1,ampere1a,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa78,cortexa78ae,cortexa78c,cortexa65,cortexa65ae,cortexx1,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,a64fx,tsv110,hip09,hip10a,hip10c,thunderx3t110,zeus,neoversev1,neoverse512tvb,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55,cortexr82,cortexa510,cortexa710,cortexx2,neoversen2,hip11,demeter,neoversev2,hip12"
 	(const (symbol_ref "((enum attr_tune) aarch64_tune)")))
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index 9f1fbf970ba..c4e2eba0158 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -561,6 +561,24 @@ static const struct cpu_addrcost_table hip11_addrcost_table =
   0, /* imm_offset  */
 };
 
+static const struct cpu_addrcost_table hip12_addrcost_table =
+{
+    {
+      1, /* hi  */
+      0, /* si  */
+      0, /* di  */
+      1, /* ti  */
+    },
+  0, /* pre_modify  */
+  0, /* post_modify  */
+  2, /* post_modify_ld3_st3  */
+  2, /* post_modify_ld4_st4  */
+  0, /* register_offset  */
+  0, /* register_sextend  */
+  0, /* register_zextend  */
+  0, /* imm_offset  */
+};
+
 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
 {
     {
@@ -756,6 +774,16 @@ static const struct cpu_regmove_cost hip11_regmove_cost =
   2  /* FP2FP  */
 };
 
+static const struct cpu_regmove_cost hip12_regmove_cost =
+{
+  1, /* GP2GP  */
+  /* Avoid the use of slow int<->fp moves for spilling by setting
+     their cost higher than memmov_cost.  */
+  6, /* GP2FP  */
+  2, /* FP2GP  */
+  2  /* FP2FP  */
+};
+
 static const struct cpu_regmove_cost a64fx_regmove_cost =
 {
   1, /* GP2GP  */
@@ -1231,6 +1259,143 @@ static const struct cpu_vector_cost hip11_vector_cost =
   nullptr /* issue_info  */
 };
 
+static const advsimd_vec_cost hip12_advsimd_vector_cost =
+{
+  2, /* int_stmt_cost  */
+  2, /* fp_stmt_cost  */
+  2, /* ld2_st2_permute_cost  */
+  2, /* ld3_st3_permute_cost  */
+  3, /* ld4_st4_permute_cost  */
+  2, /* permute_cost  */
+  9, /* reduc_i8_cost  */
+  7, /* reduc_i16_cost  */
+  5, /* reduc_i32_cost  */
+  3, /* reduc_i64_cost  */
+  3, /* reduc_f16_cost  */
+  3, /* reduc_f32_cost  */
+  3, /* reduc_f64_cost  */
+  3, /* store_elt_extra_cost  */
+  2, /* vec_to_scalar_cost  */
+  5, /* scalar_to_vec_cost  */
+  8, /* align_load_cost  */
+  8, /* unalign_load_cost  */
+  1, /* unalign_store_cost  */
+  1  /* store_cost  */
+};
+
+static const sve_vec_cost hip12_sve_vector_cost =
+{
+  {
+    2, /* int_stmt_cost  */
+    2, /* fp_stmt_cost  */
+    2, /* ld2_st2_permute_cost  */
+    3, /* ld3_st3_permute_cost  */
+    3, /* ld4_st4_permute_cost  */
+    2, /* permute_cost  */
+    /* Theoretically, a reduction involving 31 scalar ADDs could
+       complete in ~6 cycles and would have a cost of 31.  [SU]ADDV
+       completes in 13 cycles, so give it a cost of 31 + 7.  */
+    38, /* reduc_i8_cost  */
+    /* Likewise for 15 scalar ADDs (~3 cycles) vs. 10: 15 + 7.  */
+    22, /* reduc_i16_cost  */
+    /* Likewise for 7 scalar ADDs (~2 cycles) vs. 7: 7 + 5.  */
+    12, /* reduc_i32_cost  */
+    /* Likewise for 3 scalar ADDs (~1 cycles) vs. 4: 3 + 3.  */
+    6, /* reduc_i64_cost  */
+    /* Theoretically, a reduction involving 15 scalar FADDs could
+       complete in ~8 cycles and would have a cost of 30.  FADDV
+       completes in 15 cycles, so give it a cost of 30 + 7.  */
+    37, /* reduc_f16_cost  */
+    /* Likewise for 7 scalar FADDs (~4 cycles) vs. 12: 14 + 8.  */
+    22, /* reduc_f32_cost  */
+    /* Likewise for 3 scalar FADDs (~2 cycles) vs. 9: 6 + 7.  */
+    13, /* reduc_f64_cost  */
+    2, /* store_elt_extra_cost  */
+    /* This value is just inherited from the Cortex-A57 table.  */
+    2, /* vec_to_scalar_cost  */
+    /* See the comment above the Advanced SIMD versions.  */
+    5, /* scalar_to_vec_cost  */
+    8, /* align_load_cost  */
+    8, /* unalign_load_cost  */
+    /* Although stores have a latency of 2 and compete for the
+       vector pipes, in practice it's better not to model that.  */
+    1, /* unalign_store_cost  */
+    1  /* store_cost  */
+  },
+  3, /* clast_cost  */
+  42, /* fadda_f16_cost  */
+  26, /* fadda_f32_cost  */
+  20, /* fadda_f64_cost  */
+  6, /* gather_load_x32_cost  */
+  6, /* gather_load_x64_cost  */
+  1 /* scatter_store_elt_cost  */
+};
+
+static const aarch64_scalar_vec_issue_info hip12_scalar_issue_info =
+{
+  5, /* loads_stores_per_cycle  */
+  2, /* stores_per_cycle  */
+  8, /* general_ops_per_cycle  */
+  0, /* fp_simd_load_general_ops  */
+  1 /* fp_simd_store_general_ops  */
+};
+
+static const aarch64_advsimd_vec_issue_info hip12_advsimd_issue_info =
+{
+  {
+    5, /* loads_stores_per_cycle  */
+    2, /* stores_per_cycle  */
+    4, /* general_ops_per_cycle  */
+    0, /* fp_simd_load_general_ops  */
+    1 /* fp_simd_store_general_ops  */
+  },
+  2, /* ld2_st2_general_ops  */
+  2, /* ld3_st3_general_ops  */
+  3 /* ld4_st4_general_ops  */
+};
+
+static const aarch64_sve_vec_issue_info hip12_sve_issue_info =
+{
+  {
+    {
+      5, /* loads_per_cycle  */
+      2, /* stores_per_cycle  */
+      4, /* general_ops_per_cycle  */
+      0, /* fp_simd_load_general_ops  */
+      1 /* fp_simd_store_general_ops  */
+    },
+    2, /* ld2_st2_general_ops  */
+    2, /* ld3_st3_general_ops  */
+    3 /* ld4_st4_general_ops  */
+  },
+  2, /* pred_ops_per_cycle  */
+  1, /* while_pred_ops  */
+  0, /* int_cmp_pred_ops  */
+  0, /* fp_cmp_pred_ops  */
+  1, /* gather_scatter_pair_general_ops  */
+  1 /* gather_scatter_pair_pred_ops  */
+};
+
+static const aarch64_vec_issue_info hip12_vec_issue_info =
+{
+  &hip12_scalar_issue_info,
+  &hip12_advsimd_issue_info,
+  &hip12_sve_issue_info
+};
+
+static const struct cpu_vector_cost hip12_vector_cost =
+{
+  1, /* scalar_int_stmt_cost  */
+  2, /* scalar_fp_stmt_cost  */
+  4, /* scalar_load_cost  */
+  1, /* scalar_store_cost  */
+  1, /* cond_taken_branch_cost  */
+  1, /* cond_not_taken_branch_cost  */
+  &hip12_advsimd_vector_cost, /* advsimd  */
+  &hip12_sve_vector_cost, /* sve  */
+  &hip12_vec_issue_info /* issue_info  */
+};
+
 static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
 {
   2, /* int_stmt_cost  */
@@ -1622,6 +1787,17 @@ static const cpu_prefetch_tune hip11_prefetch_tune =
   -1                    /* default_opt_level  */
 };
 
+static const cpu_prefetch_tune hip12_prefetch_tune =
+{
+  0,                    /* num_slots  */
+  64,                   /* l1_cache_size  */
+  64,                   /* l1_cache_line_size  */
+  512,                  /* l2_cache_size  */
+  true,                 /* prefetch_dynamic_strides */
+  -1,                   /* minimum_stride */
+  -1                    /* default_opt_level  */
+};
+
 static const cpu_prefetch_tune xgene1_prefetch_tune =
 {
   8,			/* num_slots  */
@@ -2121,6 +2297,39 @@ static const struct tune_params hip11_tunings =
   &hip11_prefetch_tune
 };
 
+static const struct tune_params hip12_tunings =
+{
+  &hip12_extra_costs,
+  &hip12_addrcost_table,
+  &hip12_regmove_cost,
+  &hip12_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_256, /* sve_width  */
+  { 4, /* load_int.  */
+    1, /* store_int.  */
+    6, /* load_fp.  */
+    1, /* store_fp.  */
+    6, /* load_pred.  */
+    1 /* store_pred.  */
+  }, /* memmov_cost.  */
+  16,    /* issue_rate  */
+  (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
+   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
+  "16", /* function_align.  */
+  "4",  /* jump_align.  */
+  "8",  /* loop_align.  */
+  4,    /* int_reassoc_width.  */
+  4,    /* fp_reassoc_width.  */
+  4,    /* vec_reassoc_width.  */
+  2,    /* min_div_recip_mul_sf.  */
+  2,    /* min_div_recip_mul_df.  */
+  0,    /* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE),     /* tune_flags.  */
+  &generic_prefetch_tune
+};
+
 static const struct tune_params xgene1_tunings =
 {
   &xgene1_extra_costs,
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 90073ac9832..e985e6c2cf1 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -19221,7 +19221,8 @@ performance of the code.  Permissible values for this option are:
 @samp{octeontx2}, @samp{octeontx2t98}, @samp{octeontx2t96}
 @samp{octeontx2t93}, @samp{octeontx2f95}, @samp{octeontx2f95n},
 @samp{octeontx2f95mm},
-@samp{a64fx},@samp{hip09},@samp{hip10a},@samp{hip10c},@samp{hip11}
+@samp{a64fx},
+@samp{hip09},@samp{hip10a},@samp{hip10c},@samp{hip11},@samp{hip12},
 @samp{thunderx}, @samp{thunderxt88},
 @samp{thunderxt88p1}, @samp{thunderxt81}, @samp{tsv110},
 @samp{thunderxt83}, @samp{thunderx2t99}, @samp{thunderx3t110}, @samp{zeus},
-- 
Gitee