From 144764ad771fb050ae9ad2304ddcd560d80090b8 Mon Sep 17 00:00:00 2001 From: shupiaoyang Date: Tue, 19 Aug 2025 16:14:23 +0800 Subject: [PATCH 1/4] Add ARM64 NEON instructions and intrinsics for hashmap bitset operations --- src/cmd/compile/internal/arm64/ssa.go | 27 ++++++ src/cmd/compile/internal/ssa/_gen/ARM64Ops.go | 6 ++ src/cmd/compile/internal/ssa/opGen.go | 59 ++++++++++++ src/cmd/compile/internal/ssagen/intrinsics.go | 92 +++++++++++++++++++ src/cmd/internal/obj/arm64/asm7.go | 5 +- 5 files changed, 188 insertions(+), 1 deletion(-) diff --git a/src/cmd/compile/internal/arm64/ssa.go b/src/cmd/compile/internal/arm64/ssa.go index adcabb1b..995b12c2 100644 --- a/src/cmd/compile/internal/arm64/ssa.go +++ b/src/cmd/compile/internal/arm64/ssa.go @@ -1255,6 +1255,33 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) { p.From.Offset = int64(x) p.To.Type = obj.TYPE_REG p.To.Reg = v.Reg() + case ssa.OpARM64VDUP16B: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[0].Reg() + p.To.Type = obj.TYPE_REG + p.To.Reg = (v.Reg()-arm64.REG_F0)&31 + arm64.REG_ARNG + (arm64.ARNG_16B&15)<<5 + case ssa.OpARM64VMOVXD: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = v.Args[0].Reg() + p.To.Type = obj.TYPE_REG + p.To.Reg = (v.Reg()-arm64.REG_F0)&31 + arm64.REG_ELEM + (arm64.ARNG_D&15)<<5 + p.To.Index = int16(v.AuxInt) + case ssa.OpARM64VMOVDX: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = (v.Args[0].Reg()-arm64.REG_F0)&31 + arm64.REG_ELEM + (arm64.ARNG_D&15)<<5 + p.From.Index = int16(v.AuxInt) + p.To.Type = obj.TYPE_REG + p.To.Reg = v.Reg() + case ssa.OpARM64VCMEQ16B: + p := s.Prog(v.Op.Asm()) + p.From.Type = obj.TYPE_REG + p.From.Reg = (v.Args[0].Reg()-arm64.REG_F0)&31 + arm64.REG_ARNG + (arm64.ARNG_16B&15)<<5 + p.Reg = (v.Args[1].Reg()-arm64.REG_F0)&31 + arm64.REG_ARNG + (arm64.ARNG_16B&15)<<5 + p.To.Type = obj.TYPE_REG + p.To.Reg = (v.Reg()-arm64.REG_F0)&31 + arm64.REG_ARNG + (arm64.ARNG_16B&15)<<5 default: 
v.Fatalf("genValue not implemented: %s", v.LongString()) } diff --git a/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go b/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go index c9cb62cd..7aae75a6 100644 --- a/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go +++ b/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go @@ -749,6 +749,12 @@ func init() { {name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r1, r2}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go). {name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r0, r1}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go). + // NEON + {name: "VDUP16B", argLength: 1, reg: gpfp, asm: "VDUP"}, // vdup Rn, Vd.<16B> + {name: "VMOVXD", argLength: 1, reg: gpfp, asm: "VMOV", aux: "Int64"}, // vmov X, V.<2D>[index] + {name: "VMOVDX", argLength: 1, reg: fpgp, asm: "VMOV", aux: "Int64"}, // vmov V.<2D>[index], X + {name: "VCMEQ16B", argLength: 2, reg: fp21, asm: "VCMEQ"}, // vcmeq Vm.<16B>, Vn.<16B>, Vd.<16B> + // Prefetch instruction // Do prefetch arg0 address with option aux. arg0=addr, arg1=memory, aux=option. 
{name: "PRFM", argLength: 2, aux: "Int64", reg: prefreg, asm: "PRFM", hasSideEffects: true}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index df1ddfa6..4cc3ff32 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1760,6 +1760,10 @@ const ( OpARM64LoweredPanicBoundsA OpARM64LoweredPanicBoundsB OpARM64LoweredPanicBoundsC + OpARM64VDUP16B + OpARM64VMOVXD + OpARM64VMOVDX + OpARM64VCMEQ16B OpARM64PRFM OpARM64DMB @@ -23650,6 +23654,61 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VDUP16B", + argLen: 1, + asm: arm64.AVDUP, + reg: regInfo{ + inputs: []inputInfo{ + {0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30 + }, + outputs: []outputInfo{ + {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + }, + }, + { + name: "VMOVXD", + auxType: auxInt64, + argLen: 1, + asm: arm64.AVMOV, + reg: regInfo{ + inputs: []inputInfo{ + {0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30 + }, + outputs: []outputInfo{ + {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + }, + }, + { + name: "VMOVDX", + auxType: auxInt64, + argLen: 1, + asm: arm64.AVMOV, + reg: regInfo{ + inputs: []inputInfo{ + {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + outputs: []outputInfo{ + {0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30 + }, + }, + }, + { + name: "VCMEQ16B", + argLen: 2, + asm: arm64.AVCMEQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 
F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + outputs: []outputInfo{ + {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + }, + }, { name: "PRFM", auxType: auxInt64, diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go index e4da86db..446f28f9 100644 --- a/src/cmd/compile/internal/ssagen/intrinsics.go +++ b/src/cmd/compile/internal/ssagen/intrinsics.go @@ -1297,6 +1297,24 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { }, sys.AMD64) + addF("internal/runtime/maps", "bitsetRemoveBelow", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + b := args[0] + i := args[1] + + // Clear the lower i bytes in b. + // + // out = b & (-1 << (i << 3)) + + mone := s.constInt64(types.Types[types.TUINT64], -1) + + ii := s.newValue1I(ssa.OpARM64SLLconst, types.Types[types.TUINT64], 3, i) + mask := s.newValue2(ssa.OpARM64SLL, types.Types[types.TUINT64], mone, ii) + + return s.newValue2(ssa.OpARM64AND, types.Types[types.TUINT64], b, mask) + }, + sys.ARM64) + addF("internal/runtime/maps", "bitsetLowestSet", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { b := args[0] @@ -1311,6 +1329,20 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { }, sys.AMD64) + addF("internal/runtime/maps", "bitsetLowestSet", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + b := args[0] + + // Test the sign of the lowest byte in b. 
+ // + // out = (b & 0x80) == 0x80 + + and := s.newValue1I(ssa.OpARM64ANDconst, types.Types[types.TUINT64], 0x80, b) + eq := s.newValue1I(ssa.OpARM64CMPconst, types.TypeFlags, 0x80, and) + return s.newValue1(ssa.OpARM64Equal, types.Types[types.TBOOL], eq) + }, + sys.ARM64) + addF("internal/runtime/maps", "bitsetShiftOutLowest", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { b := args[0] @@ -1324,6 +1356,18 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { }, sys.AMD64) + addF("internal/runtime/maps", "bitsetShiftOutLowest", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + b := args[0] + + // Right shift out the lowest bit in b. + // + // out = b >> 8 + + return s.newValue1I(ssa.OpARM64SRLconst, types.Types[types.TUINT64], 8, b) + }, + sys.ARM64) + addF("internal/runtime/maps", "ctrlGroupMatchH2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { g := args[0] @@ -1396,6 +1440,28 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { }, sys.AMD64) + addF("internal/runtime/maps", "ctrlGroupMatchH2", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + g := args[0] + h := args[1] + + // Copy g to gfp.2D[0], broadcast h to hfp. + gfp := s.newValue1I(ssa.OpARM64VMOVXD, types.TypeInt128, 0, g) + hfp := s.newValue1(ssa.OpARM64VDUP16B, types.TypeInt128, h) + + // Compare each byte of the control word with h2. Each + // matching byte has every bit set. 
+ eq := s.newValue2(ssa.OpARM64VCMEQ16B, types.TypeInt128, hfp, gfp) + + // Copy eq.2D[0] to eqgp + eqgp := s.newValue1I(ssa.OpARM64VMOVDX, types.Types[types.TUINT64], 0, eq) + + // Set all bits except sign bits to 0 + mask := uint64(0x8080808080808080) + return s.newValue1I(ssa.OpARM64ANDconst, types.Types[types.TUINT64], int64(mask), eqgp) + }, + sys.ARM64) + addF("internal/runtime/maps", "ctrlGroupMatchEmpty", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { // An empty slot is 1000 0000 @@ -1485,6 +1551,32 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { }, sys.AMD64) + addF("internal/runtime/maps", "ctrlGroupMatchEmpty", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + // An empty slot is 1000 0000 + // A deleted slot is 1111 1110 + // A full slot is 0??? ???? + + mask := uint64(0x8080808080808080) + g := args[0] + e := s.constInt64(types.Types[types.TUINT64], int64(mask)) + + // Copy g to gfp.2D[0], e to efp.2D[0] + gfp := s.newValue1I(ssa.OpARM64VMOVXD, types.TypeInt128, 0, g) + efp := s.newValue1I(ssa.OpARM64VMOVXD, types.TypeInt128, 0, e) + + // Compare each byte of the control word with ctrlEmpty. Each + // matching byte has every bit set. 
+ eq := s.newValue2(ssa.OpARM64VCMEQ16B, types.TypeInt128, efp, gfp) + + // Copy eq.2D[0] to eqgp + eqgp := s.newValue1I(ssa.OpARM64VMOVDX, types.Types[types.TUINT64], 0, eq) + + // Set all bits except sign bits to 0 + return s.newValue1I(ssa.OpARM64ANDconst, types.Types[types.TUINT64], int64(mask), eqgp) + }, + sys.ARM64) + addF("internal/runtime/maps", "ctrlGroupMatchEmptyOrDeleted", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { // An empty slot is 1000 0000 diff --git a/src/cmd/internal/obj/arm64/asm7.go b/src/cmd/internal/obj/arm64/asm7.go index f8d4c7aa..4d536dde 100644 --- a/src/cmd/internal/obj/arm64/asm7.go +++ b/src/cmd/internal/obj/arm64/asm7.go @@ -894,6 +894,8 @@ var optab = []Optab{ {obj.ADUFFCOPY, C_NONE, C_NONE, C_NONE, C_SBRA, C_NONE, 5, 4, 0, 0, 0}, // same as AB/ABL {obj.APCALIGN, C_LCON, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0, 0}, // align code {obj.APCALIGNMAX, C_LCON, C_NONE, C_NONE, C_LCON, C_NONE, 0, 0, 0, 0, 0}, // align code, conditional + + {AVCMEQ, C_ARNG, C_ARNG, C_NONE, C_ARNG, C_NONE, 72, 4, 0, 0, 0}, } // Valid pstate field values, and value to use in instruction. 
@@ -3336,7 +3338,8 @@ func buildop(ctxt *obj.Link) { AVMOVI, APRFM, AVEXT, - AVXAR: + AVXAR, + AVCMEQ: break case obj.ANOP, -- Gitee From e7c95c40f466779efe6d41f9187dd19d9319cc1f Mon Sep 17 00:00:00 2001 From: shupiaoyang Date: Wed, 15 Oct 2025 10:52:07 +0800 Subject: [PATCH 2/4] Add ARM64SwissSIMD experiment flag and conditionally enable SIMD intrinsics for swiss table on ARM64 --- src/cmd/compile/internal/ssagen/intrinsics.go | 184 +++++++++--------- .../goexperiment/arm64swisssimd_off.go | 8 + .../goexperiment/arm64swisssimd_on.go | 8 + src/internal/goexperiment/flags.go | 3 + 4 files changed, 111 insertions(+), 92 deletions(-) create mode 100644 src/internal/goexperiment/arm64swisssimd_off.go create mode 100644 src/internal/goexperiment/arm64swisssimd_on.go diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go index 446f28f9..cd2213c2 100644 --- a/src/cmd/compile/internal/ssagen/intrinsics.go +++ b/src/cmd/compile/internal/ssagen/intrinsics.go @@ -8,6 +8,7 @@ import ( "fmt" "internal/abi" "internal/buildcfg" + "std/internal/goexperiment" "cmd/compile/internal/base" "cmd/compile/internal/ir" @@ -1297,24 +1298,6 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { }, sys.AMD64) - addF("internal/runtime/maps", "bitsetRemoveBelow", - func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { - b := args[0] - i := args[1] - - // Clear the lower i bytes in b. 
- // - // out = b & (-1 << (i << 3)) - - mone := s.constInt64(types.Types[types.TUINT64], -1) - - ii := s.newValue1I(ssa.OpARM64SLLconst, types.Types[types.TUINT64], 3, i) - mask := s.newValue2(ssa.OpARM64SLL, types.Types[types.TUINT64], mone, ii) - - return s.newValue2(ssa.OpARM64AND, types.Types[types.TUINT64], b, mask) - }, - sys.ARM64) - addF("internal/runtime/maps", "bitsetLowestSet", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { b := args[0] @@ -1329,20 +1312,6 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { }, sys.AMD64) - addF("internal/runtime/maps", "bitsetLowestSet", - func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { - b := args[0] - - // Test the sign of the lowest byte in b. - // - // out = (b & 0x80) == 0x80 - - and := s.newValue1I(ssa.OpARM64ANDconst, types.Types[types.TUINT64], 0x80, b) - eq := s.newValue1I(ssa.OpARM64CMPconst, types.TypeFlags, 0x80, and) - return s.newValue1(ssa.OpARM64Equal, types.Types[types.TBOOL], eq) - }, - sys.ARM64) - addF("internal/runtime/maps", "bitsetShiftOutLowest", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { b := args[0] @@ -1356,18 +1325,6 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { }, sys.AMD64) - addF("internal/runtime/maps", "bitsetShiftOutLowest", - func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { - b := args[0] - - // Right shift out the lowest bit in b. - // - // out = b >> 8 - - return s.newValue1I(ssa.OpARM64SRLconst, types.Types[types.TUINT64], 8, b) - }, - sys.ARM64) - addF("internal/runtime/maps", "ctrlGroupMatchH2", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { g := args[0] @@ -1440,28 +1397,6 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { }, sys.AMD64) - addF("internal/runtime/maps", "ctrlGroupMatchH2", - func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { - g := args[0] - h := args[1] - - // Copy g to gfp.2D[0], broadcast h to hfp. 
- gfp := s.newValue1I(ssa.OpARM64VMOVXD, types.TypeInt128, 0, g) - hfp := s.newValue1(ssa.OpARM64VDUP16B, types.TypeInt128, h) - - // Compare each byte of the control word with h2. Each - // matching byte has every bit set. - eq := s.newValue2(ssa.OpARM64VCMEQ16B, types.TypeInt128, hfp, gfp) - - // Copy eq.2D[0] to eqgp - eqgp := s.newValue1I(ssa.OpARM64VMOVDX, types.Types[types.TUINT64], 0, eq) - - // Set all bits except sign bits to 0 - mask := uint64(0x8080808080808080) - return s.newValue1I(ssa.OpARM64ANDconst, types.Types[types.TUINT64], int64(mask), eqgp) - }, - sys.ARM64) - addF("internal/runtime/maps", "ctrlGroupMatchEmpty", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { // An empty slot is 1000 0000 @@ -1551,32 +1486,6 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { }, sys.AMD64) - addF("internal/runtime/maps", "ctrlGroupMatchEmpty", - func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { - // An empty slot is 1000 0000 - // A deleted slot is 1111 1110 - // A full slot is 0??? ???? - - mask := uint64(0x8080808080808080) - g := args[0] - e := s.constInt64(types.Types[types.TUINT64], int64(mask)) - - // Copy g to gfp.2D[0], e to efp.2D[0] - gfp := s.newValue1I(ssa.OpARM64VMOVXD, types.TypeInt128, 0, g) - efp := s.newValue1I(ssa.OpARM64VMOVXD, types.TypeInt128, 0, e) - - // Compare each byte of the control word with ctrlEmpty. Each - // matching byte has every bit set. 
- eq := s.newValue2(ssa.OpARM64VCMEQ16B, types.TypeInt128, efp, gfp) - - // Copy eq.2D[0] to eqgp - eqgp := s.newValue1I(ssa.OpARM64VMOVDX, types.Types[types.TUINT64], 0, eq) - - // Set all bits except sign bits to 0 - return s.newValue1I(ssa.OpARM64ANDconst, types.Types[types.TUINT64], int64(mask), eqgp) - }, - sys.ARM64) - addF("internal/runtime/maps", "ctrlGroupMatchEmptyOrDeleted", func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { // An empty slot is 1000 0000 @@ -1643,6 +1552,97 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { return s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], out) }, sys.AMD64) + + if goexperiment.ARM64SwissSIMD { + addF("internal/runtime/maps", "bitsetRemoveBelow", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + b := args[0] + i := args[1] + + // Clear the lower i bytes in b. + // + // out = b & (-1 << (i << 3)) + + mone := s.constInt64(types.Types[types.TUINT64], -1) + + ii := s.newValue1I(ssa.OpARM64SLLconst, types.Types[types.TUINT64], 3, i) + mask := s.newValue2(ssa.OpARM64SLL, types.Types[types.TUINT64], mone, ii) + + return s.newValue2(ssa.OpARM64AND, types.Types[types.TUINT64], b, mask) + }, + sys.ARM64) + addF("internal/runtime/maps", "bitsetLowestSet", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + b := args[0] + + // Test the sign of the lowest byte in b. + // + // out = (b & 0x80) == 0x80 + + and := s.newValue1I(ssa.OpARM64ANDconst, types.Types[types.TUINT64], 0x80, b) + eq := s.newValue1I(ssa.OpARM64CMPconst, types.TypeFlags, 0x80, and) + return s.newValue1(ssa.OpARM64Equal, types.Types[types.TBOOL], eq) + }, + sys.ARM64) + addF("internal/runtime/maps", "bitsetShiftOutLowest", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + b := args[0] + + // Right shift out the lowest byte in b. 
+ // + // out = b >> 8 + + return s.newValue1I(ssa.OpARM64SRLconst, types.Types[types.TUINT64], 8, b) + }, + sys.ARM64) + addF("internal/runtime/maps", "ctrlGroupMatchH2", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + g := args[0] + h := args[1] + + // Copy g to gfp.2D[0], broadcast h to hfp. + gfp := s.newValue1I(ssa.OpARM64VMOVXD, types.TypeInt128, 0, g) + hfp := s.newValue1(ssa.OpARM64VDUP16B, types.TypeInt128, h) + + // Compare each byte of the control word with h2. Each + // matching byte has every bit set. + eq := s.newValue2(ssa.OpARM64VCMEQ16B, types.TypeInt128, hfp, gfp) + + // Copy eq.2D[0] to eqgp + eqgp := s.newValue1I(ssa.OpARM64VMOVDX, types.Types[types.TUINT64], 0, eq) + + // Set all bits except sign bits to 0 + mask := uint64(0x8080808080808080) + return s.newValue1I(ssa.OpARM64ANDconst, types.Types[types.TUINT64], int64(mask), eqgp) + }, + sys.ARM64) + + addF("internal/runtime/maps", "ctrlGroupMatchEmpty", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + // An empty slot is 1000 0000 + // A deleted slot is 1111 1110 + // A full slot is 0??? ???? + + mask := uint64(0x8080808080808080) + g := args[0] + e := s.constInt64(types.Types[types.TUINT64], int64(mask)) + + // Copy g to gfp.2D[0], e to efp.2D[0] + gfp := s.newValue1I(ssa.OpARM64VMOVXD, types.TypeInt128, 0, g) + efp := s.newValue1I(ssa.OpARM64VMOVXD, types.TypeInt128, 0, e) + + // Compare each byte of the control word with ctrlEmpty. Each + // matching byte has every bit set. 
+ eq := s.newValue2(ssa.OpARM64VCMEQ16B, types.TypeInt128, efp, gfp) + + // Copy eq.2D[0] to eqgp + eqgp := s.newValue1I(ssa.OpARM64VMOVDX, types.Types[types.TUINT64], 0, eq) + + // Set all bits except sign bits to 0 + return s.newValue1I(ssa.OpARM64ANDconst, types.Types[types.TUINT64], int64(mask), eqgp) + }, + sys.ARM64) + } } // findIntrinsic returns a function which builds the SSA equivalent of the diff --git a/src/internal/goexperiment/arm64swisssimd_off.go b/src/internal/goexperiment/arm64swisssimd_off.go new file mode 100644 index 00000000..5f609d6a --- /dev/null +++ b/src/internal/goexperiment/arm64swisssimd_off.go @@ -0,0 +1,8 @@ +// Code generated by mkconsts.go. DO NOT EDIT. + +//go:build !goexperiment.arm64swisssimd + +package goexperiment + +const ARM64SwissSIMD = false +const ARM64SwissSIMDInt = 0 diff --git a/src/internal/goexperiment/arm64swisssimd_on.go b/src/internal/goexperiment/arm64swisssimd_on.go new file mode 100644 index 00000000..526ca2b0 --- /dev/null +++ b/src/internal/goexperiment/arm64swisssimd_on.go @@ -0,0 +1,8 @@ +// Code generated by mkconsts.go. DO NOT EDIT. + +//go:build goexperiment.arm64swisssimd + +package goexperiment + +const ARM64SwissSIMD = true +const ARM64SwissSIMDInt = 1 diff --git a/src/internal/goexperiment/flags.go b/src/internal/goexperiment/flags.go index 948ed5c8..367cf541 100644 --- a/src/internal/goexperiment/flags.go +++ b/src/internal/goexperiment/flags.go @@ -128,4 +128,7 @@ type Flags struct { // Synctest enables the testing/synctest package. Synctest bool + + // ARM64SwissSIMD enables SIMD intrinsics for swiss table on ARM64 platform. 
+ ARM64SwissSIMD bool } -- Gitee From fade4ebba3ab19a723205313091b1a17ecf7df8f Mon Sep 17 00:00:00 2001 From: shupiaoyang Date: Wed, 15 Oct 2025 10:52:45 +0800 Subject: [PATCH 3/4] Fix goexperiment import path in ssagen intrinsics --- src/cmd/compile/internal/ssagen/intrinsics.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go index cd2213c2..b3d91a5e 100644 --- a/src/cmd/compile/internal/ssagen/intrinsics.go +++ b/src/cmd/compile/internal/ssagen/intrinsics.go @@ -8,7 +8,7 @@ import ( "fmt" "internal/abi" "internal/buildcfg" - "std/internal/goexperiment" + "internal/goexperiment" "cmd/compile/internal/base" "cmd/compile/internal/ir" -- Gitee From 7966db9a6a850779bba65a686271f79082140164 Mon Sep 17 00:00:00 2001 From: shupiaoyang Date: Wed, 15 Oct 2025 12:06:44 +0800 Subject: [PATCH 4/4] Update ARM64 intrinsic types to use TUINT64 --- src/cmd/compile/internal/ssagen/intrinsics.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go index b3d91a5e..bd2fa149 100644 --- a/src/cmd/compile/internal/ssagen/intrinsics.go +++ b/src/cmd/compile/internal/ssagen/intrinsics.go @@ -1601,12 +1601,12 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { h := args[1] // Copy g to gfp.2D[0], broadcast h to hfp. - gfp := s.newValue1I(ssa.OpARM64VMOVXD, types.TypeInt128, 0, g) - hfp := s.newValue1(ssa.OpARM64VDUP16B, types.TypeInt128, h) + gfp := s.newValue1I(ssa.OpARM64VMOVXD, types.Types[types.TUINT64], 0, g) + hfp := s.newValue1(ssa.OpARM64VDUP16B, types.Types[types.TUINT64], h) // Compare each byte of the control word with h2. Each // matching byte has every bit set. 
- eq := s.newValue2(ssa.OpARM64VCMEQ16B, types.TypeInt128, hfp, gfp) + eq := s.newValue2(ssa.OpARM64VCMEQ16B, types.Types[types.TUINT64], hfp, gfp) // Copy eq.2D[0] to eqgp eqgp := s.newValue1I(ssa.OpARM64VMOVDX, types.Types[types.TUINT64], 0, eq) @@ -1628,12 +1628,12 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { e := s.constInt64(types.Types[types.TUINT64], int64(mask)) // Copy g to gfp.2D[0], e to efp.2D[0] - gfp := s.newValue1I(ssa.OpARM64VMOVXD, types.TypeInt128, 0, g) - efp := s.newValue1I(ssa.OpARM64VMOVXD, types.TypeInt128, 0, e) + gfp := s.newValue1I(ssa.OpARM64VMOVXD, types.Types[types.TUINT64], 0, g) + efp := s.newValue1I(ssa.OpARM64VMOVXD, types.Types[types.TUINT64], 0, e) // Compare each byte of the control word with ctrlEmpty. Each // matching byte has every bit set. - eq := s.newValue2(ssa.OpARM64VCMEQ16B, types.TypeInt128, efp, gfp) + eq := s.newValue2(ssa.OpARM64VCMEQ16B, types.Types[types.TUINT64], efp, gfp) // Copy eq.2D[0] to eqgp eqgp := s.newValue1I(ssa.OpARM64VMOVDX, types.Types[types.TUINT64], 0, eq) -- Gitee