diff --git a/src/cmd/compile/internal/arm64/ssa.go b/src/cmd/compile/internal/arm64/ssa.go
index adcabb1b954aeb3668d818b9ce3365fdb8a7526d..995b12c27572b5a87528fa046c81cb93c26cecce 100644
--- a/src/cmd/compile/internal/arm64/ssa.go
+++ b/src/cmd/compile/internal/arm64/ssa.go
@@ -1255,6 +1255,33 @@ func ssaGenValue(s *ssagen.State, v *ssa.Value) {
 		p.From.Offset = int64(x)
 		p.To.Type = obj.TYPE_REG
 		p.To.Reg = v.Reg()
+	case ssa.OpARM64VDUP16B: // VDUP Rn, Vd.16B: plain register source, 16B-arranged vector destination
+		p := s.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = v.Args[0].Reg()
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = (v.Reg()-arm64.REG_F0)&31 + arm64.REG_ARNG + (arm64.ARNG_16B&15)<<5 // register number relative to F0, tagged with the 16B arrangement
+	case ssa.OpARM64VMOVXD: // VMOV Rn, Vd.D[AuxInt]: plain register source, D element destination
+		p := s.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = v.Args[0].Reg()
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = (v.Reg()-arm64.REG_F0)&31 + arm64.REG_ELEM + (arm64.ARNG_D&15)<<5 // register number relative to F0, tagged as a D element
+		p.To.Index = int16(v.AuxInt) // element index is carried in AuxInt
+	case ssa.OpARM64VMOVDX: // VMOV Vn.D[AuxInt], Rd: D element source, plain register destination
+		p := s.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = (v.Args[0].Reg()-arm64.REG_F0)&31 + arm64.REG_ELEM + (arm64.ARNG_D&15)<<5 // register number relative to F0, tagged as a D element
+		p.From.Index = int16(v.AuxInt) // element index is carried in AuxInt
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = v.Reg()
+	case ssa.OpARM64VCMEQ16B: // VCMEQ Vm.16B, Vn.16B, Vd.16B: all three operands use the 16B arrangement
+		p := s.Prog(v.Op.Asm())
+		p.From.Type = obj.TYPE_REG
+		p.From.Reg = (v.Args[0].Reg()-arm64.REG_F0)&31 + arm64.REG_ARNG + (arm64.ARNG_16B&15)<<5 // arg0 is the first source operand
+		p.Reg = (v.Args[1].Reg()-arm64.REG_F0)&31 + arm64.REG_ARNG + (arm64.ARNG_16B&15)<<5 // arg1 is the second source operand
+		p.To.Type = obj.TYPE_REG
+		p.To.Reg = (v.Reg()-arm64.REG_F0)&31 + arm64.REG_ARNG + (arm64.ARNG_16B&15)<<5 // result vector register
 	default:
 		v.Fatalf("genValue not implemented: %s", v.LongString())
 	}
diff --git a/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go b/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go
index c9cb62cd17cee2d42ee149c96fa221f03d73af2b..7aae75a615d9e5715e7660628f6a688acb9999fe 100644
--- a/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go
+++ b/src/cmd/compile/internal/ssa/_gen/ARM64Ops.go
@@ -749,6 +749,12 @@ func init() {
 		{name: "LoweredPanicBoundsB", argLength: 3, aux: "Int64", reg: regInfo{inputs:
[]regMask{r1, r2}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go). {name: "LoweredPanicBoundsC", argLength: 3, aux: "Int64", reg: regInfo{inputs: []regMask{r0, r1}}, typ: "Mem", call: true}, // arg0=idx, arg1=len, arg2=mem, returns memory. AuxInt contains report code (see PanicBounds in generic.go). + // NEON (vector) instructions used by the swiss table intrinsics. + {name: "VDUP16B", argLength: 1, reg: gpfp, asm: "VDUP"}, // VDUP Rn, Vd.16B: broadcast arg0 (general register) across the 16B result + {name: "VMOVXD", argLength: 1, reg: gpfp, asm: "VMOV", aux: "Int64"}, // VMOV Rn, Vd.D[auxInt]: move arg0 (general register) into D element auxInt of the result + {name: "VMOVDX", argLength: 1, reg: fpgp, asm: "VMOV", aux: "Int64"}, // VMOV Vn.D[auxInt], Rd: move D element auxInt of arg0 into a general register + {name: "VCMEQ16B", argLength: 2, reg: fp21, asm: "VCMEQ"}, // VCMEQ Vm.16B, Vn.16B, Vd.16B: bytewise compare of arg0 (Vm) and arg1 (Vn); each matching byte has every bit set + // Prefetch instruction // Do prefetch arg0 address with option aux. arg0=addr, arg1=memory, aux=option. {name: "PRFM", argLength: 2, aux: "Int64", reg: prefreg, asm: "PRFM", hasSideEffects: true}, diff --git a/src/cmd/compile/internal/ssa/opGen.go b/src/cmd/compile/internal/ssa/opGen.go index df1ddfa69edfc101c757899c8f03f837bcd34062..4cc3ff32e5ed66fcf4fe50fd1710eb5b18eb4ab1 100644 --- a/src/cmd/compile/internal/ssa/opGen.go +++ b/src/cmd/compile/internal/ssa/opGen.go @@ -1760,6 +1760,10 @@ const ( OpARM64LoweredPanicBoundsA OpARM64LoweredPanicBoundsB OpARM64LoweredPanicBoundsC + OpARM64VDUP16B + OpARM64VMOVXD + OpARM64VMOVDX + OpARM64VCMEQ16B OpARM64PRFM OpARM64DMB @@ -23650,6 +23654,61 @@ var opcodeTable = [...]opInfo{ }, }, }, + { + name: "VDUP16B", + argLen: 1, + asm: arm64.AVDUP, + reg: regInfo{ + inputs: []inputInfo{ + {0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30 + }, + outputs: []outputInfo{ + {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + }, + }, + { + name: "VMOVXD", + auxType: auxInt64, + argLen: 1, + asm: arm64.AVMOV, + 
reg: regInfo{ + inputs: []inputInfo{ + {0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30 + }, + outputs: []outputInfo{ + {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + }, + }, + { + name: "VMOVDX", + auxType: auxInt64, + argLen: 1, + asm: arm64.AVMOV, + reg: regInfo{ + inputs: []inputInfo{ + {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + outputs: []outputInfo{ + {0, 670826495}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 R30 + }, + }, + }, + { + name: "VCMEQ16B", + argLen: 2, + asm: arm64.AVCMEQ, + reg: regInfo{ + inputs: []inputInfo{ + {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + {1, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + outputs: []outputInfo{ + {0, 9223372034707292160}, // F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 F10 F11 F12 F13 F14 F15 F16 F17 F18 F19 F20 F21 F22 F23 F24 F25 F26 F27 F28 F29 F30 F31 + }, + }, + }, { name: "PRFM", auxType: auxInt64, diff --git a/src/cmd/compile/internal/ssagen/intrinsics.go b/src/cmd/compile/internal/ssagen/intrinsics.go index 45a27428b856dfee8bbf0870f527f247a7ec4325..dfa8da3c29138f26b9c24f8fe3e146504048bddf 100644 --- a/src/cmd/compile/internal/ssagen/intrinsics.go +++ b/src/cmd/compile/internal/ssagen/intrinsics.go @@ -8,6 +8,7 @@ import ( "fmt" "internal/abi" "internal/buildcfg" + "internal/goexperiment" "cmd/compile/internal/base" "cmd/compile/internal/ir" @@ -1553,6 +1554,97 @@ func initIntrinsics(cfg *intrinsicBuildConfig) { return s.newValue1(ssa.OpZeroExt8to64, types.Types[types.TUINT64], out) 
}, sys.AMD64) + + if goexperiment.ARM64SwissSIMD { + addF("internal/runtime/maps", "bitsetRemoveBelow", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + b := args[0] + i := args[1] + + // Clear the lowest i bytes of b. + // + // out = b & (-1 << (i << 3)) + // NOTE(review): presumably i < 8 so that the shift count i<<3 stays below 64; confirm against callers. + mone := s.constInt64(types.Types[types.TUINT64], -1) + + ii := s.newValue1I(ssa.OpARM64SLLconst, types.Types[types.TUINT64], 3, i) + mask := s.newValue2(ssa.OpARM64SLL, types.Types[types.TUINT64], mone, ii) + + return s.newValue2(ssa.OpARM64AND, types.Types[types.TUINT64], b, mask) + }, + sys.ARM64) + addF("internal/runtime/maps", "bitsetLowestSet", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + b := args[0] + + // Report whether the sign (top) bit of the lowest byte of b is set. + // + // out = (b & 0x80) == 0x80 + + and := s.newValue1I(ssa.OpARM64ANDconst, types.Types[types.TUINT64], 0x80, b) + eq := s.newValue1I(ssa.OpARM64CMPconst, types.TypeFlags, 0x80, and) + return s.newValue1(ssa.OpARM64Equal, types.Types[types.TBOOL], eq) + }, + sys.ARM64) + addF("internal/runtime/maps", "bitsetShiftOutLowest", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + b := args[0] + + // Shift b right by 8 bits, dropping its lowest byte. + // + // out = b >> 8 + + return s.newValue1I(ssa.OpARM64SRLconst, types.Types[types.TUINT64], 8, b) + }, + sys.ARM64) + addF("internal/runtime/maps", "ctrlGroupMatchH2", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + g := args[0] + h := args[1] + // NOTE(review): gfp, hfp and eq are produced in vector registers but typed uint64; confirm register copies/spills of them are handled. + // Copy g to gfp.2D[0]; broadcast h across all 16 bytes of hfp. + gfp := s.newValue1I(ssa.OpARM64VMOVXD, types.Types[types.TUINT64], 0, g) + hfp := s.newValue1(ssa.OpARM64VDUP16B, types.Types[types.TUINT64], h) + + // Compare each byte of the control word with h2. Each + // matching byte has every bit set. 
+ eq := s.newValue2(ssa.OpARM64VCMEQ16B, types.Types[types.TUINT64], hfp, gfp) + + // Copy eq.2D[0] back to a general register (eqgp). + eqgp := s.newValue1I(ssa.OpARM64VMOVDX, types.Types[types.TUINT64], 0, eq) + + // Keep only the sign (top) bit of each byte; clear every other bit. + mask := uint64(0x8080808080808080) + return s.newValue1I(ssa.OpARM64ANDconst, types.Types[types.TUINT64], int64(mask), eqgp) + }, + sys.ARM64) + + addF("internal/runtime/maps", "ctrlGroupMatchEmpty", + func(s *state, n *ir.CallExpr, args []*ssa.Value) *ssa.Value { + // An empty slot is 1000 0000 + // A deleted slot is 1111 1110 + // A full slot is 0??? ???? + + mask := uint64(0x8080808080808080) + g := args[0] + e := s.constInt64(types.Types[types.TUINT64], int64(mask)) + + // Copy g to gfp.2D[0], e to efp.2D[0] + gfp := s.newValue1I(ssa.OpARM64VMOVXD, types.Types[types.TUINT64], 0, g) + efp := s.newValue1I(ssa.OpARM64VMOVXD, types.Types[types.TUINT64], 0, e) + + // Compare each byte of the control word with the empty pattern (0x80, held in e). Each + // matching byte has every bit set. + eq := s.newValue2(ssa.OpARM64VCMEQ16B, types.Types[types.TUINT64], efp, gfp) + + // Copy eq.2D[0] back to a general register (eqgp). + eqgp := s.newValue1I(ssa.OpARM64VMOVDX, types.Types[types.TUINT64], 0, eq) + + // Keep only the sign (top) bit of each byte; clear every other bit. + return s.newValue1I(ssa.OpARM64ANDconst, types.Types[types.TUINT64], int64(mask), eqgp) + }, + sys.ARM64) + } } // findIntrinsic returns a function which builds the SSA equivalent of the diff --git a/src/cmd/internal/obj/arm64/asm7.go b/src/cmd/internal/obj/arm64/asm7.go index f8d4c7aa98a4be93e99f83208d31f2b27b420976..4d536dde79b5fa53609c86b19bb5243fe511b7a2 100644 --- a/src/cmd/internal/obj/arm64/asm7.go +++ b/src/cmd/internal/obj/arm64/asm7.go @@ -894,6 +894,8 @@ var optab = []Optab{ {obj.ADUFFCOPY, C_NONE, C_NONE, C_NONE, C_SBRA, C_NONE, 5, 4, 0, 0, 0}, // same as AB/ABL {obj.APCALIGN, C_LCON, C_NONE, C_NONE, C_NONE, C_NONE, 0, 0, 0, 0, 0}, // align code {obj.APCALIGNMAX, C_LCON, C_NONE, C_NONE, C_LCON, C_NONE, 0, 0, 0, 0, 0}, // align code, conditional + 
+ {AVCMEQ, C_ARNG, C_ARNG, C_NONE, C_ARNG, C_NONE, 72, 4, 0, 0, 0}, } // Valid pstate field values, and value to use in instruction. @@ -3336,7 +3338,8 @@ func buildop(ctxt *obj.Link) { AVMOVI, APRFM, AVEXT, - AVXAR: + AVXAR, + AVCMEQ: break case obj.ANOP, diff --git a/src/internal/goexperiment/arm64swisssimd_off.go b/src/internal/goexperiment/arm64swisssimd_off.go new file mode 100644 index 0000000000000000000000000000000000000000..5f609d6af7675a22c580e0c4b885e90d85ff7793 --- /dev/null +++ b/src/internal/goexperiment/arm64swisssimd_off.go @@ -0,0 +1,8 @@ +// Code generated by mkconsts.go. DO NOT EDIT. + +//go:build !goexperiment.arm64swisssimd + +package goexperiment + +const ARM64SwissSIMD = false +const ARM64SwissSIMDInt = 0 diff --git a/src/internal/goexperiment/arm64swisssimd_on.go b/src/internal/goexperiment/arm64swisssimd_on.go new file mode 100644 index 0000000000000000000000000000000000000000..526ca2b06e24c1412c67d94a8830136899629cac --- /dev/null +++ b/src/internal/goexperiment/arm64swisssimd_on.go @@ -0,0 +1,8 @@ +// Code generated by mkconsts.go. DO NOT EDIT. + +//go:build goexperiment.arm64swisssimd + +package goexperiment + +const ARM64SwissSIMD = true +const ARM64SwissSIMDInt = 1 diff --git a/src/internal/goexperiment/flags.go b/src/internal/goexperiment/flags.go index ac85fc800092a40a2090e8f719e5c012b54a682f..ece6d9003b5d04fd89c963b19cef1e698aec1361 100644 --- a/src/internal/goexperiment/flags.go +++ b/src/internal/goexperiment/flags.go @@ -131,4 +131,7 @@ type Flags struct { // Kunpeng malloc prefetch optimization. PrefetchMalloc bool + + // ARM64SwissSIMD enables the ARM64 SIMD intrinsics for internal/runtime/maps (swiss table). + ARM64SwissSIMD bool }