2 Star 2 Fork 8

王布衣/gox

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
克隆/下载
matrix.go 24.55 KB
一键复制 编辑 原始数据 按行查看 历史
王布衣 提交于 2023-06-03 06:07 . 调整vek目录为num
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068
package main
import (
. "github.com/mmcloughlin/avo/build"
. "github.com/mmcloughlin/avo/operand"
. "github.com/mmcloughlin/avo/reg"
)
// genMat4Mul_F64 emits the assembly routine Mat4Mul_AVX2_F64 with the Go
// signature func(x, y, z []float64).
//
// The emitted code reads 16 float64 values from y and 16 from z and writes
// 16 float64 values to x. For each group k = 0..3 it computes
//
//	x[4k:4k+4] = y[4k]*z[0:4] + y[4k+1]*z[4:8] + y[4k+2]*z[8:12] + y[4k+3]*z[12:16]
//
// which is x = y * z when the slices are read as row-major 4x4 matrices.
// No slice lengths are checked by the emitted code.
func genMat4Mul_F64() {
TEXT("Mat4Mul_AVX2_F64", NOSPLIT, "func(x, y, z []float64)")
Pragma("noescape")
// RDI = &x[0], RSI = &y[0], RDX = &z[0].
Load(Param("x").Base(), RDI)
Load(Param("y").Base(), RSI)
Load(Param("z").Base(), RDX)
// Group 0: Y1..Y4 hold the four 32-byte groups of z; each y scalar is
// broadcast and accumulated through a multiply followed by three FMAs.
VBROADCASTSD(Mem{Base: RSI}, Y0)
VMOVUPD(Mem{Base: RDX}, Y1)
VMOVUPD(Mem{Base: RDX}.Offset(32), Y2)
VMOVUPD(Mem{Base: RDX}.Offset(64), Y3)
VMOVUPD(Mem{Base: RDX}.Offset(96), Y4)
VMULPD(Y0, Y1, Y0)
VBROADCASTSD(Mem{Base: RSI}.Offset(8), Y5)
VFMADD213PD(Y0, Y2, Y5)
VBROADCASTSD(Mem{Base: RSI}.Offset(16), Y0)
VFMADD213PD(Y5, Y3, Y0)
VBROADCASTSD(Mem{Base: RSI}.Offset(24), Y5)
VFMADD213PD(Y0, Y4, Y5)
VMOVUPD(Y5, Mem{Base: RDI})
// Group 1: same pattern, but Y1 is reused as the broadcast/accumulator
// register, which overwrites the first group of z held there.
VBROADCASTSD(Mem{Base: RSI}.Offset(32), Y0)
VMULPD(Y0, Y1, Y0)
VBROADCASTSD(Mem{Base: RSI}.Offset(40), Y1)
VFMADD213PD(Y0, Y2, Y1)
VBROADCASTSD(Mem{Base: RSI}.Offset(48), Y0)
VFMADD213PD(Y1, Y3, Y0)
VBROADCASTSD(Mem{Base: RSI}.Offset(56), Y1)
VFMADD213PD(Y0, Y4, Y1)
VMOVUPD(Y1, Mem{Base: RDI}.Offset(32))
// Group 2: z is reloaded into Y1..Y4 because Y1 was clobbered above.
VBROADCASTSD(Mem{Base: RSI}.Offset(64), Y0)
VMOVUPD(Mem{Base: RDX}, Y1)
VMOVUPD(Mem{Base: RDX}.Offset(32), Y2)
VMOVUPD(Mem{Base: RDX}.Offset(64), Y3)
VMOVUPD(Mem{Base: RDX}.Offset(96), Y4)
VMULPD(Y0, Y1, Y0)
VBROADCASTSD(Mem{Base: RSI}.Offset(72), Y5)
VFMADD213PD(Y0, Y2, Y5)
VBROADCASTSD(Mem{Base: RSI}.Offset(80), Y0)
VFMADD213PD(Y5, Y3, Y0)
VBROADCASTSD(Mem{Base: RSI}.Offset(88), Y5)
VFMADD213PD(Y0, Y4, Y5)
VMOVUPD(Y5, Mem{Base: RDI}.Offset(64))
// Group 3: Y1 is again reused as the accumulator for the final result.
VBROADCASTSD(Mem{Base: RSI}.Offset(96), Y0)
VMULPD(Y0, Y1, Y0)
VBROADCASTSD(Mem{Base: RSI}.Offset(104), Y1)
VFMADD213PD(Y0, Y2, Y1)
VBROADCASTSD(Mem{Base: RSI}.Offset(112), Y0)
VFMADD213PD(Y1, Y3, Y0)
VBROADCASTSD(Mem{Base: RSI}.Offset(120), Y1)
VFMADD213PD(Y0, Y4, Y1)
VMOVUPD(Y1, Mem{Base: RDI}.Offset(96))
VZEROUPPER()
RET()
}
// genMat4Mul_F32 emits the assembly routine Mat4Mul_AVX2_F32 with the Go
// signature func(x, y, z []float32).
//
// The emitted code reads 16 float32 values from y and 16 from z and writes
// 16 float32 values to x in two 32-byte stores. Each 256-bit register
// processes two 4-element groups at once: the 16-byte groups of z are
// duplicated into both 128-bit lanes, and the y scalars for two consecutive
// groups are spread across the low and high lanes. Read as row-major 4x4
// matrices this is x = y * z. No slice lengths are checked by the emitted
// code.
func genMat4Mul_F32() {
TEXT("Mat4Mul_AVX2_F32", NOSPLIT, "func(x, y, z []float32)")
Pragma("noescape")
// RDI = &x[0], RSI = &y[0], RDX = &z[0].
Load(Param("x").Base(), RDI)
Load(Param("y").Base(), RSI)
Load(Param("z").Base(), RDX)
// Y0..Y3 = the four 16-byte groups of z, each copied into both lanes.
VBROADCASTF128(Mem{Base: RDX}, Y0)
VBROADCASTF128(Mem{Base: RDX}.Offset(16), Y1)
VBROADCASTF128(Mem{Base: RDX}.Offset(32), Y2)
VBROADCASTF128(Mem{Base: RDX}.Offset(48), Y3)
// First half (y[0..7]): pair y[j] with y[j+4] so the shuffle plus VPERMPD
// leaves y[j] in every element of the low lane and y[j+4] in every element
// of the high lane.
VMOVSS(Mem{Base: RSI}.Offset(16), X4)
VMOVSS(Mem{Base: RSI}, X5)
VSHUFPS(Imm(0), X4, X5, X4)
VMOVSS(Mem{Base: RSI}.Offset(4), X5)
VMOVSS(Mem{Base: RSI}.Offset(8), X6)
VMOVSS(Mem{Base: RSI}.Offset(12), X7)
VPERMPD(Imm(80), Y4, Y4)
VMULPS(Y4, Y0, Y0)
VMOVSS(Mem{Base: RSI}.Offset(20), X4)
VSHUFPS(Imm(0), X4, X5, X4)
VPERMPD(Imm(80), Y4, Y4)
VFMADD213PS(Y0, Y1, Y4)
VMOVSS(Mem{Base: RSI}.Offset(24), X0)
VSHUFPS(Imm(0), X0, X6, X0)
VPERMPD(Imm(80), Y0, Y0)
VFMADD213PS(Y4, Y2, Y0)
VMOVSS(Mem{Base: RSI}.Offset(28), X1)
VSHUFPS(Imm(0), X1, X7, X1)
VPERMPD(Imm(80), Y1, Y1)
VFMADD213PS(Y0, Y3, Y1)
// Reload z: Y0 and Y1 were overwritten as accumulators above. Note the
// register assignment differs from the first half (group 1 -> Y2,
// group 2 -> Y3, group 3 -> Y1).
VBROADCASTF128(Mem{Base: RDX}, Y0)
VBROADCASTF128(Mem{Base: RDX}.Offset(16), Y2)
VBROADCASTF128(Mem{Base: RDX}.Offset(32), Y3)
// Store the first 8 results before Y1 is reused for z.
VMOVUPS(Y1, Mem{Base: RDI})
VBROADCASTF128(Mem{Base: RDX}.Offset(48), Y1)
// Second half (y[8..15]): same pairing scheme, y[j+8] with y[j+12].
VMOVSS(Mem{Base: RSI}.Offset(48), X4)
VMOVSS(Mem{Base: RSI}.Offset(32), X5)
VSHUFPS(Imm(0), X4, X5, X4)
VMOVSS(Mem{Base: RSI}.Offset(36), X5)
VMOVSS(Mem{Base: RSI}.Offset(40), X6)
VMOVSS(Mem{Base: RSI}.Offset(44), X7)
VPERMPD(Imm(80), Y4, Y4)
VMULPS(Y4, Y0, Y0)
VMOVSS(Mem{Base: RSI}.Offset(52), X4)
VSHUFPS(Imm(0), X4, X5, X4)
VPERMPD(Imm(80), Y4, Y4)
VFMADD213PS(Y0, Y2, Y4)
VMOVSS(Mem{Base: RSI}.Offset(56), X0)
VSHUFPS(Imm(0), X0, X6, X0)
VPERMPD(Imm(80), Y0, Y0)
VFMADD213PS(Y4, Y3, Y0)
VMOVSS(Mem{Base: RSI}.Offset(60), X2)
VSHUFPS(Imm(0), X2, X7, X2)
VPERMPD(Imm(80), Y2, Y2)
VFMADD213PS(Y0, Y1, Y2)
VMOVUPS(Y2, Mem{Base: RDI}.Offset(32))
VZEROUPPER()
RET()
}
// genMatMul_F64 emits the assembly routine MatMul_AVX2_F64 with the Go
// signature func(x, y, z []float64, a, b, c int).
//
// As read from the index arithmetic below, the emitted code performs
//
//	x[i*c+j] += y[i*b+k] * z[k*c+j]   for i in [0,a), k in [0,b), j in [0,c)
//
// i.e. it accumulates the product of an a-by-b matrix y and a b-by-c matrix
// z into the a-by-c matrix x (x is not cleared first). The routine returns
// immediately when a, b or c is zero. The j loop handles 16 elements per
// iteration with AVX2 FMAs and finishes the remainder one element at a time.
// No slice lengths are checked by the emitted code.
func genMatMul_F64() {
TEXT("MatMul_AVX2_F64", 0, "func(x, y, z []float64, a, b, c int)")
Pragma("noescape")
// RDI = &x[0], RSI = &y[0], RDX = &z[0], RCX = a, R8 = b, R9 = c.
Load(Param("x").Base(), RDI)
Load(Param("y").Base(), RSI)
Load(Param("z").Base(), RDX)
Load(Param("a"), RCX)
Load(Param("b"), R8)
Load(Param("c"), R9)
// Save the registers that are overwritten below; they are popped in
// reverse order at LBB4_13.
PUSHQ(RBP)
PUSHQ(R15)
PUSHQ(R14)
PUSHQ(R13)
PUSHQ(R12)
PUSHQ(RBX)
// Spill &z[0] and a to slots at negative offsets from RSP.
// NOTE(review): these slots lie below the stack pointer — confirm this is
// safe under the Go runtime's stack handling.
MOVQ(RDX, Mem{Base: RSP}.Offset(-16))
MOVQ(RCX, Mem{Base: RSP}.Offset(-8))
// Nothing to do if any dimension is zero.
TESTQ(RCX, RCX)
JE(LabelRef("LBB4_13"))
TESTQ(R8, R8)
JE(LabelRef("LBB4_13"))
TESTQ(R9, R9)
JE(LabelRef("LBB4_13"))
// R12 = c rounded down to a multiple of 16 (vector loop bound).
MOVQ(R9, R12)
ANDQ(I32(-16), R12)
// RCX = &z[0] + 96 bytes; the vector loop addresses with offsets -96..0.
MOVQ(Mem{Base: RSP}.Offset(-16), RAX)
LEAQ(Mem{Base: RAX}.Offset(96), RCX)
// R11 = c*8, the byte stride of one row of x or z. R15 is zeroed only so
// it can serve as the base of the scaled-index LEAQ.
XORQ(R15, R15)
LEAQ(Mem{Base: R15}.Idx(R9, 8), R11)
// RBX = &x[0] + 96 bytes, R14 = i = 0.
LEAQ(Mem{Base: RDI}.Offset(96), RBX)
XORL(R14L, R14L)
JMP(LabelRef("LBB4_4"))
// End of row i: advance i and both x row pointers; exit when i == a.
Label("LBB4_12")
{
ADDQ(Imm(1), R14)
ADDQ(R11, RBX)
ADDQ(R11, RDI)
CMPQ(R14, Mem{Base: RSP}.Offset(-8))
JE(LabelRef("LBB4_13"))
}
// Start of row i: R15 = i*b, reset the z row pointers (R13, RAX) and k (RBP).
Label("LBB4_4")
{
MOVQ(R14, R15)
IMULQ(R8, R15)
MOVQ(Mem{Base: RSP}.Offset(-16), R13)
MOVQ(RCX, RAX)
XORL(EBP, EBP)
JMP(LabelRef("LBB4_5"))
}
// End of step k: advance k and both z row pointers; when k == b the row is
// finished, otherwise fall through to the next k.
Label("LBB4_11")
{
ADDQ(Imm(1), RBP)
ADDQ(R11, RAX)
ADDQ(R11, R13)
CMPQ(RBP, R8)
JE(LabelRef("LBB4_12"))
}
// X0 = y[i*b + k]. Use the vector loop when c >= 16, otherwise run the
// scalar loop from j = 0.
Label("LBB4_5")
{
LEAQ(Mem{Base: R15}.Idx(RBP, 1), RDX)
VMOVSD(Mem{Base: RSI}.Idx(RDX, 8), X0)
CMPQ(R9, Imm(16))
JAE(LabelRef("LBB4_7"))
XORL(EDX, EDX)
JMP(LabelRef("LBB4_10"))
}
// Broadcast the y scalar to all four lanes of Y1 and start at j = 0 (R10).
Label("LBB4_7")
{
VBROADCASTSD(X0, Y1)
XORL(R10L, R10L)
}
// Vector loop: 16 float64 per iteration. Each FMA computes
// z-vector * Y1 + x-vector, and the result is stored back to x.
Label("LBB4_8")
{
VMOVUPD(Mem{Base: RAX}.Idx(R10, 8).Offset(-96), Y2)
VMOVUPD(Mem{Base: RAX}.Idx(R10, 8).Offset(-64), Y3)
VMOVUPD(Mem{Base: RAX}.Idx(R10, 8).Offset(-32), Y4)
VMOVUPD(Mem{Base: RAX}.Idx(R10, 8), Y5)
VFMADD213PD(Mem{Base: RBX}.Idx(R10, 8).Offset(-96), Y1, Y2)
VFMADD213PD(Mem{Base: RBX}.Idx(R10, 8).Offset(-64), Y1, Y3)
VFMADD213PD(Mem{Base: RBX}.Idx(R10, 8).Offset(-32), Y1, Y4)
VFMADD213PD(Mem{Base: RBX}.Idx(R10, 8), Y1, Y5)
VMOVUPD(Y2, Mem{Base: RBX}.Idx(R10, 8).Offset(-96))
VMOVUPD(Y3, Mem{Base: RBX}.Idx(R10, 8).Offset(-64))
VMOVUPD(Y4, Mem{Base: RBX}.Idx(R10, 8).Offset(-32))
VMOVUPD(Y5, Mem{Base: RBX}.Idx(R10, 8))
ADDQ(Imm(16), R10)
CMPQ(R12, R10)
JNE(LabelRef("LBB4_8"))
// Continue the scalar loop at j = R12 unless c was a multiple of 16.
MOVQ(R12, RDX)
CMPQ(R12, R9)
JE(LabelRef("LBB4_11"))
}
// Scalar tail: x[i][j] = z[k][j]*X0 + x[i][j] for j = RDX .. c-1.
Label("LBB4_10")
{
VMOVSD(Mem{Base: R13}.Idx(RDX, 8), X1)
VFMADD213SD(Mem{Base: RDI}.Idx(RDX, 8), X0, X1)
VMOVSD(X1, Mem{Base: RDI}.Idx(RDX, 8))
ADDQ(Imm(1), RDX)
CMPQ(R9, RDX)
JNE(LabelRef("LBB4_10"))
JMP(LabelRef("LBB4_11"))
}
// Epilogue: restore the saved registers and return.
Label("LBB4_13")
{
POPQ(RBX)
POPQ(R12)
POPQ(R13)
POPQ(R14)
POPQ(R15)
POPQ(RBP)
VZEROUPPER()
RET()
}
}
// genMatMul_F32 emits the assembly routine MatMul_AVX2_F32 with the Go
// signature func(x, y, z []float32, a, b, c int).
//
// As read from the index arithmetic below, the emitted code performs
//
//	x[i*c+j] += y[i*b+k] * z[k*c+j]   for i in [0,a), k in [0,b), j in [0,c)
//
// i.e. it accumulates the product of an a-by-b matrix y and a b-by-c matrix
// z into the a-by-c matrix x (x is not cleared first). The routine returns
// immediately when a, b or c is zero. The j loop handles 32 elements per
// iteration with AVX2 FMAs and finishes the remainder one element at a time.
// No slice lengths are checked by the emitted code.
func genMatMul_F32() {
TEXT("MatMul_AVX2_F32", 0, "func(x, y, z []float32, a, b, c int)")
Pragma("noescape")
// RDI = &x[0], RSI = &y[0], RDX = &z[0], RCX = a, R8 = b, R9 = c.
Load(Param("x").Base(), RDI)
Load(Param("y").Base(), RSI)
Load(Param("z").Base(), RDX)
Load(Param("a"), RCX)
Load(Param("b"), R8)
Load(Param("c"), R9)
// Save the registers that are overwritten below; they are popped in
// reverse order at LBB5_13.
PUSHQ(RBP)
PUSHQ(R15)
PUSHQ(R14)
PUSHQ(R13)
PUSHQ(R12)
PUSHQ(RBX)
// Spill &z[0] and a to slots at negative offsets from RSP.
// NOTE(review): these slots lie below the stack pointer — confirm this is
// safe under the Go runtime's stack handling.
MOVQ(RDX, Mem{Base: RSP}.Offset(-16))
MOVQ(RCX, Mem{Base: RSP}.Offset(-8))
// Nothing to do if any dimension is zero.
TESTQ(RCX, RCX)
JE(LabelRef("LBB5_13"))
TESTQ(R8, R8)
JE(LabelRef("LBB5_13"))
TESTQ(R9, R9)
JE(LabelRef("LBB5_13"))
// R12 = c rounded down to a multiple of 32 (vector loop bound).
MOVQ(R9, R12)
ANDQ(I32(-32), R12)
// RCX = &z[0] + 96 bytes; the vector loop addresses with offsets -96..0.
MOVQ(Mem{Base: RSP}.Offset(-16), RAX)
LEAQ(Mem{Base: RAX}.Offset(96), RCX)
// R11 = c*4, the byte stride of one row of x or z. R15 is zeroed only so
// it can serve as the base of the scaled-index LEAQ.
XORQ(R15, R15)
LEAQ(Mem{Base: R15}.Idx(R9, 4), R11)
// RBX = &x[0] + 96 bytes, R14 = i = 0.
LEAQ(Mem{Base: RDI}.Offset(96), RBX)
XORL(R14L, R14L)
JMP(LabelRef("LBB5_4"))
// End of row i: advance i and both x row pointers; exit when i == a.
Label("LBB5_12")
{
ADDQ(Imm(1), R14)
ADDQ(R11, RBX)
ADDQ(R11, RDI)
CMPQ(R14, Mem{Base: RSP}.Offset(-8))
JE(LabelRef("LBB5_13"))
}
// Start of row i: R15 = i*b, reset the z row pointers (R13, RAX) and k (RBP).
Label("LBB5_4")
{
MOVQ(R14, R15)
IMULQ(R8, R15)
MOVQ(Mem{Base: RSP}.Offset(-16), R13)
MOVQ(RCX, RAX)
XORL(EBP, EBP)
JMP(LabelRef("LBB5_5"))
}
// End of step k: advance k and both z row pointers; when k == b the row is
// finished, otherwise fall through to the next k.
Label("LBB5_11")
{
ADDQ(Imm(1), RBP)
ADDQ(R11, RAX)
ADDQ(R11, R13)
CMPQ(RBP, R8)
JE(LabelRef("LBB5_12"))
}
// X0 = y[i*b + k]. Use the vector loop when c >= 32, otherwise run the
// scalar loop from j = 0.
Label("LBB5_5")
{
LEAQ(Mem{Base: R15}.Idx(RBP, 1), RDX)
VMOVSS(Mem{Base: RSI}.Idx(RDX, 4), X0)
CMPQ(R9, Imm(32))
JAE(LabelRef("LBB5_7"))
XORL(EDX, EDX)
JMP(LabelRef("LBB5_10"))
}
// Broadcast the y scalar to all eight lanes of Y1 and start at j = 0 (R10).
Label("LBB5_7")
{
VBROADCASTSS(X0, Y1)
XORL(R10L, R10L)
}
// Vector loop: 32 float32 per iteration. Each FMA computes
// z-vector * Y1 + x-vector, and the result is stored back to x.
Label("LBB5_8")
{
VMOVUPS(Mem{Base: RAX}.Idx(R10, 4).Offset(-96), Y2)
VMOVUPS(Mem{Base: RAX}.Idx(R10, 4).Offset(-64), Y3)
VMOVUPS(Mem{Base: RAX}.Idx(R10, 4).Offset(-32), Y4)
VMOVUPS(Mem{Base: RAX}.Idx(R10, 4), Y5)
VFMADD213PS(Mem{Base: RBX}.Idx(R10, 4).Offset(-96), Y1, Y2)
VFMADD213PS(Mem{Base: RBX}.Idx(R10, 4).Offset(-64), Y1, Y3)
VFMADD213PS(Mem{Base: RBX}.Idx(R10, 4).Offset(-32), Y1, Y4)
VFMADD213PS(Mem{Base: RBX}.Idx(R10, 4), Y1, Y5)
VMOVUPS(Y2, Mem{Base: RBX}.Idx(R10, 4).Offset(-96))
VMOVUPS(Y3, Mem{Base: RBX}.Idx(R10, 4).Offset(-64))
VMOVUPS(Y4, Mem{Base: RBX}.Idx(R10, 4).Offset(-32))
VMOVUPS(Y5, Mem{Base: RBX}.Idx(R10, 4))
ADDQ(Imm(32), R10)
CMPQ(R12, R10)
JNE(LabelRef("LBB5_8"))
// Continue the scalar loop at j = R12 unless c was a multiple of 32.
MOVQ(R12, RDX)
CMPQ(R12, R9)
JE(LabelRef("LBB5_11"))
}
// Scalar tail: x[i][j] = z[k][j]*X0 + x[i][j] for j = RDX .. c-1.
Label("LBB5_10")
{
VMOVSS(Mem{Base: R13}.Idx(RDX, 4), X1)
VFMADD213SS(Mem{Base: RDI}.Idx(RDX, 4), X0, X1)
VMOVSS(X1, Mem{Base: RDI}.Idx(RDX, 4))
ADDQ(Imm(1), RDX)
CMPQ(R9, RDX)
JNE(LabelRef("LBB5_10"))
JMP(LabelRef("LBB5_11"))
}
// Epilogue: restore the saved registers and return.
Label("LBB5_13")
{
POPQ(RBX)
POPQ(R12)
POPQ(R13)
POPQ(R14)
POPQ(R15)
POPQ(RBP)
VZEROUPPER()
RET()
}
}
// genMatMulVec_F64 emits the assembly routine MatMulVec_AVX2_F64 with the Go
// signature func(x, y, z []float64, a, b int).
//
// As read from the index arithmetic below, the emitted code performs
//
//	x[i] += sum over j in [0,b) of y[i*b+j] * z[j]   for i in [0,a)
//
// i.e. it accumulates the product of an a-by-b matrix y and a length-b
// vector z into x (the existing x[i] is the starting value of each sum).
// The routine returns immediately when a or b is zero. The j loop handles
// 16 elements per iteration with AVX2 FMAs and finishes the remainder one
// element at a time. No slice lengths are checked by the emitted code.
func genMatMulVec_F64() {
TEXT("MatMulVec_AVX2_F64", 0, "func(x, y, z []float64, a, b int)")
Pragma("noescape")
// RDI = &x[0], RSI = &y[0], RDX = &z[0], RCX = a, R8 = b.
Load(Param("x").Base(), RDI)
Load(Param("y").Base(), RSI)
Load(Param("z").Base(), RDX)
Load(Param("a"), RCX)
Load(Param("b"), R8)
// RBX is used as the j index below; it is restored at LBB6_10.
PUSHQ(RBX)
// Nothing to do if either dimension is zero.
TESTQ(RCX, RCX)
JE(LabelRef("LBB6_10"))
TESTQ(R8, R8)
JE(LabelRef("LBB6_10"))
// R9 = b rounded down to a multiple of 16 (vector loop bound).
MOVQ(R8, R9)
ANDQ(I32(-16), R9)
// RAX = &y[0] + 96 bytes; the vector loop addresses y with offsets -96..0.
LEAQ(Mem{Base: RSI}.Offset(96), RAX)
// R10 = b*8, the byte stride of one row of y (R10 is zeroed first so it
// can serve as the LEAQ base). R11 = i = 0.
XORQ(R10, R10)
LEAQ(Mem{Base: R10}.Idx(R8, 8), R10)
XORL(R11L, R11L)
JMP(LabelRef("LBB6_3"))
// End of row i: store the sum to x[i], advance i and the y row pointers;
// exit when i == a.
Label("LBB6_9")
{
VMOVSD(X0, Mem{Base: RDI}.Idx(R11, 8))
ADDQ(Imm(1), R11)
ADDQ(R10, RAX)
ADDQ(R10, RSI)
CMPQ(R11, RCX)
JE(LabelRef("LBB6_10"))
}
// Start of row i: X0 = x[i]. Use the vector loop when b >= 16, otherwise
// run the scalar loop from j = 0.
Label("LBB6_3")
{
VMOVQ(Mem{Base: RDI}.Idx(R11, 8), X0)
CMPQ(R8, Imm(16))
JAE(LabelRef("LBB6_5"))
XORL(EBX, EBX)
JMP(LabelRef("LBB6_8"))
}
// Set up four accumulators: Y0 keeps x[i] in its low element (the
// register-to-register VMOVQ clears everything above it), Y1..Y3 start at
// zero. RBX = j = 0.
Label("LBB6_5")
{
VMOVQ(X0, X0)
VXORPD(X1, X1, X1)
XORL(EBX, EBX)
VXORPD(X2, X2, X2)
VXORPD(X3, X3, X3)
}
// Vector loop: 16 float64 per iteration, accumulator += z-vector * y-vector.
Label("LBB6_6")
{
VMOVUPD(Mem{Base: RDX}.Idx(RBX, 8), Y4)
VMOVUPD(Mem{Base: RDX}.Idx(RBX, 8).Offset(32), Y5)
VMOVUPD(Mem{Base: RDX}.Idx(RBX, 8).Offset(64), Y6)
VMOVUPD(Mem{Base: RDX}.Idx(RBX, 8).Offset(96), Y7)
VFMADD231PD(Mem{Base: RAX}.Idx(RBX, 8).Offset(-96), Y4, Y0)
VFMADD231PD(Mem{Base: RAX}.Idx(RBX, 8).Offset(-64), Y5, Y1)
VFMADD231PD(Mem{Base: RAX}.Idx(RBX, 8).Offset(-32), Y6, Y2)
VFMADD231PD(Mem{Base: RAX}.Idx(RBX, 8), Y7, Y3)
ADDQ(Imm(16), RBX)
CMPQ(R9, RBX)
JNE(LabelRef("LBB6_6"))
// Reduce the four accumulators to a single scalar in the low element
// of X0: add the registers together, then the two 128-bit halves, then
// the two remaining elements.
VADDPD(Y0, Y1, Y0)
VADDPD(Y0, Y2, Y0)
VADDPD(Y0, Y3, Y0)
VEXTRACTF128(Imm(1), Y0, X1)
VADDPD(X1, X0, X0)
VPERMILPD(Imm(1), X0, X1)
VADDSD(X1, X0, X0)
// Continue the scalar loop at j = R9 unless b was a multiple of 16.
MOVQ(R9, RBX)
CMPQ(R9, R8)
JE(LabelRef("LBB6_9"))
}
// Scalar tail: X0 += z[j] * y[i][j] for j = RBX .. b-1.
Label("LBB6_8")
{
VMOVSD(Mem{Base: RDX}.Idx(RBX, 8), X1)
VFMADD231SD(Mem{Base: RSI}.Idx(RBX, 8), X1, X0)
ADDQ(Imm(1), RBX)
CMPQ(R8, RBX)
JNE(LabelRef("LBB6_8"))
JMP(LabelRef("LBB6_9"))
}
// Epilogue: restore RBX and return.
Label("LBB6_10")
{
POPQ(RBX)
VZEROUPPER()
RET()
}
}
// genMatMulVec_F32 emits the assembly routine MatMulVec_AVX2_F32 with the Go
// signature func(x, y, z []float32, a, b int).
//
// As read from the index arithmetic below, the emitted code performs
//
//	x[i] += sum over j in [0,b) of y[i*b+j] * z[j]   for i in [0,a)
//
// i.e. it accumulates the product of an a-by-b matrix y and a length-b
// vector z into x (the existing x[i] is the starting value of each sum).
// The routine returns immediately when a or b is zero. The j loop handles
// 32 elements per iteration with AVX2 FMAs and finishes the remainder one
// element at a time. No slice lengths are checked by the emitted code.
func genMatMulVec_F32() {
TEXT("MatMulVec_AVX2_F32", 0, "func(x, y, z []float32, a, b int)")
Pragma("noescape")
// RDI = &x[0], RSI = &y[0], RDX = &z[0], RCX = a, R8 = b.
Load(Param("x").Base(), RDI)
Load(Param("y").Base(), RSI)
Load(Param("z").Base(), RDX)
Load(Param("a"), RCX)
Load(Param("b"), R8)
// RBX is used as the j index below; it is restored at LBB7_10.
PUSHQ(RBX)
// Nothing to do if either dimension is zero.
TESTQ(RCX, RCX)
JE(LabelRef("LBB7_10"))
TESTQ(R8, R8)
JE(LabelRef("LBB7_10"))
// R9 = b rounded down to a multiple of 32 (vector loop bound).
MOVQ(R8, R9)
ANDQ(I32(-32), R9)
// RAX = &y[0] + 96 bytes; the vector loop addresses y with offsets -96..0.
LEAQ(Mem{Base: RSI}.Offset(96), RAX)
// R10 = b*4, the byte stride of one row of y (R10 is zeroed first so it
// can serve as the LEAQ base). R11 = i = 0.
XORQ(R10, R10)
LEAQ(Mem{Base: R10}.Idx(R8, 4), R10)
XORL(R11L, R11L)
// X0 stays zero for the whole routine; it is the zero source for the
// blend at LBB7_5.
VXORPS(X0, X0, X0)
JMP(LabelRef("LBB7_3"))
// End of row i: store the sum to x[i], advance i and the y row pointers;
// exit when i == a.
Label("LBB7_9")
{
VMOVSS(X1, Mem{Base: RDI}.Idx(R11, 4))
ADDQ(Imm(1), R11)
ADDQ(R10, RAX)
ADDQ(R10, RSI)
CMPQ(R11, RCX)
JE(LabelRef("LBB7_10"))
}
// Start of row i: X1 = x[i]. Use the vector loop when b >= 32, otherwise
// run the scalar loop from j = 0.
Label("LBB7_3")
{
VMOVSS(Mem{Base: RDI}.Idx(R11, 4), X1)
CMPQ(R8, Imm(32))
JAE(LabelRef("LBB7_5"))
XORL(EBX, EBX)
JMP(LabelRef("LBB7_8"))
}
// Set up four accumulators: the blend keeps x[i] in the low element of X1
// and takes the other elements from the zero register X0; Y2..Y4 start at
// zero. RBX = j = 0.
Label("LBB7_5")
{
VBLENDPS(Imm(1), X1, X0, X1)
VXORPS(X2, X2, X2)
XORL(EBX, EBX)
VXORPS(X3, X3, X3)
VXORPS(X4, X4, X4)
}
// Vector loop: 32 float32 per iteration, accumulator += z-vector * y-vector.
Label("LBB7_6")
{
VMOVUPS(Mem{Base: RDX}.Idx(RBX, 4), Y5)
VMOVUPS(Mem{Base: RDX}.Idx(RBX, 4).Offset(32), Y6)
VMOVUPS(Mem{Base: RDX}.Idx(RBX, 4).Offset(64), Y7)
VMOVUPS(Mem{Base: RDX}.Idx(RBX, 4).Offset(96), Y8)
VFMADD231PS(Mem{Base: RAX}.Idx(RBX, 4).Offset(-96), Y5, Y1)
VFMADD231PS(Mem{Base: RAX}.Idx(RBX, 4).Offset(-64), Y6, Y2)
VFMADD231PS(Mem{Base: RAX}.Idx(RBX, 4).Offset(-32), Y7, Y3)
VFMADD231PS(Mem{Base: RAX}.Idx(RBX, 4), Y8, Y4)
ADDQ(Imm(32), RBX)
CMPQ(R9, RBX)
JNE(LabelRef("LBB7_6"))
// Reduce the four accumulators to a single scalar in the low element
// of X1: add the registers together, then the two 128-bit halves, then
// the two 64-bit halves, then the two remaining elements.
VADDPS(Y1, Y2, Y1)
VADDPS(Y1, Y3, Y1)
VADDPS(Y1, Y4, Y1)
VEXTRACTF128(Imm(1), Y1, X2)
VADDPS(X2, X1, X1)
VPERMILPD(Imm(1), X1, X2)
VADDPS(X2, X1, X1)
VMOVSHDUP(X1, X2)
VADDSS(X2, X1, X1)
// Continue the scalar loop at j = R9 unless b was a multiple of 32.
MOVQ(R9, RBX)
CMPQ(R9, R8)
JE(LabelRef("LBB7_9"))
}
// Scalar tail: X1 += z[j] * y[i][j] for j = RBX .. b-1.
Label("LBB7_8")
{
VMOVSS(Mem{Base: RDX}.Idx(RBX, 4), X2)
VFMADD231SS(Mem{Base: RSI}.Idx(RBX, 4), X2, X1)
ADDQ(Imm(1), RBX)
CMPQ(R8, RBX)
JNE(LabelRef("LBB7_8"))
JMP(LabelRef("LBB7_9"))
}
// Epilogue: restore RBX and return.
Label("LBB7_10")
{
POPQ(RBX)
VZEROUPPER()
RET()
}
}
// genMatMulTiled_F64 emits the assembly routine MatMulTiled_AVX2_F64 with the
// Go signature func(x, y, z []float64, a, b, c int).
//
// The innermost kernel is x[i][j] += y[i*b+k] * z[k][j], with rows of x and z
// spaced c*8 bytes apart. The surrounding loops step i in blocks of 8 and
// both j and k in blocks of 256, clamping each block end to a, c and b
// respectively. This appears to be a cache-blocked variant of the plain
// matrix multiply — TODO confirm against the C source this was generated
// from. No slice lengths are checked by the emitted code.
//
// Stack slots, as established by the stores in this function (offsets are
// relative to RSP after the SUBQ below):
//
//	-128: c             -120: j-block counter   -112: x row pointer (per i block)
//	-104: b              -96: x row pointer+96   -88: &z[0]
//	 -80: c*64 bytes     -72: a+7                -64: a
//	 -56: next i start   -48: &z[0]+96           -40: c+255
//	 -32: next j start   -24: running j limit    -16: i block start
//	  -8: z+96+jStart*8    0: x+96+jStart*8        8: b+255
//	  16: next k start    24: i block end         32: z vector pointer for k block
//	  40: z scalar pointer for k block            48: k block start
//	  56: current i       64: vector span (R12 rounded down to 16)
//
// NOTE(review): only 72 bytes are reserved, so every negative-offset slot
// lies below RSP — confirm this is safe under the Go runtime's stack
// handling.
func genMatMulTiled_F64() {
TEXT("MatMulTiled_AVX2_F64", 0, "func(x, y, z []float64, a, b, c int)")
Pragma("noescape")
// RDI = &x[0], RSI = &y[0], RDX = &z[0], RCX = a, R8 = b, R9 = c.
Load(Param("x").Base(), RDI)
Load(Param("y").Base(), RSI)
Load(Param("z").Base(), RDX)
Load(Param("a"), RCX)
Load(Param("b"), R8)
Load(Param("c"), R9)
// Save the registers that are overwritten below; they are popped in
// reverse order at LBB8_21.
PUSHQ(RBP)
PUSHQ(R15)
PUSHQ(R14)
PUSHQ(R13)
PUSHQ(R12)
PUSHQ(RBX)
SUBQ(Imm(72), RSP)
// Spill the arguments.
MOVQ(R9, Mem{Base: RSP}.Offset(-128))
MOVQ(R8, Mem{Base: RSP}.Offset(-104))
MOVQ(RDX, Mem{Base: RSP}.Offset(-88))
MOVQ(RDI, Mem{Base: RSP}.Offset(-112))
MOVQ(RCX, Mem{Base: RSP}.Offset(-64))
// Each JE below tests the zero flag left by the preceding ADDQ (the MOVQ in
// between does not modify flags): exit if a+7, b+255 or c+255 is zero.
ADDQ(Imm(7), RCX)
MOVQ(RCX, Mem{Base: RSP}.Offset(-72))
JE(LabelRef("LBB8_21"))
MOVQ(Mem{Base: RSP}.Offset(-104), RAX)
ADDQ(Imm(255), RAX)
MOVQ(RAX, Mem{Base: RSP}.Offset(8))
JE(LabelRef("LBB8_21"))
MOVQ(Mem{Base: RSP}.Offset(-128), RAX)
ADDQ(Imm(255), RAX)
MOVQ(RAX, Mem{Base: RSP}.Offset(-40))
JE(LabelRef("LBB8_21"))
// Precompute &z[0]+96, the row stride RBX = c*8, &x[0]+96 and the byte
// size of 8 rows (c << 6).
MOVQ(Mem{Base: RSP}.Offset(-88), RAX)
ADDQ(Imm(96), RAX)
MOVQ(RAX, Mem{Base: RSP}.Offset(-48))
MOVQ(Mem{Base: RSP}.Offset(-128), RAX)
XORQ(R15, R15)
LEAQ(Mem{Base: R15}.Idx(RAX, 8), RBX)
MOVQ(Mem{Base: RSP}.Offset(-112), RCX)
ADDQ(Imm(96), RCX)
MOVQ(RCX, Mem{Base: RSP}.Offset(-96))
SHLQ(Imm(6), RAX)
MOVQ(RAX, Mem{Base: RSP}.Offset(-80))
// RDX = start of the first i block.
XORL(EDX, EDX)
JMP(LabelRef("LBB8_4"))
// End of an i block: advance both x row pointers by 8 rows and move to the
// next block start; exit once that start reaches a+7.
Label("LBB8_20")
{
MOVQ(Mem{Base: RSP}.Offset(-80), RAX)
ADDQ(RAX, Mem{Base: RSP}.Offset(-96))
ADDQ(RAX, Mem{Base: RSP}.Offset(-112))
MOVQ(Mem{Base: RSP}.Offset(-56), RAX)
MOVQ(RAX, RDX)
CMPQ(RAX, Mem{Base: RSP}.Offset(-72))
JAE(LabelRef("LBB8_21"))
}
// Start of an i block: block end = min(start+8, a), then sign-extended from
// its low 32 bits by CDQE. Skip the block if it is empty; otherwise reset
// the j-block counter, the running j limit (EDX = 256) and the j start (EAX).
Label("LBB8_4")
{
LEAQ(Mem{Base: RDX}.Offset(8), RAX)
MOVQ(Mem{Base: RSP}.Offset(-64), RCX)
CMPQ(RAX, RCX)
MOVQ(RAX, Mem{Base: RSP}.Offset(-56))
CMOVQGT(RCX, RAX)
CDQE()
MOVQ(RDX, Mem{Base: RSP}.Offset(-16))
MOVQ(RAX, Mem{Base: RSP}.Offset(24))
CMPQ(RDX, RAX)
JAE(LabelRef("LBB8_20"))
XORL(EAX, EAX)
MOVQ(RAX, Mem{Base: RSP}.Offset(-120))
MOVL(I32(256), EDX)
XORL(EAX, EAX)
JMP(LabelRef("LBB8_6"))
}
// End of a j block: bump the block counter and the running limit, reload
// the next j start, and leave the j loop once it reaches c+255.
Label("LBB8_19")
{
MOVQ(Mem{Base: RSP}.Offset(-120), RAX)
ADDL(Imm(1), EAX)
MOVQ(RAX, Mem{Base: RSP}.Offset(-120))
MOVQ(Mem{Base: RSP}.Offset(-24), RDX)
ADDQ(I32(256), RDX)
MOVQ(Mem{Base: RSP}.Offset(-32), RAX)
CMPQ(RAX, Mem{Base: RSP}.Offset(-40))
JAE(LabelRef("LBB8_20"))
}
// Start of a j block: clamp the block end to c, skip empty blocks, then
// derive the per-block pointers and vector-loop bounds.
Label("LBB8_6")
{
MOVL(EAX, EDI)
MOVQ(Mem{Base: RSP}.Offset(-128), RBP)
CMPQ(RBP, RDX)
MOVQ(RDX, Mem{Base: RSP}.Offset(-24))
CMOVQLT(RBP, RDX)
ADDQ(I32(256), RAX)
CMPQ(RBP, RAX)
MOVQ(RAX, RCX)
CMOVQLT(RBP, RCX)
MOVQ(RAX, Mem{Base: RSP}.Offset(-32))
CMOVQLT(RBP, RAX)
CMPL(EDI, EAX)
JGE(LabelRef("LBB8_19"))
// R14 = j block start; slot 0 = x vector pointer for this block.
MOVLQSX(EDI, R14)
MOVQ(Mem{Base: RSP}.Offset(-96), RDI)
LEAQ(Mem{Base: RDI}.Idx(R14, 8), RDI)
MOVQ(RDI, Mem{Base: RSP})
// R11 = (clamped limit - start) rounded down to 16: vector loop bound.
MOVLQSX(EDX, R11)
SUBQ(R14, R11)
ANDQ(I32(-16), R11)
// R12 = clamped block end - (block counter << 8): the block width.
MOVLQSX(ECX, R12)
MOVQ(Mem{Base: RSP}.Offset(-120), RCX)
SHLL(Imm(8), ECX)
MOVLQSX(ECX, RCX)
SUBQ(RCX, R12)
// RDX = j block end, used as the scalar loop bound.
MOVLQSX(EAX, RDX)
MOVQ(R12, RCX)
ANDQ(I32(-16), RCX)
MOVQ(Mem{Base: RSP}.Offset(-48), RAX)
LEAQ(Mem{Base: RAX}.Idx(R14, 8), RAX)
MOVQ(RAX, Mem{Base: RSP}.Offset(-8))
// R13 = first j handled by the scalar loop after the vector loop.
MOVQ(R14, R13)
MOVQ(RCX, Mem{Base: RSP}.Offset(64))
ADDQ(RCX, R13)
// RAX = start of the first k block.
XORL(EAX, EAX)
JMP(LabelRef("LBB8_8"))
}
// End of a k block: reload the next k start and leave the k loop once it
// reaches b+255.
Label("LBB8_18")
{
MOVQ(Mem{Base: RSP}.Offset(16), RAX)
CMPQ(RAX, Mem{Base: RSP}.Offset(8))
JAE(LabelRef("LBB8_19"))
}
// Start of a k block: RAX = min(start+256, b) is the block end; skip empty
// blocks. Then compute the z pointers for row kStart and reload the
// per-row x pointers (RCX, R10) and the first i (R8).
Label("LBB8_8")
{
MOVL(EAX, ECX)
ADDQ(I32(256), RAX)
MOVQ(Mem{Base: RSP}.Offset(-104), RDI)
CMPQ(RAX, RDI)
MOVQ(RAX, Mem{Base: RSP}.Offset(16))
CMOVQGT(RDI, RAX)
CMPL(ECX, EAX)
JGE(LabelRef("LBB8_18"))
MOVLQSX(ECX, RDI)
MOVQ(Mem{Base: RSP}.Offset(-128), RCX)
MOVQ(RDI, Mem{Base: RSP}.Offset(48))
IMULQ(RDI, RCX)
MOVQ(Mem{Base: RSP}.Offset(-88), RDI)
LEAQ(Mem{Base: RDI}.Idx(RCX, 8), RDI)
MOVQ(RDI, Mem{Base: RSP}.Offset(40))
MOVQ(Mem{Base: RSP}.Offset(-8), RDI)
LEAQ(Mem{Base: RDI}.Idx(RCX, 8), RCX)
MOVQ(RCX, Mem{Base: RSP}.Offset(32))
CDQE()
MOVQ(Mem{Base: RSP}.Offset(-112), RCX)
MOVQ(Mem{Base: RSP}, R10)
MOVQ(Mem{Base: RSP}.Offset(-16), R8)
JMP(LabelRef("LBB8_10"))
}
// End of row i: advance i and both x row pointers by one row; leave the
// i loop at the block end.
Label("LBB8_17")
{
MOVQ(Mem{Base: RSP}.Offset(56), R8)
ADDQ(Imm(1), R8)
ADDQ(RBX, R10)
ADDQ(RBX, RCX)
CMPQ(R8, Mem{Base: RSP}.Offset(24))
JAE(LabelRef("LBB8_18"))
}
// Start of row i: remember i, R8 = i*b, and reset the z pointers (R15, RDI)
// and k (R9) to the start of the k block.
Label("LBB8_10")
{
MOVQ(R8, Mem{Base: RSP}.Offset(56))
IMULQ(Mem{Base: RSP}.Offset(-104), R8)
MOVQ(Mem{Base: RSP}.Offset(40), R15)
MOVQ(Mem{Base: RSP}.Offset(32), RDI)
MOVQ(Mem{Base: RSP}.Offset(48), R9)
JMP(LabelRef("LBB8_11"))
}
// End of step k: advance k and both z pointers by one row; leave the k
// loop at the block end (RAX).
Label("LBB8_16")
{
ADDQ(Imm(1), R9)
ADDQ(RBX, RDI)
ADDQ(RBX, R15)
CMPQ(R9, RAX)
JGE(LabelRef("LBB8_17"))
}
// X0 = y[i*b + k]. Blocks narrower than 16 go straight to the scalar loop
// starting at j = R14; otherwise broadcast the scalar and run the vector
// loop with a block-relative index starting at 0.
Label("LBB8_11")
{
LEAQ(Mem{Base: R9}.Idx(R8, 1), RBP)
VMOVSD(Mem{Base: RSI}.Idx(RBP, 8), X0)
MOVQ(R14, RBP)
CMPQ(R12, Imm(16))
JB(LabelRef("LBB8_15"))
VBROADCASTSD(X0, Y1)
XORL(EBP, EBP)
}
// Vector loop: 16 float64 per iteration. Each FMA computes
// z-vector * Y1 + x-vector, and the result is stored back to x.
Label("LBB8_13")
{
VMOVUPD(Mem{Base: RDI}.Idx(RBP, 8).Offset(-96), Y2)
VMOVUPD(Mem{Base: RDI}.Idx(RBP, 8).Offset(-64), Y3)
VMOVUPD(Mem{Base: RDI}.Idx(RBP, 8).Offset(-32), Y4)
VMOVUPD(Mem{Base: RDI}.Idx(RBP, 8), Y5)
VFMADD213PD(Mem{Base: R10}.Idx(RBP, 8).Offset(-96), Y1, Y2)
VFMADD213PD(Mem{Base: R10}.Idx(RBP, 8).Offset(-64), Y1, Y3)
VFMADD213PD(Mem{Base: R10}.Idx(RBP, 8).Offset(-32), Y1, Y4)
VFMADD213PD(Mem{Base: R10}.Idx(RBP, 8), Y1, Y5)
VMOVUPD(Y2, Mem{Base: R10}.Idx(RBP, 8).Offset(-96))
VMOVUPD(Y3, Mem{Base: R10}.Idx(RBP, 8).Offset(-64))
VMOVUPD(Y4, Mem{Base: R10}.Idx(RBP, 8).Offset(-32))
VMOVUPD(Y5, Mem{Base: R10}.Idx(RBP, 8))
ADDQ(Imm(16), RBP)
CMPQ(R11, RBP)
JNE(LabelRef("LBB8_13"))
// Continue the scalar loop at absolute j = R13 unless the block width
// was a multiple of 16.
MOVQ(R13, RBP)
CMPQ(R12, Mem{Base: RSP}.Offset(64))
JE(LabelRef("LBB8_16"))
}
// Scalar tail: x[i][j] = z[k][j]*X0 + x[i][j] while j < RDX (block end).
Label("LBB8_15")
{
VMOVSD(Mem{Base: R15}.Idx(RBP, 8), X1)
VFMADD213SD(Mem{Base: RCX}.Idx(RBP, 8), X0, X1)
VMOVSD(X1, Mem{Base: RCX}.Idx(RBP, 8))
ADDQ(Imm(1), RBP)
CMPQ(RBP, RDX)
JL(LabelRef("LBB8_15"))
JMP(LabelRef("LBB8_16"))
}
// Epilogue: release the 72 reserved bytes, restore the saved registers and
// return.
Label("LBB8_21")
{
ADDQ(Imm(72), RSP)
POPQ(RBX)
POPQ(R12)
POPQ(R13)
POPQ(R14)
POPQ(R15)
POPQ(RBP)
VZEROUPPER()
RET()
}
}
// genMatMulTiled_F32 emits the assembly routine MatMulTiled_AVX2_F32 with the
// Go signature func(x, y, z []float32, a, b, c int).
//
// The innermost kernel is x[i][j] += y[i*b+k] * z[k][j], with rows of x and z
// spaced c*4 bytes apart. The surrounding loops step i in blocks of 8 and
// both j and k in blocks of 256, clamping each block end to a, c and b
// respectively. This appears to be a cache-blocked variant of the plain
// matrix multiply — TODO confirm against the C source this was generated
// from. No slice lengths are checked by the emitted code.
//
// Stack slots, as established by the stores in this function (offsets are
// relative to RSP after the SUBQ below):
//
//	-128: c             -120: j-block counter   -112: x row pointer (per i block)
//	-104: b              -96: x row pointer+96   -88: &z[0]
//	 -80: c*32 bytes     -72: a+7                -64: a
//	 -56: next i start   -48: &z[0]+96           -40: c+255
//	 -32: next j start   -24: running j limit    -16: i block start
//	  -8: z+96+jStart*4    0: x+96+jStart*4        8: b+255
//	  16: next k start    24: i block end         32: z vector pointer for k block
//	  40: z scalar pointer for k block            48: k block start
//	  56: current i       64: vector span (R12 rounded down to 32)
//
// NOTE(review): only 72 bytes are reserved, so every negative-offset slot
// lies below RSP — confirm this is safe under the Go runtime's stack
// handling.
func genMatMulTiled_F32() {
TEXT("MatMulTiled_AVX2_F32", 0, "func(x, y, z []float32, a, b, c int)")
Pragma("noescape")
// RDI = &x[0], RSI = &y[0], RDX = &z[0], RCX = a, R8 = b, R9 = c.
Load(Param("x").Base(), RDI)
Load(Param("y").Base(), RSI)
Load(Param("z").Base(), RDX)
Load(Param("a"), RCX)
Load(Param("b"), R8)
Load(Param("c"), R9)
// Save the registers that are overwritten below; they are popped in
// reverse order at LBB9_21.
PUSHQ(RBP)
PUSHQ(R15)
PUSHQ(R14)
PUSHQ(R13)
PUSHQ(R12)
PUSHQ(RBX)
SUBQ(Imm(72), RSP)
// Spill the arguments.
MOVQ(R9, Mem{Base: RSP}.Offset(-128))
MOVQ(R8, Mem{Base: RSP}.Offset(-104))
MOVQ(RDX, Mem{Base: RSP}.Offset(-88))
MOVQ(RDI, Mem{Base: RSP}.Offset(-112))
MOVQ(RCX, Mem{Base: RSP}.Offset(-64))
// Each JE below tests the zero flag left by the preceding ADDQ (the MOVQ in
// between does not modify flags): exit if a+7, b+255 or c+255 is zero.
ADDQ(Imm(7), RCX)
MOVQ(RCX, Mem{Base: RSP}.Offset(-72))
JE(LabelRef("LBB9_21"))
MOVQ(Mem{Base: RSP}.Offset(-104), RAX)
ADDQ(Imm(255), RAX)
MOVQ(RAX, Mem{Base: RSP}.Offset(8))
JE(LabelRef("LBB9_21"))
MOVQ(Mem{Base: RSP}.Offset(-128), RAX)
ADDQ(Imm(255), RAX)
MOVQ(RAX, Mem{Base: RSP}.Offset(-40))
JE(LabelRef("LBB9_21"))
// Precompute &z[0]+96, the row stride RBX = c*4, &x[0]+96 and the byte
// size of 8 rows (c << 5).
MOVQ(Mem{Base: RSP}.Offset(-88), RAX)
ADDQ(Imm(96), RAX)
MOVQ(RAX, Mem{Base: RSP}.Offset(-48))
MOVQ(Mem{Base: RSP}.Offset(-128), RAX)
XORQ(R15, R15)
LEAQ(Mem{Base: R15}.Idx(RAX, 4), RBX)
MOVQ(Mem{Base: RSP}.Offset(-112), RCX)
ADDQ(Imm(96), RCX)
MOVQ(RCX, Mem{Base: RSP}.Offset(-96))
SHLQ(Imm(5), RAX)
MOVQ(RAX, Mem{Base: RSP}.Offset(-80))
// RDX = start of the first i block.
XORL(EDX, EDX)
JMP(LabelRef("LBB9_4"))
// End of an i block: advance both x row pointers by 8 rows and move to the
// next block start; exit once that start reaches a+7.
Label("LBB9_20")
{
MOVQ(Mem{Base: RSP}.Offset(-80), RAX)
ADDQ(RAX, Mem{Base: RSP}.Offset(-96))
ADDQ(RAX, Mem{Base: RSP}.Offset(-112))
MOVQ(Mem{Base: RSP}.Offset(-56), RAX)
MOVQ(RAX, RDX)
CMPQ(RAX, Mem{Base: RSP}.Offset(-72))
JAE(LabelRef("LBB9_21"))
}
// Start of an i block: block end = min(start+8, a), then sign-extended from
// its low 32 bits by CDQE. Skip the block if it is empty; otherwise reset
// the j-block counter, the running j limit (EDX = 256) and the j start (EAX).
Label("LBB9_4")
{
LEAQ(Mem{Base: RDX}.Offset(8), RAX)
MOVQ(Mem{Base: RSP}.Offset(-64), RCX)
CMPQ(RAX, RCX)
MOVQ(RAX, Mem{Base: RSP}.Offset(-56))
CMOVQGT(RCX, RAX)
CDQE()
MOVQ(RDX, Mem{Base: RSP}.Offset(-16))
MOVQ(RAX, Mem{Base: RSP}.Offset(24))
CMPQ(RDX, RAX)
JAE(LabelRef("LBB9_20"))
XORL(EAX, EAX)
MOVQ(RAX, Mem{Base: RSP}.Offset(-120))
MOVL(I32(256), EDX)
XORL(EAX, EAX)
JMP(LabelRef("LBB9_6"))
}
// End of a j block: bump the block counter and the running limit, reload
// the next j start, and leave the j loop once it reaches c+255.
Label("LBB9_19")
{
MOVQ(Mem{Base: RSP}.Offset(-120), RAX)
ADDL(Imm(1), EAX)
MOVQ(RAX, Mem{Base: RSP}.Offset(-120))
MOVQ(Mem{Base: RSP}.Offset(-24), RDX)
ADDQ(I32(256), RDX)
MOVQ(Mem{Base: RSP}.Offset(-32), RAX)
CMPQ(RAX, Mem{Base: RSP}.Offset(-40))
JAE(LabelRef("LBB9_20"))
}
// Start of a j block: clamp the block end to c, skip empty blocks, then
// derive the per-block pointers and vector-loop bounds.
Label("LBB9_6")
{
MOVL(EAX, EDI)
MOVQ(Mem{Base: RSP}.Offset(-128), RBP)
CMPQ(RBP, RDX)
MOVQ(RDX, Mem{Base: RSP}.Offset(-24))
CMOVQLT(RBP, RDX)
ADDQ(I32(256), RAX)
CMPQ(RBP, RAX)
MOVQ(RAX, RCX)
CMOVQLT(RBP, RCX)
MOVQ(RAX, Mem{Base: RSP}.Offset(-32))
CMOVQLT(RBP, RAX)
CMPL(EDI, EAX)
JGE(LabelRef("LBB9_19"))
// R14 = j block start; slot 0 = x vector pointer for this block.
MOVLQSX(EDI, R14)
MOVQ(Mem{Base: RSP}.Offset(-96), RDI)
LEAQ(Mem{Base: RDI}.Idx(R14, 4), RDI)
MOVQ(RDI, Mem{Base: RSP})
// R11 = (clamped limit - start) rounded down to 32: vector loop bound.
MOVLQSX(EDX, R11)
SUBQ(R14, R11)
ANDQ(I32(-32), R11)
// R12 = clamped block end - (block counter << 8): the block width.
MOVLQSX(ECX, R12)
MOVQ(Mem{Base: RSP}.Offset(-120), RCX)
SHLL(Imm(8), ECX)
MOVLQSX(ECX, RCX)
SUBQ(RCX, R12)
// RDX = j block end, used as the scalar loop bound.
MOVLQSX(EAX, RDX)
MOVQ(R12, RCX)
ANDQ(I32(-32), RCX)
MOVQ(Mem{Base: RSP}.Offset(-48), RAX)
LEAQ(Mem{Base: RAX}.Idx(R14, 4), RAX)
MOVQ(RAX, Mem{Base: RSP}.Offset(-8))
// R13 = first j handled by the scalar loop after the vector loop.
MOVQ(R14, R13)
MOVQ(RCX, Mem{Base: RSP}.Offset(64))
ADDQ(RCX, R13)
// RAX = start of the first k block.
XORL(EAX, EAX)
JMP(LabelRef("LBB9_8"))
}
// End of a k block: reload the next k start and leave the k loop once it
// reaches b+255.
Label("LBB9_18")
{
MOVQ(Mem{Base: RSP}.Offset(16), RAX)
CMPQ(RAX, Mem{Base: RSP}.Offset(8))
JAE(LabelRef("LBB9_19"))
}
// Start of a k block: RAX = min(start+256, b) is the block end; skip empty
// blocks. Then compute the z pointers for row kStart and reload the
// per-row x pointers (RCX, R10) and the first i (R8).
Label("LBB9_8")
{
MOVL(EAX, ECX)
ADDQ(I32(256), RAX)
MOVQ(Mem{Base: RSP}.Offset(-104), RDI)
CMPQ(RAX, RDI)
MOVQ(RAX, Mem{Base: RSP}.Offset(16))
CMOVQGT(RDI, RAX)
CMPL(ECX, EAX)
JGE(LabelRef("LBB9_18"))
MOVLQSX(ECX, RDI)
MOVQ(Mem{Base: RSP}.Offset(-128), RCX)
MOVQ(RDI, Mem{Base: RSP}.Offset(48))
IMULQ(RDI, RCX)
MOVQ(Mem{Base: RSP}.Offset(-88), RDI)
LEAQ(Mem{Base: RDI}.Idx(RCX, 4), RDI)
MOVQ(RDI, Mem{Base: RSP}.Offset(40))
MOVQ(Mem{Base: RSP}.Offset(-8), RDI)
LEAQ(Mem{Base: RDI}.Idx(RCX, 4), RCX)
MOVQ(RCX, Mem{Base: RSP}.Offset(32))
CDQE()
MOVQ(Mem{Base: RSP}.Offset(-112), RCX)
MOVQ(Mem{Base: RSP}, R10)
MOVQ(Mem{Base: RSP}.Offset(-16), R8)
JMP(LabelRef("LBB9_10"))
}
// End of row i: advance i and both x row pointers by one row; leave the
// i loop at the block end.
Label("LBB9_17")
{
MOVQ(Mem{Base: RSP}.Offset(56), R8)
ADDQ(Imm(1), R8)
ADDQ(RBX, R10)
ADDQ(RBX, RCX)
CMPQ(R8, Mem{Base: RSP}.Offset(24))
JAE(LabelRef("LBB9_18"))
}
// Start of row i: remember i, R8 = i*b, and reset the z pointers (R15, RDI)
// and k (R9) to the start of the k block.
Label("LBB9_10")
{
MOVQ(R8, Mem{Base: RSP}.Offset(56))
IMULQ(Mem{Base: RSP}.Offset(-104), R8)
MOVQ(Mem{Base: RSP}.Offset(40), R15)
MOVQ(Mem{Base: RSP}.Offset(32), RDI)
MOVQ(Mem{Base: RSP}.Offset(48), R9)
JMP(LabelRef("LBB9_11"))
}
// End of step k: advance k and both z pointers by one row; leave the k
// loop at the block end (RAX).
Label("LBB9_16")
{
ADDQ(Imm(1), R9)
ADDQ(RBX, RDI)
ADDQ(RBX, R15)
CMPQ(R9, RAX)
JGE(LabelRef("LBB9_17"))
}
// X0 = y[i*b + k]. Blocks narrower than 32 go straight to the scalar loop
// starting at j = R14; otherwise broadcast the scalar and run the vector
// loop with a block-relative index starting at 0.
Label("LBB9_11")
{
LEAQ(Mem{Base: R9}.Idx(R8, 1), RBP)
VMOVSS(Mem{Base: RSI}.Idx(RBP, 4), X0)
MOVQ(R14, RBP)
CMPQ(R12, Imm(32))
JB(LabelRef("LBB9_15"))
VBROADCASTSS(X0, Y1)
XORL(EBP, EBP)
}
// Vector loop: 32 float32 per iteration. Each FMA computes
// z-vector * Y1 + x-vector, and the result is stored back to x.
Label("LBB9_13")
{
VMOVUPS(Mem{Base: RDI}.Idx(RBP, 4).Offset(-96), Y2)
VMOVUPS(Mem{Base: RDI}.Idx(RBP, 4).Offset(-64), Y3)
VMOVUPS(Mem{Base: RDI}.Idx(RBP, 4).Offset(-32), Y4)
VMOVUPS(Mem{Base: RDI}.Idx(RBP, 4), Y5)
VFMADD213PS(Mem{Base: R10}.Idx(RBP, 4).Offset(-96), Y1, Y2)
VFMADD213PS(Mem{Base: R10}.Idx(RBP, 4).Offset(-64), Y1, Y3)
VFMADD213PS(Mem{Base: R10}.Idx(RBP, 4).Offset(-32), Y1, Y4)
VFMADD213PS(Mem{Base: R10}.Idx(RBP, 4), Y1, Y5)
VMOVUPS(Y2, Mem{Base: R10}.Idx(RBP, 4).Offset(-96))
VMOVUPS(Y3, Mem{Base: R10}.Idx(RBP, 4).Offset(-64))
VMOVUPS(Y4, Mem{Base: R10}.Idx(RBP, 4).Offset(-32))
VMOVUPS(Y5, Mem{Base: R10}.Idx(RBP, 4))
ADDQ(Imm(32), RBP)
CMPQ(R11, RBP)
JNE(LabelRef("LBB9_13"))
// Continue the scalar loop at absolute j = R13 unless the block width
// was a multiple of 32.
MOVQ(R13, RBP)
CMPQ(R12, Mem{Base: RSP}.Offset(64))
JE(LabelRef("LBB9_16"))
}
// Scalar tail: x[i][j] = z[k][j]*X0 + x[i][j] while j < RDX (block end).
Label("LBB9_15")
{
VMOVSS(Mem{Base: R15}.Idx(RBP, 4), X1)
VFMADD213SS(Mem{Base: RCX}.Idx(RBP, 4), X0, X1)
VMOVSS(X1, Mem{Base: RCX}.Idx(RBP, 4))
ADDQ(Imm(1), RBP)
CMPQ(RBP, RDX)
JL(LabelRef("LBB9_15"))
JMP(LabelRef("LBB9_16"))
}
// Epilogue: release the 72 reserved bytes, restore the saved registers and
// return.
Label("LBB9_21")
{
ADDQ(Imm(72), RSP)
POPQ(RBX)
POPQ(R12)
POPQ(R13)
POPQ(R14)
POPQ(R15)
POPQ(RBP)
VZEROUPPER()
RET()
}
}
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Go
1
https://gitee.com/quant1x/gox.git
git@gitee.com:quant1x/gox.git
quant1x
gox
gox
v1.14.0

搜索帮助

23e8dbc6 1850385 7e0993f3 1850385