diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index 4ff90c1f7b3a0c89456829206f5e4f71ddf3fc83..89462f8a14c1441a4150907279163196eee607ea 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -32,10 +32,10 @@ foreach (tgt ${BOLT_TARGETS_TO_BUILD}) endforeach() set(BOLT_ENABLE_RUNTIME_default OFF) -if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" +if ((CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" + OR CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") AND (CMAKE_SYSTEM_NAME STREQUAL "Linux" - OR CMAKE_SYSTEM_NAME STREQUAL "Darwin") - AND "X86" IN_LIST BOLT_TARGETS_TO_BUILD) + OR CMAKE_SYSTEM_NAME STREQUAL "Darwin")) set(BOLT_ENABLE_RUNTIME_default ON) endif() option(BOLT_ENABLE_RUNTIME "Enable BOLT runtime" ${BOLT_ENABLE_RUNTIME_default}) diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index db3f7e7f14f7a4f558400fafd955d2712b91ef6a..cd4676f370e64e27dbfad76e21e2d839f8a8b1b4 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -498,9 +498,9 @@ public: } /// Create increment contents of target by 1 for Instrumentation - virtual InstructionListType createInstrIncMemory(const MCSymbol *Target, - MCContext *Ctx, - bool IsLeaf) const { + virtual InstructionListType + createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf, + unsigned CodePointerSize) const { llvm_unreachable("not implemented"); return InstructionListType(); } @@ -1597,18 +1597,11 @@ public: return false; } - virtual void createLoadImmediate(MCInst &Inst, const MCPhysReg Dest, - uint32_t Imm) const { + virtual InstructionListType createLoadImmediate(const MCPhysReg Dest, + uint64_t Imm) const { llvm_unreachable("not implemented"); } - /// Create instruction to increment contents of target by 1 - virtual bool createIncMemory(MCInst &Inst, const MCSymbol *Target, - MCContext *Ctx) const { - llvm_unreachable("not implemented"); - return false; - } - /// Create a fragment of code (sequence of instructions) 
that load a 32-bit /// address from memory, zero-extends it to 64 and jump to it (indirect jump). virtual bool @@ -1969,7 +1962,7 @@ public: } virtual InstructionListType createSymbolTrampoline(const MCSymbol *TgtSym, - MCContext *Ctx) const { + MCContext *Ctx) { llvm_unreachable("not implemented"); return InstructionListType(); } diff --git a/bolt/lib/Passes/Instrumentation.cpp b/bolt/lib/Passes/Instrumentation.cpp index fae67704b4eabdce125d7f4372794435c0881436..98044599d497e71bf35dbf9ed1ce784a3b5271b8 100644 --- a/bolt/lib/Passes/Instrumentation.cpp +++ b/bolt/lib/Passes/Instrumentation.cpp @@ -176,7 +176,8 @@ Instrumentation::createInstrumentationSnippet(BinaryContext &BC, bool IsLeaf) { auto L = BC.scopeLock(); MCSymbol *Label = BC.Ctx->createNamedTempSymbol("InstrEntry"); Summary->Counters.emplace_back(Label); - return BC.MIB->createInstrIncMemory(Label, BC.Ctx.get(), IsLeaf); + return BC.MIB->createInstrIncMemory(Label, BC.Ctx.get(), IsLeaf, + BC.AsmInfo->getCodePointerSize()); } // Helper instruction sequence insertion function @@ -504,9 +505,6 @@ void Instrumentation::instrumentFunction(BinaryFunction &Function, } void Instrumentation::runOnFunctions(BinaryContext &BC) { - if (!BC.isX86()) - return; - const unsigned Flags = BinarySection::getFlags(/*IsReadOnly=*/false, /*IsText=*/false, /*IsAllocatable=*/true); diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index d109a5d52e66e6dc0a5a2f69fb3f4926c8a2ce38..777a1e6cc743ba54fd37e26a14073ad8f1ec6740 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -16,6 +16,9 @@ #include "Utils/AArch64BaseInfo.h" #include "bolt/Core/MCPlusBuilder.h" #include "llvm/BinaryFormat/ELF.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCInstBuilder.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/Debug.h" @@ 
-28,6 +31,100 @@ using namespace bolt; namespace { +static void getSystemFlag(MCInst &Inst, MCPhysReg RegName) { + Inst.setOpcode(AArch64::MRS); + Inst.clear(); + Inst.addOperand(MCOperand::createReg(RegName)); + Inst.addOperand(MCOperand::createImm(AArch64SysReg::NZCV)); +} + +static void setSystemFlag(MCInst &Inst, MCPhysReg RegName) { + Inst.setOpcode(AArch64::MSR); + Inst.clear(); + Inst.addOperand(MCOperand::createImm(AArch64SysReg::NZCV)); + Inst.addOperand(MCOperand::createReg(RegName)); +} + +static void createPushRegisters(MCInst &Inst, MCPhysReg Reg1, MCPhysReg Reg2) { + Inst.clear(); + unsigned NewOpcode = AArch64::STPXpre; + Inst.setOpcode(NewOpcode); + Inst.addOperand(MCOperand::createReg(AArch64::SP)); + Inst.addOperand(MCOperand::createReg(Reg1)); + Inst.addOperand(MCOperand::createReg(Reg2)); + Inst.addOperand(MCOperand::createReg(AArch64::SP)); + Inst.addOperand(MCOperand::createImm(-2)); +} + +static void createPopRegisters(MCInst &Inst, MCPhysReg Reg1, MCPhysReg Reg2) { + Inst.clear(); + unsigned NewOpcode = AArch64::LDPXpost; + Inst.setOpcode(NewOpcode); + Inst.addOperand(MCOperand::createReg(AArch64::SP)); + Inst.addOperand(MCOperand::createReg(Reg1)); + Inst.addOperand(MCOperand::createReg(Reg2)); + Inst.addOperand(MCOperand::createReg(AArch64::SP)); + Inst.addOperand(MCOperand::createImm(2)); +} + +static void loadReg(MCInst &Inst, MCPhysReg To, MCPhysReg From) { + Inst.setOpcode(AArch64::LDRXui); + Inst.clear(); + if (From == AArch64::SP) { + Inst.setOpcode(AArch64::LDRXpost); + Inst.addOperand(MCOperand::createReg(From)); + Inst.addOperand(MCOperand::createReg(To)); + Inst.addOperand(MCOperand::createReg(From)); + Inst.addOperand(MCOperand::createImm(16)); + } else { + Inst.addOperand(MCOperand::createReg(To)); + Inst.addOperand(MCOperand::createReg(From)); + Inst.addOperand(MCOperand::createImm(0)); + } +} + +static void storeReg(MCInst &Inst, MCPhysReg From, MCPhysReg To) { + Inst.setOpcode(AArch64::STRXui); + Inst.clear(); + if (To == 
AArch64::SP) { + Inst.setOpcode(AArch64::STRXpre); + Inst.addOperand(MCOperand::createReg(To)); + Inst.addOperand(MCOperand::createReg(From)); + Inst.addOperand(MCOperand::createReg(To)); + Inst.addOperand(MCOperand::createImm(-16)); + } else { + Inst.addOperand(MCOperand::createReg(From)); + Inst.addOperand(MCOperand::createReg(To)); + Inst.addOperand(MCOperand::createImm(0)); + } +} + +static void atomicAdd(MCInst &Inst, MCPhysReg RegTo, MCPhysReg RegCnt) { + // NOTE: Supports only ARM with LSE extension + Inst.setOpcode(AArch64::LDADDX); + Inst.clear(); + Inst.addOperand(MCOperand::createReg(AArch64::XZR)); + Inst.addOperand(MCOperand::createReg(RegCnt)); + Inst.addOperand(MCOperand::createReg(RegTo)); +} + +static void createMovz(MCInst &Inst, MCPhysReg Reg, uint64_t Imm) { + assert(Imm <= UINT16_MAX && "Invalid Imm size"); + Inst.clear(); + Inst.setOpcode(AArch64::MOVZXi); + Inst.addOperand(MCOperand::createReg(Reg)); + Inst.addOperand(MCOperand::createImm(Imm & 0xFFFF)); + Inst.addOperand(MCOperand::createImm(0)); +} + +static InstructionListType createIncMemory(MCPhysReg RegTo, MCPhysReg RegTmp) { + InstructionListType Insts; + Insts.emplace_back(); + createMovz(Insts.back(), RegTmp, 1); + Insts.emplace_back(); + atomicAdd(Insts.back(), RegTo, RegTmp); + return Insts; +} class AArch64MCPlusBuilder : public MCPlusBuilder { public: AArch64MCPlusBuilder(const MCInstrAnalysis *Analysis, const MCInstrInfo *Info, @@ -205,6 +302,40 @@ public: return Inst.getOpcode() == AArch64::BLR; } + MCPhysReg getSpRegister(int Size) const { + switch (Size) { + case 4: + return AArch64::WSP; + case 8: + return AArch64::SP; + default: + llvm_unreachable("Unexpected size"); + } + } + + MCPhysReg getIntArgRegister(unsigned ArgNo) const override { + switch (ArgNo) { + case 0: + return AArch64::X0; + case 1: + return AArch64::X1; + case 2: + return AArch64::X2; + case 3: + return AArch64::X3; + case 4: + return AArch64::X4; + case 5: + return AArch64::X5; + case 6: + return 
AArch64::X6; + case 7: + return AArch64::X7; + default: + return getNoRegister(); + } + } + bool hasPCRelOperand(const MCInst &Inst) const override { // ADRP is blacklisted and is an exception. Even though it has a // PC-relative operand, this operand is not a complete symbol reference @@ -816,14 +947,25 @@ public: int getUncondBranchEncodingSize() const override { return 28; } + InstructionListType createCmpJE(MCPhysReg RegNo, int64_t Imm, + const MCSymbol *Target, + MCContext *Ctx) const override { + InstructionListType Code; + Code.emplace_back(MCInstBuilder(AArch64::SUBSXri) + .addReg(RegNo) + .addReg(RegNo) + .addImm(Imm) + .addImm(0)); + Code.emplace_back(MCInstBuilder(AArch64::Bcc) + .addImm(AArch64CC::EQ) + .addExpr(MCSymbolRefExpr::create( + Target, MCSymbolRefExpr::VK_None, *Ctx))); + return Code; + } + bool createTailCall(MCInst &Inst, const MCSymbol *Target, MCContext *Ctx) override { - Inst.setOpcode(AArch64::B); - Inst.addOperand(MCOperand::createExpr(getTargetExprFor( - Inst, MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx), - *Ctx, 0))); - setTailCall(Inst); - return true; + return createDirectCall(Inst, Target, Ctx, /*IsTailCall*/ true); } void createLongTailCall(InstructionListType &Seq, const MCSymbol *Target, @@ -872,6 +1014,18 @@ public: bool isStore(const MCInst &Inst) const override { return false; } + bool createDirectCall(MCInst &Inst, const MCSymbol *Target, MCContext *Ctx, + bool IsTailCall) override { + Inst.setOpcode(IsTailCall ? 
AArch64::B : AArch64::BL); + Inst.clear(); + Inst.addOperand(MCOperand::createExpr(getTargetExprFor( + Inst, MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx), + *Ctx, 0))); + if (IsTailCall) + convertJmpToTailCall(Inst); + return true; + } + bool analyzeBranch(InstructionIterator Begin, InstructionIterator End, const MCSymbol *&TBB, const MCSymbol *&FBB, MCInst *&CondBranch, @@ -1139,6 +1293,242 @@ public: return true; } + bool createStackPointerIncrement( + MCInst &Inst, int Size, + bool NoFlagsClobber = false /*unused for AArch64*/) const override { + Inst.setOpcode(AArch64::SUBXri); + Inst.clear(); + Inst.addOperand(MCOperand::createReg(AArch64::SP)); + Inst.addOperand(MCOperand::createReg(AArch64::SP)); + Inst.addOperand(MCOperand::createImm(Size)); + Inst.addOperand(MCOperand::createImm(0)); + return true; + } + + bool createStackPointerDecrement( + MCInst &Inst, int Size, + bool NoFlagsClobber = false /*unused for AArch64*/) const override { + Inst.setOpcode(AArch64::ADDXri); + Inst.clear(); + Inst.addOperand(MCOperand::createReg(AArch64::SP)); + Inst.addOperand(MCOperand::createReg(AArch64::SP)); + Inst.addOperand(MCOperand::createImm(Size)); + Inst.addOperand(MCOperand::createImm(0)); + return true; + } + + void createIndirectBranch(MCInst &Inst, MCPhysReg MemBaseReg, + int64_t Disp) const { + Inst.setOpcode(AArch64::BR); + Inst.addOperand(MCOperand::createReg(MemBaseReg)); + } + + InstructionListType createInstrumentedIndCallHandlerExitBB() const override { + InstructionListType Insts(5); + // Code sequence for instrumented indirect call handler: + // msr nzcv, x1 + // ldp x0, x1, [sp], #16 + // ldr x16, [sp], #16 + // ldp x0, x1, [sp], #16 + // br x16 + setSystemFlag(Insts[0], AArch64::X1); + createPopRegisters(Insts[1], AArch64::X0, AArch64::X1); + // Here we load address of the next function which should be called in the + // original binary to X16 register. Writing to X16 is permitted without + // needing to restore. 
+ loadReg(Insts[2], AArch64::X16, AArch64::SP); + createPopRegisters(Insts[3], AArch64::X0, AArch64::X1); + createIndirectBranch(Insts[4], AArch64::X16, 0); + return Insts; + } + + InstructionListType + createInstrumentedIndTailCallHandlerExitBB() const override { + return createInstrumentedIndCallHandlerExitBB(); + } + + InstructionListType createGetter(MCContext *Ctx, const char *name) const { + InstructionListType Insts(4); + MCSymbol *Locs = Ctx->getOrCreateSymbol(name); + InstructionListType Addr = materializeAddress(Locs, Ctx, AArch64::X0); + std::copy(Addr.begin(), Addr.end(), Insts.begin()); + assert(Addr.size() == 2 && "Invalid Addr size"); + loadReg(Insts[2], AArch64::X0, AArch64::X0); + createReturn(Insts[3]); + return Insts; + } + + InstructionListType createNumCountersGetter(MCContext *Ctx) const override { + return createGetter(Ctx, "__bolt_num_counters"); + } + + InstructionListType + createInstrLocationsGetter(MCContext *Ctx) const override { + return createGetter(Ctx, "__bolt_instr_locations"); + } + + InstructionListType createInstrTablesGetter(MCContext *Ctx) const override { + return createGetter(Ctx, "__bolt_instr_tables"); + } + + InstructionListType createInstrNumFuncsGetter(MCContext *Ctx) const override { + return createGetter(Ctx, "__bolt_instr_num_funcs"); + } + + void convertIndirectCallToLoad(MCInst &Inst, MCPhysReg Reg) override { + bool IsTailCall = isTailCall(Inst); + if (IsTailCall) + removeAnnotation(Inst, MCPlus::MCAnnotation::kTailCall); + if (Inst.getOpcode() == AArch64::BR || Inst.getOpcode() == AArch64::BLR) { + Inst.setOpcode(AArch64::ORRXrs); + Inst.insert(Inst.begin(), MCOperand::createReg(Reg)); + Inst.insert(Inst.begin() + 1, MCOperand::createReg(AArch64::XZR)); + Inst.insert(Inst.begin() + 3, MCOperand::createImm(0)); + return; + } + llvm_unreachable("not implemented"); + } + + InstructionListType createLoadImmediate(const MCPhysReg Dest, + uint64_t Imm) const override { + InstructionListType Insts(4); + int Shift = 48; 
+ for (int I = 0; I < 4; I++, Shift -= 16) { + Insts[I].setOpcode(AArch64::MOVKXi); + Insts[I].addOperand(MCOperand::createReg(Dest)); + Insts[I].addOperand(MCOperand::createReg(Dest)); + Insts[I].addOperand(MCOperand::createImm((Imm >> Shift) & 0xFFFF)); + Insts[I].addOperand(MCOperand::createImm(Shift)); + } + return Insts; + } + + void createIndirectCallInst(MCInst &Inst, bool IsTailCall, + MCPhysReg Reg) const { + Inst.clear(); + Inst.setOpcode(IsTailCall ? AArch64::BR : AArch64::BLR); + Inst.addOperand(MCOperand::createReg(Reg)); + } + + InstructionListType createInstrumentedIndirectCall(MCInst &&CallInst, + MCSymbol *HandlerFuncAddr, + int CallSiteID, + MCContext *Ctx) override { + InstructionListType Insts; + // Code sequence used to enter indirect call instrumentation helper: + // stp x0, x1, [sp, #-16]! createPushRegisters + // mov target x0 convertIndirectCallToLoad -> orr x0 target xzr + // mov x1 CallSiteID createLoadImmediate -> + // movk x1, #0x0, lsl #48 + // movk x1, #0x0, lsl #32 + // movk x1, #0x0, lsl #16 + // movk x1, #0x0 + // stp x0, x1, [sp, #-16]! 
+ // bl *HandlerFuncAddr createIndirectCall -> + // adr x0 *HandlerFuncAddr -> adrp + add + // blr x0 + Insts.emplace_back(); + createPushRegisters(Insts.back(), AArch64::X0, AArch64::X1); + Insts.emplace_back(CallInst); + convertIndirectCallToLoad(Insts.back(), AArch64::X0); + InstructionListType LoadImm = + createLoadImmediate(getIntArgRegister(1), CallSiteID); + Insts.insert(Insts.end(), LoadImm.begin(), LoadImm.end()); + Insts.emplace_back(); + createPushRegisters(Insts.back(), AArch64::X0, AArch64::X1); + Insts.resize(Insts.size() + 2); + InstructionListType Addr = + materializeAddress(HandlerFuncAddr, Ctx, AArch64::X0); + assert(Addr.size() == 2 && "Invalid Addr size"); + std::copy(Addr.begin(), Addr.end(), Insts.end() - Addr.size()); + Insts.emplace_back(); + createIndirectCallInst(Insts.back(), isTailCall(CallInst), AArch64::X0); + + // Carry over metadata including tail call marker if present. + stripAnnotations(Insts.back()); + moveAnnotations(std::move(CallInst), Insts.back()); + + return Insts; + } + + InstructionListType + createInstrumentedIndCallHandlerEntryBB(const MCSymbol *InstrTrampoline, + const MCSymbol *IndCallHandler, + MCContext *Ctx) override { + // Code sequence used to check whether InstrTrampoline was initialized + // and call it if so, returns via IndCallHandler + // stp x0, x1, [sp, #-16]! + // mrs x1, nzcv + // adr x0, InstrTrampoline -> adrp + add + // ldr x0, [x0] + // subs x0, x0, #0x0 + // b.eq IndCallHandler + // str x30, [sp, #-16]! 
+ // blr x0 + // ldr x30, [sp], #16 + // b IndCallHandler + InstructionListType Insts; + Insts.emplace_back(); + createPushRegisters(Insts.back(), AArch64::X0, AArch64::X1); + Insts.emplace_back(); + getSystemFlag(Insts.back(), getIntArgRegister(1)); + Insts.emplace_back(); + Insts.emplace_back(); + InstructionListType Addr = + materializeAddress(InstrTrampoline, Ctx, AArch64::X0); + std::copy(Addr.begin(), Addr.end(), Insts.end() - Addr.size()); + assert(Addr.size() == 2 && "Invalid Addr size"); + Insts.emplace_back(); + loadReg(Insts.back(), AArch64::X0, AArch64::X0); + InstructionListType cmpJmp = + createCmpJE(AArch64::X0, 0, IndCallHandler, Ctx); + Insts.insert(Insts.end(), cmpJmp.begin(), cmpJmp.end()); + Insts.emplace_back(); + storeReg(Insts.back(), AArch64::LR, AArch64::SP); + Insts.emplace_back(); + Insts.back().setOpcode(AArch64::BLR); + Insts.back().addOperand(MCOperand::createReg(AArch64::X0)); + Insts.emplace_back(); + loadReg(Insts.back(), AArch64::LR, AArch64::SP); + Insts.emplace_back(); + createDirectCall(Insts.back(), IndCallHandler, Ctx, /*IsTailCall*/ true); + return Insts; + } + + InstructionListType + createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf, + unsigned CodePointerSize) const override { + unsigned int I = 0; + InstructionListType Instrs(IsLeaf ? 
12 : 10); + + if (IsLeaf) + createStackPointerIncrement(Instrs[I++], 128); + createPushRegisters(Instrs[I++], AArch64::X0, AArch64::X1); + getSystemFlag(Instrs[I++], AArch64::X1); + InstructionListType Addr = materializeAddress(Target, Ctx, AArch64::X0); + assert(Addr.size() == 2 && "Invalid Addr size"); + std::copy(Addr.begin(), Addr.end(), Instrs.begin() + I); + I += Addr.size(); + storeReg(Instrs[I++], AArch64::X2, AArch64::SP); + InstructionListType Insts = createIncMemory(AArch64::X0, AArch64::X2); + assert(Insts.size() == 2 && "Invalid Insts size"); + std::copy(Insts.begin(), Insts.end(), Instrs.begin() + I); + I += Insts.size(); + loadReg(Instrs[I++], AArch64::X2, AArch64::SP); + setSystemFlag(Instrs[I++], AArch64::X1); + createPopRegisters(Instrs[I++], AArch64::X0, AArch64::X1); + if (IsLeaf) + createStackPointerDecrement(Instrs[I++], 128); + return Instrs; + } + + InstructionListType createSymbolTrampoline(const MCSymbol *TgtSym, + MCContext *Ctx) override { + InstructionListType Insts; + createShortJmp(Insts, TgtSym, Ctx, /*IsTailCall*/ true); + return Insts; + } + InstructionListType materializeAddress(const MCSymbol *Target, MCContext *Ctx, MCPhysReg RegName, int64_t Addend = 0) const override { diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp index 3ee161d0be7b4a032b660eb5a05734b3965c4a7e..265868fbddd41a604cc42192966b1bb13962cc46 100644 --- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp @@ -61,6 +61,25 @@ bool isADDri(const MCInst &Inst) { Inst.getOpcode() == X86::ADD64ri8; } +// Create instruction to increment contents of target by 1 +static InstructionListType createIncMemory(const MCSymbol *Target, + MCContext *Ctx) { + InstructionListType Insts; + Insts.emplace_back(); + Insts.back().setOpcode(X86::LOCK_INC64m); + Insts.back().clear(); + Insts.back().addOperand(MCOperand::createReg(X86::RIP)); // BaseReg + Insts.back().addOperand(MCOperand::createImm(1)); // ScaleAmt + 
Insts.back().addOperand(MCOperand::createReg(X86::NoRegister)); // IndexReg + + Insts.back().addOperand(MCOperand::createExpr( + MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, + *Ctx))); // Displacement + Insts.back().addOperand( + MCOperand::createReg(X86::NoRegister)); // AddrSegmentReg + return Insts; +} + #define GET_INSTRINFO_OPERAND_TYPES_ENUM #define GET_INSTRINFO_OPERAND_TYPE #define GET_INSTRINFO_MEM_OPERAND_SIZE @@ -2309,28 +2328,15 @@ public: return true; } - void createLoadImmediate(MCInst &Inst, const MCPhysReg Dest, - uint32_t Imm) const override { - Inst.setOpcode(X86::MOV64ri32); - Inst.clear(); - Inst.addOperand(MCOperand::createReg(Dest)); - Inst.addOperand(MCOperand::createImm(Imm)); - } - - bool createIncMemory(MCInst &Inst, const MCSymbol *Target, - MCContext *Ctx) const override { - - Inst.setOpcode(X86::LOCK_INC64m); - Inst.clear(); - Inst.addOperand(MCOperand::createReg(X86::RIP)); // BaseReg - Inst.addOperand(MCOperand::createImm(1)); // ScaleAmt - Inst.addOperand(MCOperand::createReg(X86::NoRegister)); // IndexReg - - Inst.addOperand(MCOperand::createExpr( - MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, - *Ctx))); // Displacement - Inst.addOperand(MCOperand::createReg(X86::NoRegister)); // AddrSegmentReg - return true; + InstructionListType createLoadImmediate(const MCPhysReg Dest, + uint64_t Imm) const override { + InstructionListType Insts; + Insts.emplace_back(); + Insts.back().setOpcode(X86::MOV64ri32); + Insts.back().clear(); + Insts.back().addOperand(MCOperand::createReg(Dest)); + Insts.back().addOperand(MCOperand::createImm(Imm)); + return Insts; } bool createIJmp32Frag(SmallVectorImpl &Insts, @@ -3057,9 +3063,9 @@ public: Inst.clear(); } - InstructionListType createInstrIncMemory(const MCSymbol *Target, - MCContext *Ctx, - bool IsLeaf) const override { + InstructionListType + createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf, + unsigned CodePointerSize) const override { 
InstructionListType Instrs(IsLeaf ? 13 : 11); unsigned int I = 0; @@ -3079,7 +3085,10 @@ public: createClearRegWithNoEFlagsUpdate(Instrs[I++], X86::RAX, 8); createX86SaveOVFlagToRegister(Instrs[I++], X86::AL); // LOCK INC - createIncMemory(Instrs[I++], Target, Ctx); + InstructionListType IncMem = createIncMemory(Target, Ctx); + assert(IncMem.size() == 1 && "Invalid IncMem size"); + std::copy(IncMem.begin(), IncMem.end(), Instrs.begin() + I); + I += IncMem.size(); // POPF createAddRegImm(Instrs[I++], X86::AL, 127, 1); createPopRegister(Instrs[I++], X86::RAX, 8); @@ -3153,8 +3162,8 @@ public: } Insts.emplace_back(); createPushRegister(Insts.back(), TempReg, 8); - Insts.emplace_back(); - createLoadImmediate(Insts.back(), TempReg, CallSiteID); + InstructionListType LoadImm = createLoadImmediate(TempReg, CallSiteID); + Insts.insert(Insts.end(), LoadImm.begin(), LoadImm.end()); Insts.emplace_back(); createPushRegister(Insts.back(), TempReg, 8); @@ -3264,7 +3273,7 @@ public: } InstructionListType createSymbolTrampoline(const MCSymbol *TgtSym, - MCContext *Ctx) const override { + MCContext *Ctx) override { InstructionListType Insts(1); createUncondBranch(Insts[0], TgtSym, Ctx); return Insts; diff --git a/bolt/runtime/CMakeLists.txt b/bolt/runtime/CMakeLists.txt index 8472ce00b41378e0413d2645e77d690094b99e96..191d2b895b926d0018bbaaaf3195da452adfbbf1 100644 --- a/bolt/runtime/CMakeLists.txt +++ b/bolt/runtime/CMakeLists.txt @@ -27,8 +27,20 @@ set(BOLT_RT_FLAGS -fno-exceptions -fno-rtti -fno-stack-protector - -mno-sse - -fPIC) + -fPIC + -mgeneral-regs-only) +if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64") + set(BOLT_RT_FLAGS ${BOLT_RT_FLAGS} "-mno-sse") +endif() + +if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + include(CheckCXXCompilerFlag) + CHECK_CXX_COMPILER_FLAG("-mno-outline-atomics" + COMPILER_SUPPORTS_MNO_OUTLINE_ATOMICS) + if (COMPILER_SUPPORTS_MNO_OUTLINE_ATOMICS) + set(BOLT_RT_FLAGS ${BOLT_RT_FLAGS} "-mno-outline-atomics") + endif() +endif() # Don't let the compiler 
think it can create calls to standard libs target_compile_options(bolt_rt_instr PRIVATE ${BOLT_RT_FLAGS}) @@ -39,7 +51,7 @@ target_include_directories(bolt_rt_hugify PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) install(TARGETS bolt_rt_instr DESTINATION "lib${LLVM_LIBDIR_SUFFIX}") install(TARGETS bolt_rt_hugify DESTINATION "lib${LLVM_LIBDIR_SUFFIX}") -if (CMAKE_CXX_COMPILER_ID MATCHES ".*Clang.*") +if (CMAKE_CXX_COMPILER_ID MATCHES ".*Clang.*" AND CMAKE_SYSTEM_NAME STREQUAL "Darwin") add_library(bolt_rt_instr_osx STATIC instr.cpp ${CMAKE_CURRENT_BINARY_DIR}/config.h diff --git a/bolt/runtime/common.h b/bolt/runtime/common.h index 9e6f1756c57072feb5e56660104ec9ed1385b6e8..9b9965bae524eb798268fa43680d3f16958f017f 100644 --- a/bolt/runtime/common.h +++ b/bolt/runtime/common.h @@ -6,10 +6,6 @@ // //===----------------------------------------------------------------------===// -#if !defined(__x86_64__) -#error "For x86_64 only" -#endif - #if defined(__linux__) #include @@ -44,44 +40,6 @@ typedef int int32_t; #error "For Linux or MacOS only" #endif -// Save all registers while keeping 16B stack alignment -#define SAVE_ALL \ - "push %%rax\n" \ - "push %%rbx\n" \ - "push %%rcx\n" \ - "push %%rdx\n" \ - "push %%rdi\n" \ - "push %%rsi\n" \ - "push %%rbp\n" \ - "push %%r8\n" \ - "push %%r9\n" \ - "push %%r10\n" \ - "push %%r11\n" \ - "push %%r12\n" \ - "push %%r13\n" \ - "push %%r14\n" \ - "push %%r15\n" \ - "sub $8, %%rsp\n" - -// Mirrors SAVE_ALL -#define RESTORE_ALL \ - "add $8, %%rsp\n" \ - "pop %%r15\n" \ - "pop %%r14\n" \ - "pop %%r13\n" \ - "pop %%r12\n" \ - "pop %%r11\n" \ - "pop %%r10\n" \ - "pop %%r9\n" \ - "pop %%r8\n" \ - "pop %%rbp\n" \ - "pop %%rsi\n" \ - "pop %%rdi\n" \ - "pop %%rdx\n" \ - "pop %%rcx\n" \ - "pop %%rbx\n" \ - "pop %%rax\n" - #define PROT_READ 0x1 /* Page can be read. */ #define PROT_WRITE 0x2 /* Page can be written. */ #define PROT_EXEC 0x4 /* Page can be executed. 
*/ @@ -165,141 +123,41 @@ int memcmp(const void *s1, const void *s2, size_t n) { // Anonymous namespace covering everything but our library entry point namespace { -// Get the difference between runtime addrress of .text section and -// static address in section header table. Can be extracted from arbitrary -// pc value recorded at runtime to get the corresponding static address, which -// in turn can be used to search for indirect call description. Needed because -// indirect call descriptions are read-only non-relocatable data. -uint64_t getTextBaseAddress() { - uint64_t DynAddr; - uint64_t StaticAddr; - __asm__ volatile("leaq __hot_end(%%rip), %0\n\t" - "movabsq $__hot_end, %1\n\t" - : "=r"(DynAddr), "=r"(StaticAddr)); - return DynAddr - StaticAddr; -} - -constexpr uint32_t BufSize = 10240; - -#define _STRINGIFY(x) #x -#define STRINGIFY(x) _STRINGIFY(x) - -uint64_t __read(uint64_t fd, const void *buf, uint64_t count) { - uint64_t ret; -#if defined(__APPLE__) -#define READ_SYSCALL 0x2000003 -#else -#define READ_SYSCALL 0 -#endif - __asm__ __volatile__("movq $" STRINGIFY(READ_SYSCALL) ", %%rax\n" - "syscall\n" - : "=a"(ret) - : "D"(fd), "S"(buf), "d"(count) - : "cc", "rcx", "r11", "memory"); - return ret; -} - -uint64_t __write(uint64_t fd, const void *buf, uint64_t count) { - uint64_t ret; -#if defined(__APPLE__) -#define WRITE_SYSCALL 0x2000004 -#else -#define WRITE_SYSCALL 1 -#endif - __asm__ __volatile__("movq $" STRINGIFY(WRITE_SYSCALL) ", %%rax\n" - "syscall\n" - : "=a"(ret) - : "D"(fd), "S"(buf), "d"(count) - : "cc", "rcx", "r11", "memory"); - return ret; -} - -void *__mmap(uint64_t addr, uint64_t size, uint64_t prot, uint64_t flags, - uint64_t fd, uint64_t offset) { -#if defined(__APPLE__) -#define MMAP_SYSCALL 0x20000c5 -#else -#define MMAP_SYSCALL 9 -#endif - void *ret; - register uint64_t r8 asm("r8") = fd; - register uint64_t r9 asm("r9") = offset; - register uint64_t r10 asm("r10") = flags; - __asm__ __volatile__("movq $" STRINGIFY(MMAP_SYSCALL) ", 
%%rax\n" - "syscall\n" - : "=a"(ret) - : "D"(addr), "S"(size), "d"(prot), "r"(r10), "r"(r8), - "r"(r9) - : "cc", "rcx", "r11", "memory"); - return ret; -} - -uint64_t __munmap(void *addr, uint64_t size) { -#if defined(__APPLE__) -#define MUNMAP_SYSCALL 0x2000049 -#else -#define MUNMAP_SYSCALL 11 -#endif - uint64_t ret; - __asm__ __volatile__("movq $" STRINGIFY(MUNMAP_SYSCALL) ", %%rax\n" - "syscall\n" - : "=a"(ret) - : "D"(addr), "S"(size) - : "cc", "rcx", "r11", "memory"); - return ret; -} +struct dirent64 { + uint64_t d_ino; /* Inode number */ + int64_t d_off; /* Offset to next linux_dirent */ + unsigned short d_reclen; /* Length of this linux_dirent */ + unsigned char d_type; + char d_name[]; /* Filename (null-terminated) */ + /* length is actually (d_reclen - 2 - + offsetof(struct linux_dirent, d_name)) */ +}; -#define SIG_BLOCK 0 -#define SIG_UNBLOCK 1 -#define SIG_SETMASK 2 +/* Length of the entries in `struct utsname' is 65. */ +#define _UTSNAME_LENGTH 65 -static const uint64_t MaskAllSignals[] = {-1ULL}; +struct UtsNameTy { + char sysname[_UTSNAME_LENGTH]; /* Operating system name (e.g., "Linux") */ + char nodename[_UTSNAME_LENGTH]; /* Name within "some implementation-defined + network" */ + char release[_UTSNAME_LENGTH]; /* Operating system release (e.g., "2.6.28") */ + char version[_UTSNAME_LENGTH]; /* Operating system version */ + char machine[_UTSNAME_LENGTH]; /* Hardware identifier */ + char domainname[_UTSNAME_LENGTH]; /* NIS or YP domain name */ +}; -uint64_t __sigprocmask(int how, const void *set, void *oldset) { -#if defined(__APPLE__) -#define SIGPROCMASK_SYSCALL 0x2000030 -#else -#define SIGPROCMASK_SYSCALL 14 -#endif - uint64_t ret; - register long r10 asm("r10") = sizeof(uint64_t); - __asm__ __volatile__("movq $" STRINGIFY(SIGPROCMASK_SYSCALL) ", %%rax\n" - "syscall\n" - : "=a"(ret) - : "D"(how), "S"(set), "d"(oldset), "r"(r10) - : "cc", "rcx", "r11", "memory"); - return ret; -} +struct timespec { + uint64_t tv_sec; /* seconds */ + uint64_t 
tv_nsec; /* nanoseconds */ +}; -uint64_t __getpid() { - uint64_t ret; -#if defined(__APPLE__) -#define GETPID_SYSCALL 20 +#if defined(__aarch64__) +#include "sys_aarch64.h" #else -#define GETPID_SYSCALL 39 +#include "sys_x86_64.h" #endif - __asm__ __volatile__("movq $" STRINGIFY(GETPID_SYSCALL) ", %%rax\n" - "syscall\n" - : "=a"(ret) - : - : "cc", "rcx", "r11", "memory"); - return ret; -} -uint64_t __exit(uint64_t code) { -#if defined(__APPLE__) -#define EXIT_SYSCALL 0x2000001 -#else -#define EXIT_SYSCALL 231 -#endif - uint64_t ret; - __asm__ __volatile__("movq $" STRINGIFY(EXIT_SYSCALL) ", %%rax\n" - "syscall\n" - : "=a"(ret) - : "D"(code) - : "cc", "rcx", "r11", "memory"); - return ret; -} +constexpr uint32_t BufSize = 10240; // Helper functions for writing strings to the .fdata file. We intentionally // avoid using libc names to make it clear it is our impl. @@ -415,219 +273,6 @@ static bool scanUInt32(const char *&Buf, const char *End, uint32_t &Ret) { return false; } -#if !defined(__APPLE__) -// We use a stack-allocated buffer for string manipulation in many pieces of -// this code, including the code that prints each line of the fdata file. This -// buffer needs to accomodate large function names, but shouldn't be arbitrarily -// large (dynamically allocated) for simplicity of our memory space usage. - -// Declare some syscall wrappers we use throughout this code to avoid linking -// against system libc. 
-uint64_t __open(const char *pathname, uint64_t flags, uint64_t mode) { - uint64_t ret; - __asm__ __volatile__("movq $2, %%rax\n" - "syscall" - : "=a"(ret) - : "D"(pathname), "S"(flags), "d"(mode) - : "cc", "rcx", "r11", "memory"); - return ret; -} - -struct dirent { - unsigned long d_ino; /* Inode number */ - unsigned long d_off; /* Offset to next linux_dirent */ - unsigned short d_reclen; /* Length of this linux_dirent */ - char d_name[]; /* Filename (null-terminated) */ - /* length is actually (d_reclen - 2 - - offsetof(struct linux_dirent, d_name)) */ -}; - -long __getdents(unsigned int fd, dirent *dirp, size_t count) { - long ret; - __asm__ __volatile__("movq $78, %%rax\n" - "syscall" - : "=a"(ret) - : "D"(fd), "S"(dirp), "d"(count) - : "cc", "rcx", "r11", "memory"); - return ret; -} - -uint64_t __readlink(const char *pathname, char *buf, size_t bufsize) { - uint64_t ret; - __asm__ __volatile__("movq $89, %%rax\n" - "syscall" - : "=a"(ret) - : "D"(pathname), "S"(buf), "d"(bufsize) - : "cc", "rcx", "r11", "memory"); - return ret; -} - -uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) { - uint64_t ret; - __asm__ __volatile__("movq $8, %%rax\n" - "syscall\n" - : "=a"(ret) - : "D"(fd), "S"(pos), "d"(whence) - : "cc", "rcx", "r11", "memory"); - return ret; -} - -int __ftruncate(uint64_t fd, uint64_t length) { - int ret; - __asm__ __volatile__("movq $77, %%rax\n" - "syscall\n" - : "=a"(ret) - : "D"(fd), "S"(length) - : "cc", "rcx", "r11", "memory"); - return ret; -} - -int __close(uint64_t fd) { - uint64_t ret; - __asm__ __volatile__("movq $3, %%rax\n" - "syscall\n" - : "=a"(ret) - : "D"(fd) - : "cc", "rcx", "r11", "memory"); - return ret; -} - -int __madvise(void *addr, size_t length, int advice) { - int ret; - __asm__ __volatile__("movq $28, %%rax\n" - "syscall\n" - : "=a"(ret) - : "D"(addr), "S"(length), "d"(advice) - : "cc", "rcx", "r11", "memory"); - return ret; -} - -#define _UTSNAME_LENGTH 65 - -struct UtsNameTy { - char sysname[_UTSNAME_LENGTH]; 
/* Operating system name (e.g., "Linux") */ - char nodename[_UTSNAME_LENGTH]; /* Name within "some implementation-defined - network" */ - char release[_UTSNAME_LENGTH]; /* Operating system release (e.g., "2.6.28") */ - char version[_UTSNAME_LENGTH]; /* Operating system version */ - char machine[_UTSNAME_LENGTH]; /* Hardware identifier */ - char domainname[_UTSNAME_LENGTH]; /* NIS or YP domain name */ -}; - -int __uname(struct UtsNameTy *Buf) { - int Ret; - __asm__ __volatile__("movq $63, %%rax\n" - "syscall\n" - : "=a"(Ret) - : "D"(Buf) - : "cc", "rcx", "r11", "memory"); - return Ret; -} - -struct timespec { - uint64_t tv_sec; /* seconds */ - uint64_t tv_nsec; /* nanoseconds */ -}; - -uint64_t __nanosleep(const timespec *req, timespec *rem) { - uint64_t ret; - __asm__ __volatile__("movq $35, %%rax\n" - "syscall\n" - : "=a"(ret) - : "D"(req), "S"(rem) - : "cc", "rcx", "r11", "memory"); - return ret; -} - -int64_t __fork() { - uint64_t ret; - __asm__ __volatile__("movq $57, %%rax\n" - "syscall\n" - : "=a"(ret) - : - : "cc", "rcx", "r11", "memory"); - return ret; -} - -int __mprotect(void *addr, size_t len, int prot) { - int ret; - __asm__ __volatile__("movq $10, %%rax\n" - "syscall\n" - : "=a"(ret) - : "D"(addr), "S"(len), "d"(prot) - : "cc", "rcx", "r11", "memory"); - return ret; -} - -uint64_t __getppid() { - uint64_t ret; - __asm__ __volatile__("movq $110, %%rax\n" - "syscall\n" - : "=a"(ret) - : - : "cc", "rcx", "r11", "memory"); - return ret; -} - -int __setpgid(uint64_t pid, uint64_t pgid) { - int ret; - __asm__ __volatile__("movq $109, %%rax\n" - "syscall\n" - : "=a"(ret) - : "D"(pid), "S"(pgid) - : "cc", "rcx", "r11", "memory"); - return ret; -} - -uint64_t __getpgid(uint64_t pid) { - uint64_t ret; - __asm__ __volatile__("movq $121, %%rax\n" - "syscall\n" - : "=a"(ret) - : "D"(pid) - : "cc", "rcx", "r11", "memory"); - return ret; -} - -int __kill(uint64_t pid, int sig) { - int ret; - __asm__ __volatile__("movq $62, %%rax\n" - "syscall\n" - : "=a"(ret) - : 
"D"(pid), "S"(sig) - : "cc", "rcx", "r11", "memory"); - return ret; -} - -int __fsync(int fd) { - int ret; - __asm__ __volatile__("movq $74, %%rax\n" - "syscall\n" - : "=a"(ret) - : "D"(fd) - : "cc", "rcx", "r11", "memory"); - return ret; -} - -// %rdi %rsi %rdx %r10 %r8 -// sys_prctl int option unsigned unsigned unsigned unsigned -// long arg2 long arg3 long arg4 long arg5 -int __prctl(int Option, unsigned long Arg2, unsigned long Arg3, - unsigned long Arg4, unsigned long Arg5) { - int Ret; - register long rdx asm("rdx") = Arg3; - register long r8 asm("r8") = Arg5; - register long r10 asm("r10") = Arg4; - __asm__ __volatile__("movq $157, %%rax\n" - "syscall\n" - : "=a"(Ret) - : "D"(Option), "S"(Arg2), "d"(rdx), "r"(r10), "r"(r8) - :); - return Ret; -} - -#endif - void reportError(const char *Msg, uint64_t Size) { __write(2, Msg, Size); __exit(1); @@ -644,6 +289,12 @@ void assert(bool Assertion, const char *Msg) { reportError(Buf, Ptr - Buf); } +#define SIG_BLOCK 0 +#define SIG_UNBLOCK 1 +#define SIG_SETMASK 2 + +static const uint64_t MaskAllSignals[] = {-1ULL}; + class Mutex { volatile bool InUse{false}; diff --git a/bolt/runtime/instr.cpp b/bolt/runtime/instr.cpp index 96a43f685befac008c84d4c0c23e89ab76beecad..cfd113e805c500c26e23de010a426dac6414c8f0 100644 --- a/bolt/runtime/instr.cpp +++ b/bolt/runtime/instr.cpp @@ -40,7 +40,6 @@ // //===----------------------------------------------------------------------===// -#if defined (__x86_64__) #include "common.h" // Enables a very verbose logging to stderr useful when debugging @@ -695,12 +694,12 @@ static char *getBinaryPath() { assert(static_cast(FDdir) >= 0, "failed to open /proc/self/map_files"); - while (long Nread = __getdents(FDdir, (struct dirent *)Buf, BufSize)) { + while (long Nread = __getdents64(FDdir, (struct dirent64 *)Buf, BufSize)) { assert(static_cast(Nread) != -1, "failed to get folder entries"); - struct dirent *d; + struct dirent64 *d; for (long Bpos = 0; Bpos < Nread; Bpos += d->d_reclen) { - d = 
(struct dirent *)(Buf + Bpos); + d = (struct dirent64 *)(Buf + Bpos); uint64_t StartAddress, EndAddress; if (!parseAddressRange(d->d_name, StartAddress, EndAddress)) @@ -1668,6 +1667,17 @@ instrumentIndirectCall(uint64_t Target, uint64_t IndCallID) { /// as well as the target address for the call extern "C" __attribute((naked)) void __bolt_instr_indirect_call() { +#if defined(__aarch64__) + // clang-format off + __asm__ __volatile__(SAVE_ALL + "ldp x0, x1, [sp, #288]\n" + "bl instrumentIndirectCall\n" + RESTORE_ALL + "ret\n" + :::); + // clang-format on +#else + // clang-format off __asm__ __volatile__(SAVE_ALL "mov 0xa0(%%rsp), %%rdi\n" "mov 0x98(%%rsp), %%rsi\n" @@ -1675,10 +1685,23 @@ extern "C" __attribute((naked)) void __bolt_instr_indirect_call() RESTORE_ALL "ret\n" :::); + // clang-format on +#endif } extern "C" __attribute((naked)) void __bolt_instr_indirect_tailcall() { +#if defined(__aarch64__) + // clang-format off + __asm__ __volatile__(SAVE_ALL + "ldp x0, x1, [sp, #288]\n" + "bl instrumentIndirectCall\n" + RESTORE_ALL + "ret\n" + :::); + // clang-format on +#else + // clang-format off __asm__ __volatile__(SAVE_ALL "mov 0x98(%%rsp), %%rdi\n" "mov 0x90(%%rsp), %%rsi\n" @@ -1686,21 +1709,48 @@ extern "C" __attribute((naked)) void __bolt_instr_indirect_tailcall() RESTORE_ALL "ret\n" :::); + // clang-format on +#endif } /// This is hooking ELF's entry, it needs to save all machine state. 
extern "C" __attribute((naked)) void __bolt_instr_start()
 {
+#if defined(__aarch64__)
+  // clang-format off
+  __asm__ __volatile__(SAVE_ALL
+                       "bl __bolt_instr_setup\n"
+                       RESTORE_ALL
+                       "adrp x16, __bolt_start_trampoline\n" // x16 = page address of the symbol (PC-relative)
+                       "add x16, x16, #:lo12:__bolt_start_trampoline\n" // add the low 12 bits -> full address
+                       "br x16\n" // branch without link: x30 stays as RESTORE_ALL reloaded it
+                       :::);
+  // clang-format on
+#else
+  // clang-format off
   __asm__ __volatile__(SAVE_ALL
                        "call __bolt_instr_setup\n"
                        RESTORE_ALL
                        "jmp __bolt_start_trampoline\n"
                        :::);
+  // clang-format on
+#endif
 }
 
 /// This is hooking into ELF's DT_FINI
 extern "C" void __bolt_instr_fini() {
-  __bolt_fini_trampoline();
+#if defined(__aarch64__)
+  // clang-format off
+  __asm__ __volatile__(SAVE_ALL
+                       "adrp x16, __bolt_fini_trampoline\n" // x16 = page address of the symbol (PC-relative)
+                       "add x16, x16, #:lo12:__bolt_fini_trampoline\n" // add the low 12 bits -> full address
+                       "blr x16\n" // branch with link: rewrites x30, which SAVE_ALL stored and RESTORE_ALL reloads
+                       RESTORE_ALL
+                       :::);
+  // clang-format on
+#else
+  __asm__ __volatile__("call __bolt_fini_trampoline\n" :::); // NOTE(review): no clobbers declared for this call - confirm the trampoline preserves registers
+#endif
   if (__bolt_instr_sleep_time == 0) {
     int FD = openProfile();
     __bolt_instr_data_dump(FD);
@@ -1752,4 +1802,3 @@ void _bolt_instr_fini() {
 }
 
 #endif
-#endif
diff --git a/bolt/runtime/sys_aarch64.h b/bolt/runtime/sys_aarch64.h
new file mode 100644
index 0000000000000000000000000000000000000000..77c9cfcc99f9807ba975f747403514549cb076bd
--- /dev/null
+++ b/bolt/runtime/sys_aarch64.h
@@ -0,0 +1,394 @@
+#ifndef LLVM_TOOLS_LLVM_BOLT_SYS_AARCH64
+#define LLVM_TOOLS_LLVM_BOLT_SYS_AARCH64
+
+// Save all registers while keeping 16B stack alignment
+#define SAVE_ALL                                                               \
+  "stp x0, x1, [sp, #-16]!\n"                                                  \
+  "stp x2, x3, [sp, #-16]!\n"                                                  \
+  "stp x4, x5, [sp, #-16]!\n"                                                  \
+  "stp x6, x7, [sp, #-16]!\n"                                                  \
+  "stp x8, x9, [sp, #-16]!\n"                                                  \
+  "stp x10, x11, [sp, #-16]!\n"                                                \
+  "stp x12, x13, [sp, #-16]!\n"                                                \
+  "stp x14, x15, [sp, #-16]!\n"                                                \
+  "stp x16, x17, [sp, #-16]!\n"                                                \
+  "stp x18, x19, [sp, #-16]!\n"                                                \
+  "stp x20, x21, [sp, #-16]!\n"                                                \
+  "stp x22, x23, [sp, #-16]!\n"                                                \
+  "stp x24, x25, [sp, #-16]!\n"                                                \
+  "stp x26, x27, [sp, #-16]!\n"                                                \
+  "stp x28, x29, [sp, #-16]!\n"                                                \
+  "str x30, [sp,#-16]!\n" // x30 is stored alone but still takes a 16-byte slot
+// Mirrors SAVE_ALL
+#define RESTORE_ALL                                                            \
+  "ldr x30, [sp], #16\n"                                                       \
+  "ldp x28, x29, [sp], #16\n"                                                  \
+  "ldp x26, x27, [sp], #16\n"                                                  \
+  "ldp x24, x25, [sp], #16\n"                                                  \
+  "ldp x22, x23, [sp], #16\n"                                                  \
+  "ldp x20, x21, [sp], #16\n"                                                  \
+  "ldp x18, x19, [sp], #16\n"                                                  \
+  "ldp x16, x17, [sp], #16\n"                                                  \
+  "ldp x14, x15, [sp], #16\n"                                                  \
+  "ldp x12, x13, [sp], #16\n"                                                  \
+  "ldp x10, x11, [sp], #16\n"                                                  \
+  "ldp x8, x9, [sp], #16\n"                                                    \
+  "ldp x6, x7, [sp], #16\n"                                                    \
+  "ldp x4, x5, [sp], #16\n"                                                    \
+  "ldp x2, x3, [sp], #16\n"                                                    \
+  "ldp x0, x1, [sp], #16\n"
+
+// Anonymous namespace covering everything but our library entry point
+namespace {
+
+// Get the difference between runtime address of .text section and
+// static address in section header table. Can be extracted from arbitrary
+// pc value recorded at runtime to get the corresponding static address, which
+// in turn can be used to search for indirect call description. Needed because
+// indirect call descriptions are read-only non-relocatable data.
+uint64_t getTextBaseAddress() {
+  uint64_t DynAddr;
+  uint64_t StaticAddr;
+  __asm__ volatile("b .instr%=\n\t" // jump over the literal emitted below
+                   ".StaticAddr%=:\n\t"
+                   ".dword __hot_end\n\t" // 64-bit literal holding the symbol's address
+                   ".instr%=:\n\t"
+                   "ldr %0, .StaticAddr%=\n\t" // StaticAddr = value of that literal
+                   "adrp %1, __hot_end\n\t" // DynAddr = PC-relative address of the same symbol
+                   "add %1, %1, :lo12:__hot_end\n\t"
+                   : "=r"(StaticAddr), "=r"(DynAddr));
+  return DynAddr - StaticAddr;
+}
+
+uint64_t __read(uint64_t fd, const void *buf, uint64_t count) {
+  uint64_t ret;
+  register uint64_t x0 __asm__("x0") = fd;
+  register const void *x1 __asm__("x1") = buf;
+  register uint64_t x2 __asm__("x2") = count;
+  register uint32_t w8 __asm__("w8") = 63; // read
+  __asm__ __volatile__("svc #0\n"
+                       "mov %0, x0" // syscall result comes back in x0
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(x2), "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+uint64_t __write(uint64_t fd, const void *buf, uint64_t count) {
+  uint64_t ret;
+  register uint64_t x0 __asm__("x0") = fd;
+  register const void *x1 __asm__("x1") = buf;
+  register uint64_t x2 __asm__("x2") = count;
+  register uint32_t w8 __asm__("w8") = 64; // write
+  __asm__ __volatile__("svc #0\n"
+                       "mov %0, x0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(x2), "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+void *__mmap(uint64_t addr, uint64_t size, uint64_t prot, uint64_t flags,
+             uint64_t fd, uint64_t offset) {
+  void *ret;
+  register uint64_t x0 __asm__("x0") = addr;
+  register uint64_t x1 __asm__("x1") = size;
+  register uint64_t x2 __asm__("x2") = prot;
+  register uint64_t x3 __asm__("x3") = flags;
+  register uint64_t x4 __asm__("x4") = fd;
+  register uint64_t x5 __asm__("x5") = offset;
+  register uint32_t w8 __asm__("w8") = 222; // mmap
+  __asm__ __volatile__("svc #0\n"
+                       "mov %0, x0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(x2), "r"(x3), "r"(x4), "r"(x5), "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+uint64_t __munmap(void *addr, uint64_t size) {
+  uint64_t ret;
+  register void *x0 __asm__("x0") = addr;
+  register uint64_t x1 __asm__("x1") = size;
+  register uint32_t w8 __asm__("w8") = 215; // munmap
+  __asm__ __volatile__("svc #0\n"
+                       "mov %0, x0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+uint64_t __exit(uint64_t code) {
+  uint64_t ret;
+  register uint64_t x0 __asm__("x0") = code;
+  register uint32_t w8 __asm__("w8") = 94; // exit_group
+  __asm__ __volatile__("svc #0\n"
+                       "mov %0, x0"
+                       : "=r"(ret), "+r"(x0)
+                       : "r"(w8)
+                       : "cc", "memory", "x1");
+  return ret;
+}
+
+uint64_t __open(const char *pathname, uint64_t flags, uint64_t mode) {
+  uint64_t ret;
+  register int x0 __asm__("x0") = -100; // AT_FDCWD: resolve pathname relative to the cwd
+  register const char *x1 __asm__("x1") = pathname;
+  register uint64_t x2 __asm__("x2") = flags;
+  register uint64_t x3 __asm__("x3") = mode;
+  register uint32_t w8 __asm__("w8") = 56; // openat
+  __asm__ __volatile__("svc #0\n"
+                       "mov %0, x0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(x2), "r"(x3), "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+long __getdents64(unsigned int fd, dirent64 *dirp, size_t count) {
+  long ret;
+  register unsigned int x0 __asm__("x0") = fd;
+  register dirent64 *x1 __asm__("x1") = dirp;
+  register size_t x2 __asm__("x2") = count;
+  register uint32_t w8 __asm__("w8") = 61; // getdents64
+  __asm__ __volatile__("svc #0\n"
+                       "mov %0, x0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(x2), "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+uint64_t __readlink(const char *pathname, char *buf, size_t bufsize) {
+  uint64_t ret;
+  register int x0 __asm__("x0") = -100; // AT_FDCWD: resolve pathname relative to the cwd
+  register const char *x1 __asm__("x1") = pathname;
+  register char *x2 __asm__("x2") = buf;
+  register size_t x3 __asm__("x3") = bufsize;
+  register uint32_t w8 __asm__("w8") = 78; // readlinkat
+  __asm__ __volatile__("svc #0\n"
+                       "mov %0, x0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(x2), "r"(x3), "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) {
+  uint64_t ret;
+  register uint64_t x0 __asm__("x0") = fd;
+  register uint64_t x1 __asm__("x1") = pos;
+  register uint64_t x2 __asm__("x2") = whence;
+  register uint32_t w8 __asm__("w8") = 62; // lseek
+  __asm__ __volatile__("svc #0\n"
+                       "mov %0, x0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(x2), "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+int __ftruncate(uint64_t fd, uint64_t length) {
+  int ret;
+  register uint64_t x0 __asm__("x0") = fd;
+  register uint64_t x1 __asm__("x1") = length;
+  register uint32_t w8 __asm__("w8") = 46; // ftruncate
+  __asm__ __volatile__("svc #0\n"
+                       "mov %w0, w0" // 32-bit move: ret is an int
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+int __close(uint64_t fd) {
+  int ret;
+  register uint64_t x0 __asm__("x0") = fd;
+  register uint32_t w8 __asm__("w8") = 57; // close
+  __asm__ __volatile__("svc #0\n"
+                       "mov %w0, w0"
+                       : "=r"(ret), "+r"(x0)
+                       : "r"(w8)
+                       : "cc", "memory", "x1");
+  return ret;
+}
+
+int __madvise(void *addr, size_t length, int advice) {
+  int ret;
+  register void *x0 __asm__("x0") = addr;
+  register size_t x1 __asm__("x1") = length;
+  register int x2 __asm__("x2") = advice;
+  register uint32_t w8 __asm__("w8") = 233; // madvise
+  __asm__ __volatile__("svc #0\n"
+                       "mov %w0, w0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(x2), "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+int __uname(struct UtsNameTy *buf) {
+  int ret;
+  register UtsNameTy *x0 __asm__("x0") = buf;
+  register uint32_t w8 __asm__("w8") = 160; // uname
+  __asm__ __volatile__("svc #0\n"
+                       "mov %w0, w0"
+                       : "=r"(ret), "+r"(x0)
+                       : "r"(w8)
+                       : "cc", "memory", "x1");
+  return ret;
+}
+
+uint64_t __nanosleep(const timespec *req, timespec *rem) {
+  uint64_t ret;
+  register const timespec *x0 __asm__("x0") = req;
+  register timespec *x1 __asm__("x1") = rem;
+  register uint32_t w8 __asm__("w8") = 101; // nanosleep
+  __asm__ __volatile__("svc #0\n"
+                       "mov %0, x0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+int64_t __fork() {
+  uint64_t ret;
+  // clone instead of fork with flags
+  // "CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD"
+  register uint64_t x0 __asm__("x0") = 0x1200011;
+  register uint64_t x1 __asm__("x1") = 0;
+  register uint64_t x2 __asm__("x2") = 0;
+  register uint64_t x3 __asm__("x3") = 0;
+  register uint64_t x4 __asm__("x4") = 0;
+  register uint32_t w8 __asm__("w8") = 220; // clone
+  __asm__ __volatile__("svc #0\n"
+                       "mov %0, x0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(x2), "r"(x3), "r"(x4), "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+int __mprotect(void *addr, size_t len, int prot) {
+  int ret;
+  register void *x0 __asm__("x0") = addr;
+  register size_t x1 __asm__("x1") = len;
+  register int x2 __asm__("x2") = prot;
+  register uint32_t w8 __asm__("w8") = 226; // mprotect
+  __asm__ __volatile__("svc #0\n"
+                       "mov %w0, w0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(x2), "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+uint64_t __getpid() {
+  uint64_t ret;
+  register uint32_t w8 __asm__("w8") = 172; // getpid
+  __asm__ __volatile__("svc #0\n"
+                       "mov %0, x0"
+                       : "=r"(ret)
+                       : "r"(w8)
+                       : "cc", "memory", "x0", "x1");
+  return ret;
+}
+
+uint64_t __getppid() {
+  uint64_t ret;
+  register uint32_t w8 __asm__("w8") = 173; // getppid
+  __asm__ __volatile__("svc #0\n"
+                       "mov %0, x0"
+                       : "=r"(ret)
+                       : "r"(w8)
+                       : "cc", "memory", "x0", "x1");
+  return ret;
+}
+
+int __setpgid(uint64_t pid, uint64_t pgid) {
+  int ret;
+  register uint64_t x0 __asm__("x0") = pid;
+  register uint64_t x1 __asm__("x1") = pgid;
+  register uint32_t w8 __asm__("w8") = 154; // setpgid
+  __asm__ __volatile__("svc #0\n"
+                       "mov %w0, w0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+uint64_t __getpgid(uint64_t pid) {
+  uint64_t ret;
+  register uint64_t x0 __asm__("x0") = pid;
+  register uint32_t w8 __asm__("w8") = 155; // getpgid
+  __asm__ __volatile__("svc #0\n"
+                       "mov %0, x0"
+                       : "=r"(ret), "+r"(x0)
+                       : "r"(w8)
+                       : "cc", "memory", "x1");
+  return ret;
+}
+
+int __kill(uint64_t pid, int sig) {
+  int ret;
+  register uint64_t x0 __asm__("x0") = pid;
+  register int x1 __asm__("x1") = sig;
+  register uint32_t w8 __asm__("w8") = 129; // kill
+  __asm__ __volatile__("svc #0\n"
+                       "mov %w0, w0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+int __fsync(int fd) {
+  int ret;
+  register int x0 __asm__("x0") = fd;
+  register uint32_t w8 __asm__("w8") = 82; // fsync
+  __asm__ __volatile__("svc #0\n"
+                       "mov %w0, w0"
+                       : "=r"(ret), "+r"(x0)
+                       : "r"(w8)
+                       : "cc", "memory", "x1");
+  return ret;
+}
+
+uint64_t __sigprocmask(int how, const void *set, void *oldset) {
+  uint64_t ret;
+  register int x0 __asm__("x0") = how;
+  register const void *x1 __asm__("x1") = set;
+  register void *x2 __asm__("x2") = oldset;
+  register long x3 asm("x3") = 8; // fourth argument (sigsetsize): 8 bytes
+  register uint32_t w8 __asm__("w8") = 135; // rt_sigprocmask
+  __asm__ __volatile__("svc #0\n"
+                       "mov %0, x0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(x2), "r"(x3), "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+int __prctl(int option, unsigned long arg2, unsigned long arg3,
+            unsigned long arg4, unsigned long arg5) {
+  int ret;
+  register int x0 __asm__("x0") = option;
+  register unsigned long x1 __asm__("x1") = arg2;
+  register unsigned long x2 __asm__("x2") = arg3;
+  register unsigned long x3 __asm__("x3") = arg4;
+  register unsigned long x4 __asm__("x4") = arg5;
+  register uint32_t w8 __asm__("w8") = 167; // prctl
+  __asm__ __volatile__("svc #0\n"
+                       "mov %w0, w0"
+                       : "=r"(ret), "+r"(x0), "+r"(x1)
+                       : "r"(x2), "r"(x3), "r"(x4), "r"(w8)
+                       : "cc", "memory");
+  return ret;
+}
+
+} // anonymous namespace
+
+#endif
diff --git a/bolt/runtime/sys_x86_64.h b/bolt/runtime/sys_x86_64.h
new file mode 100644
index 0000000000000000000000000000000000000000..ca2c69326a14f3cb1131cf6b3bcda4458a98aa45
--- /dev/null
+++ b/bolt/runtime/sys_x86_64.h
@@ -0,0 +1,360 @@
+#ifndef LLVM_TOOLS_LLVM_BOLT_SYS_X86_64
+#define LLVM_TOOLS_LLVM_BOLT_SYS_X86_64
+
+// Save all registers while keeping 16B stack alignment
+#define SAVE_ALL                                                               \
+  "push %%rax\n"                                                               \
+  "push %%rbx\n"                                                               \
+  "push %%rcx\n"                                                               \
+  "push %%rdx\n"                                                               \
+  "push %%rdi\n"                                                               \
+  "push %%rsi\n"                                                               \
+  "push %%rbp\n"                                                               \
+  "push %%r8\n"                                                                \
+  "push %%r9\n"                                                                \
+  "push %%r10\n"                                                               \
+  "push %%r11\n"                                                               \
+  "push %%r12\n"                                                               \
+  "push %%r13\n"                                                               \
+  "push %%r14\n"                                                               \
+  "push %%r15\n"                                                               \
+  "sub $8, %%rsp\n" // 15 pushes = 120 bytes; 8 more makes the total 128
+// Mirrors SAVE_ALL
+#define RESTORE_ALL                                                            \
+  "add $8, %%rsp\n"                                                            \
+  "pop %%r15\n"                                                                \
+  "pop %%r14\n"                                                                \
+  "pop %%r13\n"                                                                \
+  "pop %%r12\n"                                                                \
+  "pop %%r11\n"                                                                \
+  "pop %%r10\n"                                                                \
+  "pop %%r9\n"                                                                 \
+  "pop %%r8\n"                                                                 \
+  "pop %%rbp\n"                                                                \
+  "pop %%rsi\n"                                                                \
+  "pop %%rdi\n"                                                                \
+  "pop %%rdx\n"                                                                \
+  "pop %%rcx\n"                                                                \
+  "pop %%rbx\n"                                                                \
+  "pop %%rax\n"
+
+namespace {
+
+// Get the difference between runtime address of .text section and
+// static address in section header table. Can be extracted from arbitrary
+// pc value recorded at runtime to get the corresponding static address, which
+// in turn can be used to search for indirect call description. Needed because
+// indirect call descriptions are read-only non-relocatable data.
+uint64_t getTextBaseAddress() { + uint64_t DynAddr; + uint64_t StaticAddr; + __asm__ volatile("leaq __hot_end(%%rip), %0\n\t" + "movabsq $__hot_end, %1\n\t" + : "=r"(DynAddr), "=r"(StaticAddr)); + return DynAddr - StaticAddr; +} + +#define _STRINGIFY(x) #x +#define STRINGIFY(x) _STRINGIFY(x) + +uint64_t __read(uint64_t fd, const void *buf, uint64_t count) { + uint64_t ret; +#if defined(__APPLE__) +#define READ_SYSCALL 0x2000003 +#else +#define READ_SYSCALL 0 +#endif + __asm__ __volatile__("movq $" STRINGIFY(READ_SYSCALL) ", %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(fd), "S"(buf), "d"(count) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +uint64_t __write(uint64_t fd, const void *buf, uint64_t count) { + uint64_t ret; +#if defined(__APPLE__) +#define WRITE_SYSCALL 0x2000004 +#else +#define WRITE_SYSCALL 1 +#endif + __asm__ __volatile__("movq $" STRINGIFY(WRITE_SYSCALL) ", %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(fd), "S"(buf), "d"(count) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +void *__mmap(uint64_t addr, uint64_t size, uint64_t prot, uint64_t flags, + uint64_t fd, uint64_t offset) { +#if defined(__APPLE__) +#define MMAP_SYSCALL 0x20000c5 +#else +#define MMAP_SYSCALL 9 +#endif + void *ret; + register uint64_t r8 asm("r8") = fd; + register uint64_t r9 asm("r9") = offset; + register uint64_t r10 asm("r10") = flags; + __asm__ __volatile__("movq $" STRINGIFY(MMAP_SYSCALL) ", %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(addr), "S"(size), "d"(prot), "r"(r10), "r"(r8), + "r"(r9) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +uint64_t __munmap(void *addr, uint64_t size) { +#if defined(__APPLE__) +#define MUNMAP_SYSCALL 0x2000049 +#else +#define MUNMAP_SYSCALL 11 +#endif + uint64_t ret; + __asm__ __volatile__("movq $" STRINGIFY(MUNMAP_SYSCALL) ", %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(addr), "S"(size) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +uint64_t __sigprocmask(int how, const void *set, void *oldset) { +#if 
defined(__APPLE__) +#define SIGPROCMASK_SYSCALL 0x2000030 +#else +#define SIGPROCMASK_SYSCALL 14 +#endif + uint64_t ret; + register long r10 asm("r10") = sizeof(uint64_t); + __asm__ __volatile__("movq $" STRINGIFY(SIGPROCMASK_SYSCALL) ", %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(how), "S"(set), "d"(oldset), "r"(r10) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +uint64_t __getpid() { + uint64_t ret; +#if defined(__APPLE__) +#define GETPID_SYSCALL 20 +#else +#define GETPID_SYSCALL 39 +#endif + __asm__ __volatile__("movq $" STRINGIFY(GETPID_SYSCALL) ", %%rax\n" + "syscall\n" + : "=a"(ret) + : + : "cc", "rcx", "r11", "memory"); + return ret; +} + +uint64_t __exit(uint64_t code) { +#if defined(__APPLE__) +#define EXIT_SYSCALL 0x2000001 +#else +#define EXIT_SYSCALL 231 +#endif + uint64_t ret; + __asm__ __volatile__("movq $" STRINGIFY(EXIT_SYSCALL) ", %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(code) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +#if !defined(__APPLE__) +// We use a stack-allocated buffer for string manipulation in many pieces of +// this code, including the code that prints each line of the fdata file. This +// buffer needs to accomodate large function names, but shouldn't be arbitrarily +// large (dynamically allocated) for simplicity of our memory space usage. + +// Declare some syscall wrappers we use throughout this code to avoid linking +// against system libc. 
+uint64_t __open(const char *pathname, uint64_t flags, uint64_t mode) { + uint64_t ret; + __asm__ __volatile__("movq $2, %%rax\n" + "syscall" + : "=a"(ret) + : "D"(pathname), "S"(flags), "d"(mode) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +long __getdents64(unsigned int fd, dirent64 *dirp, size_t count) { + long ret; + __asm__ __volatile__("movq $217, %%rax\n" + "syscall" + : "=a"(ret) + : "D"(fd), "S"(dirp), "d"(count) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +uint64_t __readlink(const char *pathname, char *buf, size_t bufsize) { + uint64_t ret; + __asm__ __volatile__("movq $89, %%rax\n" + "syscall" + : "=a"(ret) + : "D"(pathname), "S"(buf), "d"(bufsize) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) { + uint64_t ret; + __asm__ __volatile__("movq $8, %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(fd), "S"(pos), "d"(whence) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +int __ftruncate(uint64_t fd, uint64_t length) { + int ret; + __asm__ __volatile__("movq $77, %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(fd), "S"(length) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +int __close(uint64_t fd) { + uint64_t ret; + __asm__ __volatile__("movq $3, %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(fd) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +int __madvise(void *addr, size_t length, int advice) { + int ret; + __asm__ __volatile__("movq $28, %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(addr), "S"(length), "d"(advice) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +int __uname(struct UtsNameTy *Buf) { + int Ret; + __asm__ __volatile__("movq $63, %%rax\n" + "syscall\n" + : "=a"(Ret) + : "D"(Buf) + : "cc", "rcx", "r11", "memory"); + return Ret; +} + +uint64_t __nanosleep(const timespec *req, timespec *rem) { + uint64_t ret; + __asm__ __volatile__("movq $35, %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(req), "S"(rem) + : "cc", "rcx", "r11", "memory"); + 
return ret; +} + +int64_t __fork() { + uint64_t ret; + __asm__ __volatile__("movq $57, %%rax\n" + "syscall\n" + : "=a"(ret) + : + : "cc", "rcx", "r11", "memory"); + return ret; +} + +int __mprotect(void *addr, size_t len, int prot) { + int ret; + __asm__ __volatile__("movq $10, %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(addr), "S"(len), "d"(prot) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +uint64_t __getppid() { + uint64_t ret; + __asm__ __volatile__("movq $110, %%rax\n" + "syscall\n" + : "=a"(ret) + : + : "cc", "rcx", "r11", "memory"); + return ret; +} + +int __setpgid(uint64_t pid, uint64_t pgid) { + int ret; + __asm__ __volatile__("movq $109, %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(pid), "S"(pgid) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +uint64_t __getpgid(uint64_t pid) { + uint64_t ret; + __asm__ __volatile__("movq $121, %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(pid) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +int __kill(uint64_t pid, int sig) { + int ret; + __asm__ __volatile__("movq $62, %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(pid), "S"(sig) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +int __fsync(int fd) { + int ret; + __asm__ __volatile__("movq $74, %%rax\n" + "syscall\n" + : "=a"(ret) + : "D"(fd) + : "cc", "rcx", "r11", "memory"); + return ret; +} + +// %rdi %rsi %rdx %r10 %r8 +// sys_prctl int option unsigned unsigned unsigned unsigned +// long arg2 long arg3 long arg4 long arg5 +int __prctl(int Option, unsigned long Arg2, unsigned long Arg3, + unsigned long Arg4, unsigned long Arg5) { + int Ret; + register long rdx asm("rdx") = Arg3; + register long r8 asm("r8") = Arg5; + register long r10 asm("r10") = Arg4; + __asm__ __volatile__("movq $157, %%rax\n" + "syscall\n" + : "=a"(Ret) + : "D"(Option), "S"(Arg2), "d"(rdx), "r"(r10), "r"(r8) + :); + return Ret; +} + +#endif + +} // anonymous namespace + +#endif