diff --git a/llvm/include/llvm/Transforms/Scalar/PGOPrefetch.h b/llvm/include/llvm/Transforms/Scalar/PGOPrefetch.h new file mode 100644 index 0000000000000000000000000000000000000000..0f79b0c684e022dda37fa276baf974de7b72bbb4 --- /dev/null +++ b/llvm/include/llvm/Transforms/Scalar/PGOPrefetch.h @@ -0,0 +1,32 @@ +//===------- PGOPrefetch.h - Profile-Guided Prefetching Pass ----*- C++ -*-===// +// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file provides the interface for the Profile-Guided Prefetching Pass. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_SCALAR_PGOPREFETCH_H +#define LLVM_TRANSFORMS_SCALAR_PGOPREFETCH_H + +#include "llvm/IR/PassManager.h" +#include "llvm/Support/CommandLine.h" + +namespace llvm { + +extern cl::opt EnablePGOPrefetch; + +class PGOPrefetchPass : public PassInfoMixin { +public: + PGOPrefetchPass() = default; + + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_SCALAR_PGOPREFETCH_H diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 2ae9bb8f8d658be12381941655d167c74e01d34f..80cb688b1f21dab5c9cdd52a5a4f54b970ca9de5 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -208,6 +208,7 @@ #include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h" #include "llvm/Transforms/Scalar/NaryReassociate.h" #include "llvm/Transforms/Scalar/NewGVN.h" +#include "llvm/Transforms/Scalar/PGOPrefetch.h" #include "llvm/Transforms/Scalar/PartiallyInlineLibCalls.h" #include "llvm/Transforms/Scalar/PlaceSafepoints.h" #include "llvm/Transforms/Scalar/Reassociate.h" diff --git 
a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index f48c7ac57248c323a5fe50cf0ebe35e7cbe19ad3..c000bf0e0c7335c07ce34d77ea7ef2154190d1a4 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -112,6 +112,7 @@ #include "llvm/Transforms/Scalar/MemCpyOptimizer.h" #include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h" #include "llvm/Transforms/Scalar/NewGVN.h" +#include "llvm/Transforms/Scalar/PGOPrefetch.h" #include "llvm/Transforms/Scalar/Reassociate.h" #include "llvm/Transforms/Scalar/SCCP.h" #include "llvm/Transforms/Scalar/SROA.h" @@ -1281,6 +1282,8 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, MPM.addPass(GlobalOptPass()); MPM.addPass(GlobalDCEPass()); + if (EnablePGOPrefetch) + MPM.addPass(createModuleToFunctionPassAdaptor(PGOPrefetchPass())); return MPM; } @@ -1930,6 +1933,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, MPM.addPass(createModuleToFunctionPassAdaptor(std::move(PeepholeFPM), PTO.EagerlyInvalidateAnalyses)); + if (EnablePGOPrefetch) + MPM.addPass(createModuleToFunctionPassAdaptor(PGOPrefetchPass())); + // Note: historically, the PruneEH pass was run first to deduce nounwind and // generally clean up exception handling overhead. 
It isn't clear this is // valuable as the inliner doesn't currently care whether it is inlining an diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index f6df0fcb7f803fb6b9f66a8e8d499e33246ae1fe..94b2af6e281b89ed1378b1fadb7fbc13183c64f0 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -394,6 +394,7 @@ FUNCTION_PASS("objc-arc-contract", ObjCARCContractPass()) FUNCTION_PASS("objc-arc-expand", ObjCARCExpandPass()) FUNCTION_PASS("pa-eval", PAEvalPass()) FUNCTION_PASS("pgo-memop-opt", PGOMemOPSizeOpt()) +FUNCTION_PASS("pgo-prefetch", PGOPrefetchPass()) FUNCTION_PASS("place-safepoints", PlaceSafepointsPass()) FUNCTION_PASS("print", PrintFunctionPass(dbgs())) FUNCTION_PASS("print", AssumptionPrinterPass(dbgs())) diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt index e5a82ea8f923ff5527626d1ff900ebbc565f5b29..fb31faa1c185f1ac3073fbd60a9d3533a8694126 100644 --- a/llvm/lib/Transforms/Scalar/CMakeLists.txt +++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt @@ -59,6 +59,7 @@ add_llvm_component_library(LLVMScalarOpts MergedLoadStoreMotion.cpp NaryReassociate.cpp NewGVN.cpp + PGOPrefetch.cpp PartiallyInlineLibCalls.cpp PlaceSafepoints.cpp Reassociate.cpp diff --git a/llvm/lib/Transforms/Scalar/PGOPrefetch.cpp b/llvm/lib/Transforms/Scalar/PGOPrefetch.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5410f532f7fc13bf1347aefc33fa123b9c10117f --- /dev/null +++ b/llvm/lib/Transforms/Scalar/PGOPrefetch.cpp @@ -0,0 +1,149 @@ +//===-------- PGOPrefetch.cpp - Profile-Guided Prefetching Pass -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a Profile-Guided Prefetching Pass. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/PGOPrefetch.h" + +#include "llvm/IR/Constants.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/ProfileData/SampleProf.h" +#include "llvm/ProfileData/SampleProfReader.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/VirtualFileSystem.h" + +#define DEBUG_TYPE "pgo-prefetch" + +using namespace llvm; + +namespace llvm { + +cl::opt EnablePGOPrefetch("enable-pgo-prefetch", cl::init(false), + cl::ReallyHidden, cl::desc("Enable PGO Prefetch Pass")); + +static cl::opt PrefetchHintsFile("pgo-prefetch-hints-file", + cl::desc("Path to the prefetch hints profile"), cl::Hidden, + cl::callback([](const std::string &Path) { + if (!Path.empty()) + EnablePGOPrefetch = true; + })); + +class PGOPrefetch { +public: + PGOPrefetch(LLVMContext &Ctx, const std::string &FilePath); + bool run(Function *F); + +private: + std::unique_ptr Reader; + + void readAfdoFile(LLVMContext &Ctx, const std::string &FilePath); + bool doByteOffsetPrefetch(LoadInst *I, uint64_t PrefetchDistance); +}; + +void PGOPrefetch::readAfdoFile(LLVMContext &Ctx, const std::string &FilePath) { + auto FS = vfs::getRealFileSystem(); + auto ReaderOrErr = SampleProfileReader::create(FilePath, Ctx, *FS); + if (ReaderOrErr) + Reader = std::move(ReaderOrErr.get()); + + // A profile that opens but fails to parse is as unusable as a missing one. + if (!Reader || Reader->read()) { + errs() << "Could not open sample profile " << FilePath << "\n"; + Reader = nullptr; + } +} + +PGOPrefetch::PGOPrefetch(LLVMContext &Ctx, const std::string &FilePath) { + readAfdoFile(Ctx, FilePath); +} + +bool PGOPrefetch::run(Function *F) { + if (!Reader || 
F->isDeclaration()) + return false; + + FunctionSamples *Samples = Reader->getSamplesFor(F->getName()); + if (!Samples) + return false; + + LLVM_DEBUG(dbgs() << "Found samples for function " << F->getName() << '\n'); + + bool Changed = false; + for (auto It = inst_begin(F); It != inst_end(F); ++It) { + Instruction &I = *It; + + LoadInst *LI = dyn_cast(&I); + if (!LI) + continue; + + const DILocation *DIL = I.getDebugLoc().get(); + if (!DIL) + continue; + + uint32_t Off = FunctionSamples::getOffset(DIL); + uint32_t Dis = DIL->getBaseDiscriminator(); + auto *FunctionSamples = Samples->findFunctionSamples(DIL); + if (!FunctionSamples) + continue; + + LLVM_DEBUG(dbgs() << "Instruction: " << *LI << '\n'); + LLVM_DEBUG(dbgs() << "Off: " << Off << ", Dis: " << Dis << '\n'); + + auto PrefetcherType = FunctionSamples->findCallTargetMapAt(Off, Dis); + if (!PrefetcherType) + continue; + + for (auto &KV : PrefetcherType.get()) { + auto Type = KV.first(); + if (Type == "__load") + Changed |= doByteOffsetPrefetch(LI, KV.second); + else + errs() << "Unsupported prefetcher type: " << Type << '\n'; + } + } + return Changed; +} + +bool PGOPrefetch::doByteOffsetPrefetch(LoadInst *I, uint64_t PrefetchDistance) { + LLVM_DEBUG(dbgs() << "Prefetch distance = " << PrefetchDistance + << " for load instruction: " << *I << '\n'); + + Value *PtrValue = I->getPointerOperand(); + IRBuilder<> Builder(I); + + Type *I8 = Type::getInt8Ty(I->getContext()); + Type *I32 = Type::getInt32Ty(I->getContext()); + Type *I64 = Type::getInt64Ty(I->getContext()); + + PointerType *I8PtrTy = PointerType::get(I8, + PtrValue->getType()->getPointerAddressSpace()); + Value *LoadPtrI8 = Builder.CreateBitCast(PtrValue, I8PtrTy, "load_ptr_i8"); + Value *PrefetchPtr = Builder.CreateGEP(I8, LoadPtrI8, + ConstantInt::get(I64, PrefetchDistance), "prefetch_ptr"); + // Overload llvm.prefetch on the type of the pointer actually passed to it. + Function *PrefetchFunc = Intrinsic::getDeclaration( + I->getModule(), Intrinsic::prefetch, PrefetchPtr->getType()); + Builder.CreateCall(PrefetchFunc, 
{PrefetchPtr, ConstantInt::get(I32, 0), + ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)}); + + return true; +} + +PreservedAnalyses PGOPrefetchPass::run(Function &F, + FunctionAnalysisManager &AM) { + PGOPrefetch Prefetcher(F.getContext(), PrefetchHintsFile); + bool Changed = Prefetcher.run(&F); + return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); +} + +} // namespace llvm + diff --git a/llvm/lib/Transforms/Utils/AddDiscriminators.cpp b/llvm/lib/Transforms/Utils/AddDiscriminators.cpp index 7d127400651ef4b757d9eac54b12d5e06c5d1b8a..a502fcdedc9d9970ffb21a4c0ea3093b035abef1 100644 --- a/llvm/lib/Transforms/Utils/AddDiscriminators.cpp +++ b/llvm/lib/Transforms/Utils/AddDiscriminators.cpp @@ -85,6 +85,12 @@ static cl::opt NoDiscriminators( "no-discriminators", cl::init(false), cl::desc("Disable generation of discriminator information.")); +// Command line option to toggle discriminator generation for memory +// operations. Used to keep upstream tests happy. +static cl::opt MemOpDiscriminators( + "discriminate-memops", cl::init(true), + cl::desc("Generate unique debug info for each load/store instruction.")); + static bool shouldHaveDiscriminator(const Instruction *I) { return !isa(I) || isa(I); } @@ -210,12 +216,14 @@ static bool addDiscriminators(Function &F) { // a same source line for correct profile annotation. for (BasicBlock &B : F) { LocationSet CallLocations; - for (auto &I : B) { + LocationSet MemoryOpLocations; + for (auto &I : B) { // We bypass intrinsic calls for the following two reasons: // 1) We want to avoid a non-deterministic assignment of // discriminators. // 2) We want to minimize the number of base discriminators used. 
- if (!isa(I) && (!isa(I) || isa(I))) + if (!isa(I) && (!isa(I) || isa(I)) && + (!MemOpDiscriminators || (!isa(I) && !isa(I)))) continue; DILocation *CurrentDIL = I.getDebugLoc(); @@ -223,7 +231,10 @@ static bool addDiscriminators(Function &F) { continue; Location L = std::make_pair(CurrentDIL->getFilename(), CurrentDIL->getLine()); - if (!CallLocations.insert(L).second) { + if (((isa(I) || isa(I)) && + !CallLocations.insert(L).second) || + (MemOpDiscriminators && (isa(I) || isa(I)) && + !MemoryOpLocations.insert(L).second)) { unsigned Discriminator = ++LDM[L]; auto NewDIL = CurrentDIL->cloneWithBaseDiscriminator(Discriminator); if (!NewDIL) { diff --git a/llvm/test/Transforms/AddDiscriminators/basic.ll b/llvm/test/Transforms/AddDiscriminators/basic.ll index 518653715c83784a24469317ff0dff357179a840..bdf8b2948e04854f3392eed5746e208f81b3202a 100644 --- a/llvm/test/Transforms/AddDiscriminators/basic.ll +++ b/llvm/test/Transforms/AddDiscriminators/basic.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -passes=add-discriminators -S | FileCheck %s +; RUN: opt < %s -passes=add-discriminators -discriminate-memops=0 -S | FileCheck %s ; Basic DWARF discriminator test. 
All the instructions in block ; 'if.then' should have a different discriminator value than diff --git a/llvm/test/Transforms/AddDiscriminators/first-only.ll b/llvm/test/Transforms/AddDiscriminators/first-only.ll index 7ae9ed0a0116a3eb602c9ef3d6822449362ddb9f..40d6a0bd5c9632a7565ccdc1583aa6555726f8fc 100644 --- a/llvm/test/Transforms/AddDiscriminators/first-only.ll +++ b/llvm/test/Transforms/AddDiscriminators/first-only.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -passes=add-discriminators -S | FileCheck %s +; RUN: opt < %s -passes=add-discriminators -discriminate-memops=0 -S | FileCheck %s ; Test that the only instructions that receive a new discriminator in ; the block 'if.then' are those that share the same line number as diff --git a/llvm/test/Transforms/AddDiscriminators/memops.ll b/llvm/test/Transforms/AddDiscriminators/memops.ll new file mode 100644 index 0000000000000000000000000000000000000000..12a895af8e6f5503e91749e2339bc4c1561e32b8 --- /dev/null +++ b/llvm/test/Transforms/AddDiscriminators/memops.ll @@ -0,0 +1,51 @@ +; RUN: opt < %s -passes=add-discriminators -S | FileCheck %s + +; Check that -discriminate-memops is set by default. +; Check that discriminators are added for loads/stores on the same line: +; +; #1 int A, B, C; +; #2 +; #3 void foo(int cond) { +; #4 C /* discriminator 4 */ = cond ? 
A /* discriminator 0 */: B /* discriminator 2 */; +; #5 } + +@A = dso_local local_unnamed_addr global i32 0, align 4 +@B = dso_local local_unnamed_addr global i32 0, align 4 +@C = dso_local local_unnamed_addr global i32 0, align 4 + +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn uwtable +define dso_local void @foo(i32 noundef %0) local_unnamed_addr #0 !dbg !9 { + %2 = icmp eq i32 %0, 0, !dbg !12 + %3 = load i32, ptr @A, align 4, !dbg !12 + %4 = load i32, ptr @B, align 4, !dbg !12 + %5 = select i1 %2, i32 %4, i32 %3, !dbg !12 + store i32 %5, ptr @C, align 4, !dbg !13, !tbaa !14 + ret void, !dbg !18 +} + +attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+87" "tune-cpu"="generic" } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5, !6, !7} +!llvm.ident = !{!8} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 15.0.4", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "memops.c", directory: "/srv/workspace/workspace-code/oh/tools/out/llvm_cmake", checksumkind: CSK_MD5, checksum: "5aa69f71aab38096119e5aab0df30495") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 7, !"PIC Level", i32 2} +!6 = !{i32 7, !"PIE Level", i32 2} +!7 = !{i32 7, !"uwtable", i32 2} +!8 = !{!"clang version 15.0.4"} +!9 = distinct !DISubprogram(name: "foo", scope: !1, line: 3, type: !10, scopeLine: 3, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !11) +!10 = !DISubroutineType(types: !11) +!11 = !{} +!12 = !DILocation(line: 4, column: 31, scope: !9) +!13 = 
!DILocation(line: 4, column: 29, scope: !9) +!14 = !{!15, !15, i64 0} +!15 = !{!"int", !16, i64 0 } +!16 = !{!"omnipotent char", !17, i64 0} +!17 = !{!"Simple C/C++ TBAA"} +!18 = !DILocation(line: 5, column: 1, scope: !9) + diff --git a/llvm/test/Transforms/AddDiscriminators/multiple.ll b/llvm/test/Transforms/AddDiscriminators/multiple.ll index 54c1a5d77f102876be79917154f829f39165f851..9902e7d855b35311312a4785e16131818be93ca1 100644 --- a/llvm/test/Transforms/AddDiscriminators/multiple.ll +++ b/llvm/test/Transforms/AddDiscriminators/multiple.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -passes=add-discriminators -S | FileCheck %s +; RUN: opt < %s -passes=add-discriminators -discriminate-memops=0 -S | FileCheck %s ; Discriminator support for multiple CFG paths on the same line. ; diff --git a/llvm/tools/llvm-profgen/PerfReader.cpp b/llvm/tools/llvm-profgen/PerfReader.cpp index 9f45167306904b944806740fdc5e006e381cfdae..5b47ffc9826bd59a0e02f4f97add820c9180c66c 100644 --- a/llvm/tools/llvm-profgen/PerfReader.cpp +++ b/llvm/tools/llvm-profgen/PerfReader.cpp @@ -329,7 +329,8 @@ PerfReaderBase::create(ProfiledBinary *Binary, PerfInputFile &PerfInput, } // For perf data input, we need to convert them into perf script first. 
- if (PerfInput.Format == PerfFormat::PerfData) + if (PerfInput.Format == PerfFormat::PerfData || + PerfInput.Format == PerfFormat::SPEPerfData) PerfInput = PerfScriptReader::convertPerfDataToTrace(Binary, PerfInput, PIDFilter); @@ -343,6 +344,8 @@ PerfReaderBase::create(ProfiledBinary *Binary, PerfInputFile &PerfInput, new HybridPerfReader(Binary, PerfInput.InputFile, PIDFilter)); } else if (PerfInput.Content == PerfContent::LBR) { PerfReader.reset(new LBRPerfReader(Binary, PerfInput.InputFile, PIDFilter)); + } else if (PerfInput.Content == PerfContent::SPE) { + PerfReader.reset(new SPEPerfReader(Binary, PerfInput.InputFile, PIDFilter)); } else { exitWithError("Unsupported perfscript!"); } @@ -394,9 +397,17 @@ PerfScriptReader::convertPerfDataToTrace(ProfiledBinary *Binary, exitWithError("No relevant mmap event is found in perf data."); } +// const std::string FieldList = +// File.Format == PerfFormat::SPEPerfdata ? "event, " // Run perf script again to retrieve events for PIDs collected above - StringRef ScriptSampleArgs[] = {PerfPath, "script", "--show-mmap-events", - "-F", "ip,brstack", "--pid", +// SmallVector ScriptSampleArgs; +// ScriptSampleArgs.push_back(PerfPath); +// ScriptSampleArgs.push_back("script"); +// ScriptSampleArgs.push_back("--show-mmap-events"); +// ScriptSampleArgs.push_back("-F"); + + StringRef ScriptSampleArgs[] = {PerfPath, "script", "--show-mmap-events", + "-F", "event,ip,brstack", "--pid", PIDs, "-i", PerfData}; sys::ExecuteAndWait(PerfPath, ScriptSampleArgs, std::nullopt, Redirects); @@ -886,6 +897,14 @@ void PerfScriptReader::computeCounterFromLBR(const PerfSample *Sample, Counter.recordRangeCount(StartAddress, EndAddress, Repeat); EndAddress = SourceAddress; } + +} + +void PerfScriptReader::computeCounterFromSPE(const PerfSample *Sample, + uint64_t Repeat) { + SampleCounter &Counter = SampleCounters.begin()->second; + for (const uint64_t LLCMissAddress : Sample->SPEStack) + Counter.recordSPECount(LLCMissAddress, Repeat); } void 
LBRPerfReader::parseSample(TraceStream &TraceIt, uint64_t Count) { @@ -898,6 +917,36 @@ void LBRPerfReader::parseSample(TraceStream &TraceIt, uint64_t Count) { } } +void SPEPerfReader::parseSample(TraceStream &TraceIt, uint64_t Count) { + std::shared_ptr Sample = std::make_shared(); + SmallVector Records; + TraceIt.getCurrentLine().trim().split(Records, " ", -1, false); +// for (auto Record: Records) +// llvm::errs() << "Record: " << Record << "\n"; +// llvm::errs() << "\n"; + if (Records.size() < 2 || Records[0] != "llc-miss:") { + TraceIt.advance(); + return; + } + uint64_t Addr = 0; + if (Records[1].getAsInteger(16, Addr)) { + TraceIt.advance(); + return; + } + Addr = Binary->canonicalizeVirtualAddress(Addr); + if (!Binary->addressIsCode(Addr)) { + TraceIt.advance(); + return; + } + Sample->SPEStack.emplace_back(Addr); + // LLCMissCountMap[Addr]++; +// llvm::errs() << "LLCMissCountMap[" << Addr << "] = " +// << LLCMissCountMap[Addr] << "\n" +// << "Count = " << Count << "\n\n"; + AggregatedSamples[Hashable(Sample)] += Count; + TraceIt.advance(); +} + void PerfScriptReader::generateUnsymbolizedProfile() { // There is no context for LBR only sample, so initialize one entry with // fake "empty" context key. @@ -909,6 +958,7 @@ void PerfScriptReader::generateUnsymbolizedProfile() { for (const auto &Item : AggregatedSamples) { const PerfSample *Sample = Item.first.getPtr(); computeCounterFromLBR(Sample, Item.second); + computeCounterFromSPE(Sample, Item.second); } } @@ -1023,6 +1073,12 @@ bool PerfScriptReader::isMMap2Event(StringRef Line) { return Line.contains("PERF_RECORD_MMAP2"); } +bool PerfScriptReader::isSPESample(StringRef Line) { + if (Line.empty()) + return false; + return Line.contains("llc-miss:"); +} + // The raw hybird sample is like // e.g. 
// 4005dc # call stack leaf @@ -1055,6 +1111,9 @@ PerfContent PerfScriptReader::checkPerfScriptType(StringRef FileName) { else return PerfContent::LBR; } + + if (isSPESample(TraceIt.getCurrentLine())) + return PerfContent::SPE; TraceIt.advance(); } } diff --git a/llvm/tools/llvm-profgen/PerfReader.h b/llvm/tools/llvm-profgen/PerfReader.h index 14137e82572d77a34d9c07a884762e865e7b4592..38092cf11a6e7a269106a0592300c53e510bf8f4 100644 --- a/llvm/tools/llvm-profgen/PerfReader.h +++ b/llvm/tools/llvm-profgen/PerfReader.h @@ -64,7 +64,7 @@ enum PerfFormat { PerfData = 1, // Raw linux perf.data. PerfScript = 2, // Perf script create by `perf script` command. UnsymbolizedProfile = 3, // Unsymbolized profile generated by llvm-profgen. - + SPEPerfData = 4, // Raw linux perf.data which includes SPE sample. }; // The type of perfscript content. @@ -72,6 +72,7 @@ enum PerfContent { UnknownContent = 0, LBR = 1, // Only LBR sample. LBRStack = 2, // Hybrid sample including call stack and LBR stack. + SPE = 3, // Only SPE sample. }; struct PerfInputFile { @@ -152,6 +153,8 @@ struct PerfSample { // Call stack recorded in FILO(leaf to root) order, it's used for CS-profile // generation SmallVector CallStack; + // SPE stack recorded in FIFO order. 
+ SmallVector SPEStack; virtual ~PerfSample() = default; uint64_t getHashCode() const { @@ -167,15 +170,20 @@ struct PerfSample { Hash = HashCombine(Hash, Entry.Source); Hash = HashCombine(Hash, Entry.Target); } + for (const auto &Value : SPEStack) { + Hash = HashCombine(Hash, Value); + } return Hash; } bool isEqual(const PerfSample *Other) const { const SmallVector &OtherCallStack = Other->CallStack; const SmallVector &OtherLBRStack = Other->LBRStack; + const SmallVector &OtherSPEStack = Other->SPEStack; if (CallStack.size() != OtherCallStack.size() || - LBRStack.size() != OtherLBRStack.size()) + LBRStack.size() != OtherLBRStack.size() || + SPEStack.size() != OtherSPEStack.size()) return false; if (!std::equal(CallStack.begin(), CallStack.end(), OtherCallStack.begin())) @@ -186,6 +194,10 @@ struct PerfSample { LBRStack[I].Target != OtherLBRStack[I].Target) return false; } + + if (!std::equal(SPEStack.begin(), SPEStack.end(), OtherSPEStack.begin())) + return false; + return true; } @@ -400,6 +412,7 @@ using RangeSample = std::map, uint64_t>; struct SampleCounter { RangeSample RangeCounter; BranchSample BranchCounter; + std::unordered_map SPECounter; void recordRangeCount(uint64_t Start, uint64_t End, uint64_t Repeat) { assert(Start <= End && "Invalid instruction range"); @@ -408,6 +421,10 @@ struct SampleCounter { void recordBranchCount(uint64_t Source, uint64_t Target, uint64_t Repeat) { BranchCounter[{Source, Target}] += Repeat; } + void recordSPECount(uint64_t Addr, uint64_t Repeat) { + llvm::errs() << Addr << " " << Repeat << "\n"; + SPECounter[Addr] += Repeat; + } }; // Sample counter with context to support context-sensitive profile @@ -620,6 +637,8 @@ protected: static bool isLBRSample(StringRef Line); // Check whether a given line is MMAP event static bool isMMap2Event(StringRef Line); + // Check whether a given line is SPE sample + static bool isSPESample(StringRef Line); // Parse a single line of a PERF_RECORD_MMAP2 event looking for a // mapping between 
the binary name and its memory layout. static bool extractMMap2EventForBinary(ProfiledBinary *Binary, StringRef Line, @@ -652,6 +671,7 @@ protected: // repeated. virtual void parseSample(TraceStream &TraceIt, uint64_t Count){}; void computeCounterFromLBR(const PerfSample *Sample, uint64_t Repeat); + void computeCounterFromSPE(const PerfSample *Sample, uint64_t Repeat); // Post process the profile after trace aggregation, we will do simple range // overlap computation for AutoFDO, or unwind for CSSPGO(hybrid sample). virtual void generateUnsymbolizedProfile(); @@ -660,6 +680,8 @@ protected: // Samples with the repeating time generated by the perf reader AggregatedCounter AggregatedSamples; + // Samples + std::unordered_map LLCMissCountMap; // Keep track of all invalid return addresses std::set InvalidReturnAddresses; // PID for the process of interest @@ -736,6 +758,18 @@ private: std::unordered_set ContextStrSet; }; +class SPEPerfReader : public PerfScriptReader { +public: + SPEPerfReader(ProfiledBinary *Binary, StringRef PerfTrace, + std::optional PID) + : PerfScriptReader(Binary, PerfTrace, PID) {}; + // Parse the SPE sample. 
+ void parseSample(TraceStream &TraceIt, uint64_t Count) override; + +// private: +// std::unordered_map LLCMissCountMap; +}; + } // end namespace sampleprof } // end namespace llvm diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp index 97bc8d59b6cd7b25db5f29947e46310caba24982..0149991ef89689b1b229752a0d3012029cbeb50d 100644 --- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp +++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp @@ -100,6 +100,15 @@ cl::opt InferMissingFrames( "Infer missing call frames due to compiler tail call elimination."), llvm::cl::Optional); +static cl::opt PrefetchDistance( + "prefetch-distance", cl::value_desc("prefetch-distance"), cl::init(256), + cl::desc("Distance to prefetch ahead, which unit is byte.")); + +static cl::opt TopN( + "top-N", cl::value_desc("top-N"), cl::init(10), + cl::desc("Select N instructions with the most llc miss samples in perf " + "data as prefetch candidates.")); + using namespace llvm; using namespace sampleprof; @@ -128,7 +137,7 @@ ProfileGeneratorBase::create(ProfiledBinary *Binary, } ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator(); FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator(); - + llvm::errs() << "useFSDiscriminator = " << Binary->useFSDiscriminator() << "\n"; return Generator; } @@ -404,6 +413,7 @@ void ProfileGeneratorBase::updateFunctionSamples() { } void ProfileGeneratorBase::collectProfiledFunctions() { + llvm::errs() << "collectProfiledFunctions\n"; std::unordered_set ProfiledFunctions; if (collectFunctionsFromRawProfile(ProfiledFunctions)) Binary->setProfiledFunctions(ProfiledFunctions); @@ -442,6 +452,15 @@ bool ProfileGeneratorBase::collectFunctionsFromRawProfile( if (FuncRange *FRange = Binary->findFuncRange(TargetAddress)) ProfiledFunctions.insert(FRange->Func); } + + for (auto Item : CI.second.SPECounter) { + uint64_t StartAddress = Item.first; + if (FuncRange *FRange = Binary->findFuncRange(StartAddress)) 
{ + ProfiledFunctions.insert(FRange->Func); + llvm::errs() << "[collectFunctionsFromRawProfile]" + << FRange->Func->FuncName << "\n"; + } + } } return true; } @@ -478,7 +497,7 @@ ProfileGenerator::getTopLevelFunctionProfile(StringRef FuncName) { void ProfileGenerator::generateProfile() { collectProfiledFunctions(); - + llvm::errs() << "usePseudoProbes: " << Binary->usePseudoProbes() << "\n"; if (Binary->usePseudoProbes()) Binary->decodePseudoProbe(); @@ -521,10 +540,10 @@ void ProfileGenerator::generateLineNumBasedProfile() { "Must have one entry for profile generation."); const SampleCounter &SC = SampleCounters->begin()->second; // Fill in function body samples - populateBodySamplesForAllFunctions(SC.RangeCounter); + populateBodySamplesForAllFunctions(SC.RangeCounter); // Fill in boundary sample counts as well as call site samples for calls - populateBoundarySamplesForAllFunctions(SC.BranchCounter); - + populateBoundarySamplesForAllFunctions(SC.BranchCounter); + populateSPESamplesForAllFunctions(SC.SPECounter); updateFunctionSamples(); } @@ -596,6 +615,33 @@ void ProfileGenerator::populateBoundarySamplesWithProbesForAllFunctions( } } +void ProfileGenerator::populateSPESamplesForAllFunctions( + const std::unordered_map &SPECounter) { + std::vector> Vec; + Vec.reserve(SPECounter.size()); + for (const auto &Item : SPECounter) + Vec.emplace_back(Item.first, Item.second); + stable_sort(Vec, + [](const auto &A, const auto &B) {return A.second > B.second;}); + for (size_t i = 0; i < Vec.size() && i < TopN; ++i) { + uint64_t Address = Vec[i].first; + uint64_t Count = Vec[i].second; + + const SampleContextFrameVector FrameVec = + Binary->getCachedFrameLocationStack(Address); + if (!FrameVec.empty()) { + FunctionSamples &FunctionProfile = getLeafProfileAndAddTotalSamples( + FrameVec, 0); + FunctionProfile.addCalledTargetSamples( + FrameVec.back().Location.LineOffset, + getBaseDiscriminator(FrameVec.back().Location.Discriminator), + "__load", PrefetchDistance); + 
FunctionProfile.addTotalSamples(0); + } + } + +} + FunctionSamples &ProfileGenerator::getLeafProfileAndAddTotalSamples( const SampleContextFrameVector &FrameVec, uint64_t Count) { // Get top level profile diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.h b/llvm/tools/llvm-profgen/ProfileGenerator.h index 471792ec713cd53f6c35edf4641d4285d7b8d77c..5548bac2e954784f309677c81604cc1e2d3efe6e 100644 --- a/llvm/tools/llvm-profgen/ProfileGenerator.h +++ b/llvm/tools/llvm-profgen/ProfileGenerator.h @@ -169,6 +169,8 @@ private: populateBoundarySamplesForAllFunctions(const BranchSample &BranchCounters); void populateBodySamplesWithProbesForAllFunctions(const RangeSample &RangeCounter); + void populateSPESamplesForAllFunctions( + const std::unordered_map &SPECounter); void populateBoundarySamplesWithProbesForAllFunctions( const BranchSample &BranchCounters); void postProcessProfiles(); diff --git a/llvm/tools/llvm-profgen/llvm-profgen.cpp b/llvm/tools/llvm-profgen/llvm-profgen.cpp index 3b974e25103ad4bfc40604a02a96920a5d6571fa..53bb8c0cfa2e9862ce93fed7ff4eb40d94146c0e 100644 --- a/llvm/tools/llvm-profgen/llvm-profgen.cpp +++ b/llvm/tools/llvm-profgen/llvm-profgen.cpp @@ -67,6 +67,12 @@ static cl::opt DebugBinPath( "from it instead of the executable binary."), cl::cat(ProfGenCategory)); +static cl::opt SPEPerfDataFilename( + "spe-perfdata", cl::value_desc("spe-perfdata"), + cl::desc("Path of raw perf data created by Linux perf tool (it should be " + "profiled with -e arm_spe_0)"), + cl::cat(ProfGenCategory)); + extern cl::opt ShowDisassemblyOnly; extern cl::opt ShowSourceLocations; extern cl::opt SkipSymbolization; @@ -84,15 +90,18 @@ static void validateCommandLine() { bool HasUnsymbolizedProfile = UnsymbolizedProfFilename.getNumOccurrences() > 0; bool HasSampleProfile = SampleProfFilename.getNumOccurrences() > 0; + bool HasSPEPerfData = SPEPerfDataFilename.getNumOccurrences() > 0; uint16_t S = - HasPerfData + HasPerfScript + HasUnsymbolizedProfile + HasSampleProfile; + 
HasPerfData + HasPerfScript + HasUnsymbolizedProfile + HasSampleProfile + + HasSPEPerfData; if (S != 1) { std::string Msg = S > 1 - ? "`--perfscript`, `--perfdata` and `--unsymbolized-profile` " - "cannot be used together." + ? "`--perfscript`, `--perfdata`, `--unsymbolized-profile` " + "and `--spe-perfdata` cannot be used together." : "Perf input file is missing, please use one of `--perfscript`, " - "`--perfdata` and `--unsymbolized-profile` for the input."; + "`--perfdata`, `--unsymbolized-profile` and `--spe-perfdata` " + "for the input."; exitWithError(Msg); } @@ -107,6 +116,7 @@ static void validateCommandLine() { CheckFileExists(HasPerfScript, PerfScriptFilename); CheckFileExists(HasUnsymbolizedProfile, UnsymbolizedProfFilename); CheckFileExists(HasSampleProfile, SampleProfFilename); + CheckFileExists(HasSPEPerfData, SPEPerfDataFilename); } if (!llvm::sys::fs::exists(BinaryPath)) { @@ -134,6 +144,9 @@ static PerfInputFile getPerfInputFile() { } else if (UnsymbolizedProfFilename.getNumOccurrences()) { File.InputFile = UnsymbolizedProfFilename; File.Format = PerfFormat::UnsymbolizedProfile; + } else if (SPEPerfDataFilename.getNumOccurrences()) { + File.InputFile = SPEPerfDataFilename; + File.Format = PerfFormat::SPEPerfData; } return File; }