diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 68889bb78233408dee97aa9c841412878ae82486..6ed5ace2ae7d8788cf4baa224a930aede52d3347 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -23,6 +23,7 @@ #include "llvm/ADT/SmallBitVector.h" #include "llvm/IR/FMF.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" @@ -1570,6 +1571,9 @@ public: VPLegalization getVPLegalizationStrategy(const VPIntrinsic &PI) const; /// @} + Value *computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const; + /// @} private: @@ -1927,6 +1931,8 @@ public: Align Alignment) const = 0; virtual VPLegalization getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0; + virtual Value *computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const = 0; }; template @@ -2606,6 +2612,11 @@ public: getVPLegalizationStrategy(const VPIntrinsic &PI) const override { return Impl.getVPLegalizationStrategy(PI); } + + Value *computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const override { + return Impl.computeVectorLength(Builder, AVL, VF); + } }; template diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 480be9f723f23ab9f1e89c21ce7ddbcd2e2432ea..91f2ea473a8a3ebfeb66ae717e28151347d7bb02 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -19,6 +19,7 @@ #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" @@ -862,6 +863,21 @@ public: /* OperatorStrategy */ TargetTransformInfo::VPLegalization::Convert); } + Value *computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const { + if (!VF.isScalable()) { + return ConstantInt::get(Builder.getInt32Ty(), VF.getFixedValue()); + } + + Constant *EC = + ConstantInt::get(Builder.getInt32Ty(), VF.getKnownMinValue()); + Value *VLMax = Builder.CreateVScale(EC, "vlmax"); + Value *VL = Builder.CreateZExtOrTrunc(AVL, Builder.getInt32Ty(), "vl"); + + return Builder.CreateIntrinsic(Builder.getInt32Ty(), Intrinsic::umin, + {VLMax, VL}, nullptr, "evl"); + } + protected: // Obtain the minimum required size to hold the value (without the sign) // In case of a vector it returns the min required size for one element. 
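Note on the generic computeVectorLength hook added above: it clamps the requested AVL to the largest number of lanes one vector iteration can handle (vscale times the known minimum VF), or simply returns the fixed VF as an i32 constant. A rough, illustrative sketch of the IR it would emit for a scalable VF with a known minimum of 2 lanes and an i64 %avl (value names approximate, not part of the patch):

  %vscale = call i32 @llvm.vscale.i32()
  %vlmax = mul i32 %vscale, 2
  %vl = trunc i64 %avl to i32
  %evl = call i32 @llvm.umin.i32(i32 %vlmax, i32 %vl)

Targets can override this hook; the RISC-V override further down in this patch lowers it to llvm.riscv.vsetvli instead.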
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index ce1caafb92fb9d14495f0fb57228006860b2d659..243d01e12f14b4de5c70e425ab441b9218c94c7a 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -34,6 +34,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" @@ -2442,6 +2443,21 @@ public: InstructionCost getVectorSplitCost() { return 1; } + Value *computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const { + if (!VF.isScalable()) { + return ConstantInt::get(Builder.getInt32Ty(), VF.getFixedValue()); + } + + Constant *EC = + ConstantInt::get(Builder.getInt32Ty(), VF.getKnownMinValue()); + Value *VLMax = Builder.CreateVScale(EC, "vlmax"); + Value *VL = Builder.CreateZExtOrTrunc(AVL, Builder.getInt32Ty(), "vl"); + + return Builder.CreateIntrinsic(Builder.getInt32Ty(), Intrinsic::umin, + {VLMax, VL}, nullptr, "evl"); + } + /// @} }; diff --git a/llvm/include/llvm/Transforms/Vectorize/VectorPredication.h b/llvm/include/llvm/Transforms/Vectorize/VectorPredication.h new file mode 100644 index 0000000000000000000000000000000000000000..ce59854dbb95d9886957c9b208c95a5714eb288d --- /dev/null +++ b/llvm/include/llvm/Transforms/Vectorize/VectorPredication.h @@ -0,0 +1,55 @@ +#ifndef LLVM_TRANSFORMS_VECTORPREDICATION_H +#define LLVM_TRANSFORMS_VECTORPREDICATION_H + +#include "llvm/ADT/MapVector.h" +#include "llvm/IR/PassManager.h" + +namespace llvm { + +using InstToMaskEVLMap = DenseMap<Instruction *, std::pair<Value *, Value *>>; + +struct BlockData { + // Vector that stores all vector predicated memory writing operations found in + // the basic block. If it is still empty after phase 1, the basic block can be + // skipped by the following phases. + SmallVector MemoryWritingVPInstructions; + + // Store all instructions of the basic block (in the same order as they are + // found), assigning to each the list of its users. Skip PHIs and terminators. + MapVector> TopologicalGraph; + + // Map each full-length vector operation eligible to be transformed into a + // vector predication one to the (mask, evl) pair of its first vector + // predicated memory writing operation user. + InstToMaskEVLMap VecOpsToTransform; + + // Ordered list representing the reverse order in which the basic block has to + // be rescheduled due to the new vector predicated instructions. + SmallVector NewBBReverseOrder; + + BlockData() = default; +}; + +class VectorPredicationPass : public PassInfoMixin<VectorPredicationPass> { +private: + // List of instructions to be replaced by the new VP operations, which should + // later be removed, if possible.
+ DenseMap OldInstructionsToRemove; + + void analyseBasicBlock(BasicBlock &BB, BlockData &BBInfo); + void findCandidateVectorOperations(BasicBlock &BB, BlockData &BBInfo); + void addNewUsersToMasksAndEVLs(BasicBlock &BB, BlockData &BBInfo); + void buildNewBasicBlockSchedule(BasicBlock &BB, BlockData &BBInfo); + void emitNewBasicBlockSchedule(BasicBlock &BB, BlockData &BBInfo); + void transformCandidateVectorOperations(BasicBlock &BB, BlockData &BBInfo); + + void removeOldInstructions(); + +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + static StringRef name() { return "VectorPredicationPass"; } +}; + +} // namespace llvm + +#endif // LLVM_TRANSFORMS_VECTORPREDICATION_H diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index e9c01e68fde2c9adc2729831f70fbf9520f6ed54..6928f2bc0b12a4c0f934d69c9f0916892deec3cc 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1195,6 +1195,12 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType, return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment); } +Value *TargetTransformInfo::computeVectorLength(IRBuilderBase &Builder, + Value *AVL, + ElementCount VF) const { + return TTIImpl->computeVectorLength(Builder, AVL, VF); +} + TargetTransformInfo::Concept::~Concept() = default; TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {} diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 4b8754df7fb6361699385760c36092e88c1df954..5b9f2da07873ee7077ce33e0b92ca22a7030af46 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -256,6 +256,7 @@ #include "llvm/Transforms/Vectorize/LoopVectorize.h" #include "llvm/Transforms/Vectorize/SLPVectorizer.h" #include "llvm/Transforms/Vectorize/VectorCombine.h" +#include "llvm/Transforms/Vectorize/VectorPredication.h" #include using namespace llvm; diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 6cc66a0cb1320a8f65ad4498190e35ab6418e31b..4423c2c87072f97463d5747abb37da7dead730f1 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -129,6 +129,7 @@ #include "llvm/Transforms/Vectorize/LoopVectorize.h" #include "llvm/Transforms/Vectorize/SLPVectorizer.h" #include "llvm/Transforms/Vectorize/VectorCombine.h" +#include "llvm/Transforms/Vectorize/VectorPredication.h" using namespace llvm; @@ -286,6 +287,11 @@ static cl::opt AttributorRun( clEnumValN(AttributorRunOption::NONE, "none", "disable attributor runs"))); +static cl::opt + EnableVectorPredication("enable-vector-predication", cl::init(false), + cl::Hidden, + cl::desc("Enable VectorPredicationPass.")); + PipelineTuningOptions::PipelineTuningOptions() { LoopInterleaving = true; LoopVectorization = true; @@ -1230,6 +1236,10 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level, /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); } + // Try to vector predicate vectorized functions. + if (EnableVectorPredication) + FPM.addPass(VectorPredicationPass()); + // Now that we've vectorized and unrolled loops, we may have more refined // alignment information, try to re-derive it here. 
FPM.addPass(AlignmentFromAssumptionsPass()); diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 73ab87dd8823648fb6c4976ad1a73a3222495a07..f40aedacaa89113784ae9d9f5e1a13673d2cb13b 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -391,6 +391,7 @@ FUNCTION_PASS("tailcallelim", TailCallElimPass()) FUNCTION_PASS("typepromotion", TypePromotionPass(TM)) FUNCTION_PASS("unify-loop-exits", UnifyLoopExitsPass()) FUNCTION_PASS("vector-combine", VectorCombinePass()) +FUNCTION_PASS("vector-predication", VectorPredicationPass()) FUNCTION_PASS("verify", VerifierPass()) FUNCTION_PASS("verify", DominatorTreeVerifierPass()) FUNCTION_PASS("verify", LoopVerifierPass()) diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index fcc88d6d4682c0c4e54f237fec720cd6f0568221..222bc663c62eaede73f1c178943815b63deead0e 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -12,6 +12,7 @@ #include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/CodeGen/CostTable.h" #include "llvm/CodeGen/TargetLowering.h" +#include "llvm/IR/IntrinsicsRISCV.h" #include #include using namespace llvm; @@ -1484,3 +1485,29 @@ bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1, C2.NumIVMuls, C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost); } + +Value *RISCVTTIImpl::computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const { + if (!VF.isScalable()) { + return ConstantInt::get(Builder.getInt32Ty(), VF.getFixedValue()); + } + + const unsigned SEW = 3; // SEW = 64, TODO: we should use ELEN here. + const std::map LMULArgMap = { + {1, 0}, {2, 1}, {4, 2}, {8, 3}}; + + assert(AVL->getType()->isIntegerTy() && + "Requested vector length should be an integer."); + assert(LMULArgMap.find(VF.getKnownMinValue()) != LMULArgMap.end() && + "Invalid value for LMUL argument."); + Value *AVLArg = Builder.CreateZExtOrTrunc(AVL, Builder.getInt64Ty()); + Constant *SEWArg = ConstantInt::get(Builder.getInt64Ty(), SEW); + Constant *LMULArg = ConstantInt::get(Builder.getInt64Ty(), + LMULArgMap.at(VF.getKnownMinValue())); + Value *EVLRes = + Builder.CreateIntrinsic(Intrinsic::riscv_vsetvli, {AVLArg->getType()}, + {AVLArg, SEWArg, LMULArg}, nullptr, "vl"); + + // NOTE: evl type is required to be i32. 
+ return Builder.CreateZExtOrTrunc(EVLRes, Builder.getInt32Ty()); +} diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 143079c470fb97e70ef12b5984f4e490b9c4db1f..3904b6913170abf210c39c408d9790c93ebff333 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -327,6 +327,9 @@ public: bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2); + + Value *computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const; }; } // end namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt index 998dfd956575d3c1f21d71bdcd333ef20e1b24e4..bc9e4d281638cfd871df66603b4b77470014a951 100644 --- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt +++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt @@ -5,6 +5,7 @@ add_llvm_component_library(LLVMVectorize SLPVectorizer.cpp Vectorize.cpp VectorCombine.cpp + VectorPredication.cpp VPlan.cpp VPlanHCFGBuilder.cpp VPlanRecipes.cpp diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 92fb82eea714f12a8c99643f8af8bbd8ec705795..cc723e7129281e868454915e91040bb240b4ab1a 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -370,6 +370,10 @@ static cl::opt ForceSafeDivisor( cl::desc( "Override cost based safe divisor widening for div/rem instructions")); +cl::opt UseVectorPredicationIntrinsics( + "use-vp-intrinsics", cl::init(false), cl::Hidden, + cl::desc("Use Vector Predication intrinsics during vectorization.")); + /// A helper function that returns true if the given type is irregular. The /// type is irregular if its allocated size doesn't equal the store size of an /// element of the corresponding vector type. @@ -2890,6 +2894,11 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { if (VectorTripCount) return VectorTripCount; + // With VP intrinsics, we require tail-folding by masking; this way, we + // operate on a number of elements equal to the original loop trip count. + if (UseVectorPredicationIntrinsics) + return VectorTripCount = getOrCreateTripCount(InsertBlock); + Value *TC = getOrCreateTripCount(InsertBlock); IRBuilder<> Builder(InsertBlock->getTerminator()); @@ -2926,6 +2935,7 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { // the step does not evenly divide the trip count, no adjustment is necessary // since there will already be scalar iterations. Note that the minimum // iterations check ensures that N >= Step. + // TODO: we should probably honor the cost model also with VP intrinsics. 
if (Cost->requiresScalarEpilogue(VF)) { auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); R = Builder.CreateSelect(IsZero, Step, R); @@ -8189,12 +8199,13 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, Reverse || Decision == LoopVectorizationCostModel::CM_Widen; if (LoadInst *Load = dyn_cast(I)) - return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, - Consecutive, Reverse); + return new VPWidenMemoryInstructionRecipe( + *Load, Operands[0], Mask, Plan->getEVLPhi(), Consecutive, Reverse); StoreInst *Store = cast(I); return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], - Mask, Consecutive, Reverse); + Mask, Plan->getEVLPhi(), + Consecutive, Reverse); } /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also @@ -8224,11 +8235,12 @@ static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes( vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE); if (auto *TruncI = dyn_cast(PhiOrTrunc)) { return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI, - !NeedsScalarIVOnly); + !NeedsScalarIVOnly, + Plan.getEVLPhi()); } assert(isa(PhiOrTrunc) && "must be a phi node here"); - return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, - !NeedsScalarIVOnly); + return new VPWidenIntOrFpInductionRecipe( + Phi, Start, Step, IndDesc, !NeedsScalarIVOnly, Plan.getEVLPhi()); } VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( @@ -8698,28 +8710,51 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, // Add the necessary canonical IV and branch recipes required to control the // loop. static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, - TailFoldingStyle Style) { - Value *StartIdx = ConstantInt::get(IdxTy, 0); - auto *StartV = Plan.getOrAddVPValue(StartIdx); + TailFoldingStyle Style, + const TargetTransformInfo *TTI) { + VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); + VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); + + // Add the EVL recipe, used to calculate the correct IV increment. + VPEVLPHIRecipe *EVLRecipe = nullptr; + // TODO: TTI should be able to indicate if a target prefers vector predication + // intrinsics. + if (UseVectorPredicationIntrinsics) { + EVLRecipe = new VPEVLPHIRecipe(Plan.getOrCreateTripCount(), TTI); + Header->insert(EVLRecipe, Header->begin()); + } // Add a VPCanonicalIVPHIRecipe starting at 0 to the header. + Value *StartIdx = ConstantInt::get(IdxTy, 0); + auto *StartV = Plan.getOrAddVPValue(StartIdx); auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); - VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); - VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); Header->insert(CanonicalIVPHI, Header->begin()); // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar - // IV by VF * UF. + // IV either by VF * UF or by the EVL values. bool HasNUW = Style == TailFoldingStyle::None; + SmallVector IVOps = {CanonicalIVPHI}; + if (EVLRecipe) + IVOps.push_back(EVLRecipe); auto *CanonicalIVIncrement = new VPInstruction(HasNUW ? 
VPInstruction::CanonicalIVIncrementNUW : VPInstruction::CanonicalIVIncrement, - {CanonicalIVPHI}, DL, "index.next"); + IVOps, DL, "index.next"); CanonicalIVPHI->addOperand(CanonicalIVIncrement); VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); EB->appendRecipe(CanonicalIVIncrement); + // If we are working with vector predication intrinsics, add a NextEVL + // VPInstruction to calculate the remaining number of elements. + if (EVLRecipe) { + auto *NextEVL = + new VPInstruction(VPInstruction::NextEVL, + {EVLRecipe, CanonicalIVIncrement}, DL, "evl.next"); + EVLRecipe->addOperand(NextEVL); + EB->appendRecipe(NextEVL); + } + if (Style == TailFoldingStyle::DataAndControlFlow) { // Create the active lane mask instruction in the vplan preheader. VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock(); @@ -8866,7 +8901,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DLInst ? DLInst->getDebugLoc() : DebugLoc(), - CM.getTailFoldingStyle()); + CM.getTailFoldingStyle(), TTI); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. @@ -9072,7 +9107,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { Term->eraseFromParent(); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), - CM.getTailFoldingStyle()); + CM.getTailFoldingStyle(), TTI); return Plan; } @@ -9272,24 +9307,27 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { MulOp = Instruction::FMul; } - // Multiply the vectorization factor by the step using integer or - // floating-point arithmetic as appropriate. - Type *StepType = Step->getType(); - Value *RuntimeVF; - if (Step->getType()->isFloatingPointTy()) - RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); - else - RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); - Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); + Value *SplatVF = nullptr; + if (!getEVL()) { + // Multiply the vectorization factor by the step using integer or + // floating-point arithmetic as appropriate. + Type *StepType = Step->getType(); + Value *RuntimeVF; + if (Step->getType()->isFloatingPointTy()) + RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); + else + RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); + Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); - // Create a vector splat to use in the induction update. - // - // FIXME: If the step is non-constant, we create the vector splat with - // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't - // handle a constant vector splat. - Value *SplatVF = isa(Mul) - ? ConstantVector::getSplat(State.VF, cast(Mul)) - : Builder.CreateVectorSplat(State.VF, Mul); + // Create a vector splat to use in the induction update. + // + // FIXME: If the step is non-constant, we create the vector splat with + // IRBuilder. IRBuilder can constant-fold the multiply, but it + // doesn't handle a constant vector splat. + SplatVF = isa(Mul) + ?
ConstantVector::getSplat(State.VF, cast(Mul)) + : Builder.CreateVectorSplat(State.VF, Mul); + } Builder.restoreIP(CurrIP); // We may need to add the step a number of times, depending on the unroll @@ -9304,8 +9342,26 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { if (isa(EntryVal)) State.addMetadata(LastInduction, EntryVal); - LastInduction = cast( - Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); + if (auto *EVLRecipe = getEVL()) { + // Ensure the types match. + Type *DestTy = LastInduction->getType()->getScalarType(); + Value *EVL = State.get(EVLRecipe, Part); + if (DestTy->isIntegerTy()) { + EVL = Builder.CreateZExtOrTrunc(EVL, DestTy); + } else { + assert(DestTy->isFloatingPointTy()); + EVL = Builder.CreateUIToFP(EVL, DestTy); + } + // Multiply the EVL by the step using integer or floating-point + // arithmetic as appropriate. + Value *Mul = Builder.CreateBinOp(MulOp, Step, EVL); + Value *SplatEVL = Builder.CreateVectorSplat(State.VF, Mul); + LastInduction = cast( + Builder.CreateBinOp(AddOp, LastInduction, SplatEVL, "step.add.vl")); + } else { + LastInduction = cast( + Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); + } LastInduction->setDebugLoc(EntryVal->getDebugLoc()); } @@ -9593,9 +9649,15 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { auto &Builder = State.Builder; InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); bool isMaskRequired = getMask(); - if (isMaskRequired) + VPValue *VPEVL = getEVL(); + if (isMaskRequired) { for (unsigned Part = 0; Part < State.UF; ++Part) BlockInMaskParts[Part] = State.get(getMask(), Part); + } else if (VPEVL) { + auto *MaskTy = VectorType::get(Builder.getInt1Ty(), State.VF); + for (unsigned Part = 0; Part < State.UF; ++Part) + BlockInMaskParts[Part] = ConstantInt::getTrue(MaskTy); + } const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { // Calculate the pointer for the specific unroll-part. @@ -9633,7 +9695,14 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { BlockInMaskParts[Part] = Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); } else { - Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part); + Value *Increment = nullptr; + if (VPEVL) { + Increment = Builder.getInt32(0); // EVL is always an i32. + for (unsigned int P = 0; P < Part; P++) + Increment = Builder.CreateAdd(Increment, State.get(VPEVL, P)); + } else { + Increment = createStepForVF(Builder, IndexTy, State.VF, Part); + } PartPtr = cast( Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); PartPtr->setIsInBounds(InBounds); @@ -9651,10 +9720,19 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { Instruction *NewSI = nullptr; Value *StoredVal = State.get(StoredValue, Part); if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; + Value *MaskPart = + (isMaskRequired || VPEVL) ? 
BlockInMaskParts[Part] : nullptr; Value *VectorGep = State.get(getAddr(), Part); - NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, - MaskPart); + if (VPEVL) { + auto *PtrsTy = cast(VectorGep->getType()); + Value *Operands[] = {StoredVal, VectorGep, MaskPart, + State.get(VPEVL, Part)}; + NewSI = Builder.CreateIntrinsic(Intrinsic::vp_scatter, + {DataTy, PtrsTy}, Operands); + } else { + NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, + MaskPart); + } } else { if (Reverse) { // If we store to reverse consecutive memory locations, then we need @@ -9665,11 +9743,17 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { } auto *VecPtr = CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); - if (isMaskRequired) + if (VPEVL) { + Value *Operands[] = {StoredVal, VecPtr, BlockInMaskParts[Part], + State.get(VPEVL, Part)}; + NewSI = Builder.CreateIntrinsic( + Intrinsic::vp_store, {DataTy, VecPtr->getType()}, Operands); + } else if (isMaskRequired) { NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, BlockInMaskParts[Part]); - else + } else { NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); + } } State.addMetadata(NewSI, SI); } @@ -9682,21 +9766,37 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { for (unsigned Part = 0; Part < State.UF; ++Part) { Value *NewLI; if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; + Value *MaskPart = + (isMaskRequired || VPEVL) ? BlockInMaskParts[Part] : nullptr; Value *VectorGep = State.get(getAddr(), Part); - NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, - nullptr, "wide.masked.gather"); + if (VPEVL) { + auto *PtrsTy = cast(VectorGep->getType()); + Value *Operands[] = {VectorGep, MaskPart, State.get(VPEVL, Part)}; + NewLI = Builder.CreateIntrinsic(Intrinsic::vp_gather, {DataTy, PtrsTy}, + Operands, nullptr, "vp.gather"); + } else { + NewLI = + Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, + nullptr, "wide.masked.gather"); + } State.addMetadata(NewLI, LI); } else { auto *VecPtr = CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); - if (isMaskRequired) + if (VPEVL) { + Value *Operands[] = {VecPtr, BlockInMaskParts[Part], + State.get(VPEVL, Part)}; + NewLI = Builder.CreateIntrinsic(Intrinsic::vp_load, + {DataTy, VecPtr->getType()}, Operands, + nullptr, "vp.load"); + } else if (isMaskRequired) { NewLI = Builder.CreateMaskedLoad( DataTy, VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), "wide.masked.load"); - else + } else { NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); + } // Add metadata to the load, but setVectorValue to the reverse shuffle. State.addMetadata(NewLI, LI); @@ -10530,6 +10630,11 @@ LoopVectorizeResult LoopVectorizePass::runImpl( PreservedAnalyses LoopVectorizePass::run(Function &F, FunctionAnalysisManager &AM) { + assert((!UseVectorPredicationIntrinsics || + PreferPredicateOverEpilogue == + PreferPredicateTy::PredicateOrDontVectorize) && + "Tail folding required when using VP intrinsics."); + auto &LI = AM.getResult(F); // There are no loops in the function. Return before computing other expensive // analyses. 
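To illustrate the memory-recipe changes above: when an EVL is available, the widened loads and stores are emitted as VP intrinsics that carry an all-true mask plus the i32 EVL for the current part, instead of plain or masked accesses (gathers and scatters become llvm.vp.gather/llvm.vp.scatter in the same way). A minimal sketch, with illustrative value names and an arbitrary nxv1f64 element type:

  %vp.load = call <vscale x 1 x double> @llvm.vp.load.nxv1f64.p0(ptr %addr, <vscale x 1 x i1> %alltrue, i32 %evl)
  call void @llvm.vp.store.nxv1f64.p0(<vscale x 1 x double> %vp.load, ptr %addr2, <vscale x 1 x i1> %alltrue, i32 %evl)

The RISC-V end-to-end test added later in this patch (vp_intrinsics.ll) shows the same pattern coming out of the vectorizer.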
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index d554f438c8040d572239c52196300a4d8f3b7f6f..81e8b52ebb1ff70910360264d3c651ab2c37a05d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -596,6 +596,16 @@ VPlan::~VPlan() { delete P.second; } +VPEVLPHIRecipe *VPlan::getEVLPhi() { + VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock(); + for (VPRecipeBase &R : Header->phis()) { + if (isa(&R)) + return cast(&R); + } + + return nullptr; +} + VPActiveLaneMaskPHIRecipe *VPlan::getActiveLaneMaskPhi() { VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock(); for (VPRecipeBase &R : Header->phis()) { @@ -711,6 +721,13 @@ void VPlan::execute(VPTransformState *State) { } auto *PhiR = cast(&R); + if (auto *EVLPhi = dyn_cast(PhiR)) { + PHINode *Phi = EVLPhi->getPhi(); + Phi->addIncoming(State->get(EVLPhi->getBackedgeValue(), State->UF - 1), + VectorLatchBB); + continue; + } + // For canonical IV, first-order recurrences and in-order reduction phis, // only a single part is generated, which provides the last part from the // previous iteration. For non-ordered reductions all UF parts are diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 986faaf9966426cd5aaaadd22ea24321b469d890..99091246dcda7b9399805792c17c2a2fd554c272 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -719,10 +719,10 @@ public: /// Returns the underlying instruction, if the recipe is a VPValue or nullptr /// otherwise. Instruction *getUnderlyingInstr() { - return cast(getVPSingleValue()->getUnderlyingValue()); + return cast_or_null(getVPSingleValue()->getUnderlyingValue()); } const Instruction *getUnderlyingInstr() const { - return cast(getVPSingleValue()->getUnderlyingValue()); + return cast_or_null(getVPSingleValue()->getUnderlyingValue()); } /// Method to support type inquiry through isa, cast, and dyn_cast. @@ -797,7 +797,8 @@ public: CanonicalIVIncrementForPart, CanonicalIVIncrementForPartNUW, BranchOnCount, - BranchOnCond + BranchOnCond, + NextEVL }; private: @@ -1022,20 +1023,30 @@ class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPValue { const InductionDescriptor &IndDesc; bool NeedsVectorIV; + void addEVL(VPValue *EVLRecipe) { + if (EVLRecipe) + addOperand(EVLRecipe); + } + public: VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step, const InductionDescriptor &IndDesc, - bool NeedsVectorIV) + bool NeedsVectorIV, VPValue *EVLRecipe) : VPRecipeBase(VPDef::VPWidenIntOrFpInductionSC, {Start, Step}), VPValue(this, IV), IV(IV), IndDesc(IndDesc), - NeedsVectorIV(NeedsVectorIV) {} + NeedsVectorIV(NeedsVectorIV) { + addEVL(EVLRecipe); + } VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step, const InductionDescriptor &IndDesc, - TruncInst *Trunc, bool NeedsVectorIV) + TruncInst *Trunc, bool NeedsVectorIV, + VPValue *EVLRecipe) : VPRecipeBase(VPDef::VPWidenIntOrFpInductionSC, {Start, Step}), VPValue(this, Trunc), IV(IV), IndDesc(IndDesc), - NeedsVectorIV(NeedsVectorIV) {} + NeedsVectorIV(NeedsVectorIV) { + addEVL(EVLRecipe); + } ~VPWidenIntOrFpInductionRecipe() override = default; @@ -1059,6 +1070,12 @@ public: VPValue *getStepValue() { return getOperand(1); } const VPValue *getStepValue() const { return getOperand(1); } + /// Return the EVL value of the current loop iteration. + VPValue *getEVL() { return getNumOperands() == 3 ? 
getOperand(2) : nullptr; } + const VPValue *getEVL() const { + return getNumOperands() == 3 ? getOperand(2) : nullptr; + } + /// Returns the first defined value as TruncInst, if it is one or nullptr /// otherwise. TruncInst *getTruncInst() { @@ -1629,8 +1646,8 @@ public: /// A Recipe for widening load/store operations. /// The recipe uses the following VPValues: -/// - For load: Address, optional mask -/// - For store: Address, stored value, optional mask +/// - For load: Address, optional mask, optional evl +/// - For store: Address, stored value, optional mask, optional evl /// TODO: We currently execute only per-part unless a specific instance is /// provided. class VPWidenMemoryInstructionRecipe : public VPRecipeBase { @@ -1642,33 +1659,41 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase { // Whether the consecutive loaded/stored addresses are in reverse order. bool Reverse; - void setMask(VPValue *Mask) { - if (!Mask) - return; - addOperand(Mask); - } + // Whether the instruction has a not all-ones mask. + bool Masked = false; + + // Whether a vector length is available to the instruction. + bool HasVL = false; + + void setMaskAndEVL(VPValue *Mask, VPValue *VPEVL) { + if (Mask) { + this->Masked = true; + addOperand(Mask); + } - bool isMasked() const { - return isStore() ? getNumOperands() == 3 : getNumOperands() == 2; + if (VPEVL) { + this->HasVL = true; + addOperand(VPEVL); + } } public: VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask, - bool Consecutive, bool Reverse) + VPValue *EVL, bool Consecutive, bool Reverse) : VPRecipeBase(VPDef::VPWidenMemoryInstructionSC, {Addr}), Ingredient(Load), Consecutive(Consecutive), Reverse(Reverse) { assert((Consecutive || !Reverse) && "Reverse implies consecutive"); new VPValue(this, &Load); - setMask(Mask); + setMaskAndEVL(Mask, EVL); } VPWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredValue, VPValue *Mask, - bool Consecutive, bool Reverse) + VPValue *EVL, bool Consecutive, bool Reverse) : VPRecipeBase(VPDef::VPWidenMemoryInstructionSC, {Addr, StoredValue}), Ingredient(Store), Consecutive(Consecutive), Reverse(Reverse) { assert((Consecutive || !Reverse) && "Reverse implies consecutive"); - setMask(Mask); + setMaskAndEVL(Mask, EVL); } VP_CLASSOF_IMPL(VPDef::VPWidenMemoryInstructionSC) @@ -1681,8 +1706,15 @@ public: /// Return the mask used by this recipe. Note that a full mask is represented /// by a nullptr. VPValue *getMask() const { - // Mask is optional and therefore the last operand. - return isMasked() ? getOperand(getNumOperands() - 1) : nullptr; + return Masked ? (HasVL ? getOperand(getNumOperands() - 2) + : getOperand(getNumOperands() - 1)) + : nullptr; + } + + /// Return the evl used by this recipe. If we are working with full-length + /// vectors, return nullptr. + VPValue *getEVL() const { + return HasVL ? getOperand(getNumOperands() - 1) : nullptr; } /// Returns true if this recipe is a store. 
@@ -1826,6 +1858,33 @@ public: #endif }; +class VPEVLPHIRecipe : public VPHeaderPHIRecipe { + const TargetTransformInfo *TTI; + PHINode *Phi = nullptr; + +public: + VPEVLPHIRecipe(VPValue *StartEVL, const TargetTransformInfo *TTI) + : VPHeaderPHIRecipe(VPDef::VPWidenEVLSC, nullptr, StartEVL), TTI(TTI) {} + + ~VPEVLPHIRecipe() override = default; + + VP_CLASSOF_IMPL(VPDef::VPWidenEVLSC) + + PHINode *getPhi() const { return Phi; } + + static inline bool classof(const VPHeaderPHIRecipe *D) { + return D->getVPDefID() == VPDef::VPWidenEVLSC; + } + + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + /// A Recipe for widening the canonical induction variable of the vector loop. class VPWidenCanonicalIVRecipe : public VPRecipeBase, public VPValue { public: @@ -2367,6 +2426,10 @@ public: return cast(&*EntryVPBB->begin()); } + /// Find and return the VPEVLPHIRecipe from the header - there should be only + /// one at most. If there isn't one, then return nullptr. + VPEVLPHIRecipe *getEVLPhi(); + /// Find and return the VPActiveLaneMaskPHIRecipe from the header - there /// be only one at most. If there isn't one, then return nullptr. VPActiveLaneMaskPHIRecipe *getActiveLaneMaskPhi(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index ff0b1df57ce4a4ab8998d1c080b0519fb930b4d9..1b69ac5d3d71a621b059a308cd36605de4773e6d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -35,6 +35,7 @@ using namespace llvm; using VectorParts = SmallVector; extern cl::opt EnableVPlanNativePath; +extern cl::opt UseVectorPredicationIntrinsics; #define LV_NAME "loop-vectorize" #define DEBUG_TYPE LV_NAME @@ -235,6 +236,15 @@ void VPInstruction::generateInstruction(VPTransformState &State, break; } case VPInstruction::ActiveLaneMask: { + if (UseVectorPredicationIntrinsics) { + State.set(this, + ConstantInt::getTrue( + VectorType::get(State.Builder.getInt1Ty(), State.VF)), + Part); + + break; + } + // Get first lane of vector induction variable. Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0)); // Get the original loop tripcount. @@ -279,10 +289,21 @@ void VPInstruction::generateInstruction(VPTransformState &State, if (Part == 0) { bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW; auto *Phi = State.get(getOperand(0), 0); - // The loop step is equal to the vectorization factor (num of SIMD - // elements) times the unroll factor (num of SIMD instructions). - Value *Step = - createStepForVF(Builder, Phi->getType(), State.VF, State.UF); + Value *Step = nullptr; + if (getNumOperands() == 2) { + // We have the EVL value available to use. + VPValue *VPEVL = getOperand(1); + Step = State.get(VPEVL, 0); + for (unsigned P = 1; P < State.UF; P++) + Step = Builder.CreateAdd(Step, State.get(VPEVL, P)); + + Step = Builder.CreateZExtOrTrunc(Step, Phi->getType()); + } else { + // The loop step is equal to the vectorization factor (num of SIMD + // elements) times the unroll factor (num of SIMD instructions). 
+ Step = createStepForVF(Builder, Phi->getType(), State.VF, State.UF); + } + Next = Builder.CreateAdd(Phi, Step, Name, IsNUW, false); } else { Next = State.get(this, 0); @@ -353,6 +374,21 @@ void VPInstruction::generateInstruction(VPTransformState &State, Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); break; } + case VPInstruction::NextEVL: { + Value *Next = nullptr; + if (Part == 0) { + auto *EVLRecipe = cast(getOperand(0)); + Value *StartEVL = State.get(EVLRecipe->getOperand(0), 0); + Value *IVIncrement = State.get(getOperand(1), 0); + + Next = Builder.CreateSub(StartEVL, IVIncrement, "evl.next"); + } else { + Next = State.get(this, 0); + } + + State.set(this, Next, Part); + break; + } default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -719,6 +755,9 @@ void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, #endif bool VPWidenIntOrFpInductionRecipe::isCanonical() const { + if (getEVL()) + return false; + auto *StartC = dyn_cast(getStartValue()->getLiveInIRValue()); auto *StepC = dyn_cast(getInductionDescriptor().getStep()); return StartC && StartC->isZero() && StepC && StepC->isOne(); @@ -1329,3 +1368,30 @@ void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent, printOperands(O, SlotTracker); } #endif + +void VPEVLPHIRecipe::execute(VPTransformState &State) { + Value *StartEVL = State.get(getOperand(0), 0); + BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); + this->Phi = State.Builder.CreatePHI(StartEVL->getType(), 2, "evl.phi"); + this->Phi->addIncoming(StartEVL, VectorPH); + + Value *PrevEVL = State.Builder.CreateZExtOrTrunc( + cast(this->Phi), State.Builder.getInt32Ty(), "evl.phi.cast"); + Value *EVL = nullptr; + for (unsigned Part = 0; Part < State.UF; Part++) { + if (EVL) + PrevEVL = State.Builder.CreateSub(PrevEVL, EVL); + EVL = TTI->computeVectorLength(State.Builder, PrevEVL, State.VF); + State.set(this, EVL, Part); + } +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPEVLPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "EVL-PHI "; + printAsOperand(O, SlotTracker); + O << " = phi "; + printOperands(O, SlotTracker); +} +#endif diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 1cfba64f1fbefa957bcfa381986e137f971eb225..5070aa9a8dff157a4f40ac9b885d3e83ad7cd836 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -55,8 +55,8 @@ void VPlanTransforms::VPInstructionsToVPRecipes( VPValue *Start = Plan->getOrAddVPValue(II->getStartValue()); VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep(), SE); - NewRecipe = - new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, *II, true); + NewRecipe = new VPWidenIntOrFpInductionRecipe( + Phi, Start, Step, *II, true, Plan->getEVLPhi()); } else { Plan->addVPValue(Phi, VPPhi); continue; @@ -69,12 +69,13 @@ void VPlanTransforms::VPInstructionsToVPRecipes( if (LoadInst *Load = dyn_cast(Inst)) { NewRecipe = new VPWidenMemoryInstructionRecipe( *Load, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)), - nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/); + nullptr /*Mask*/, nullptr /*EVL*/, false /*Consecutive*/, + false /*Reverse*/); } else if (StoreInst *Store = dyn_cast(Inst)) { NewRecipe = new VPWidenMemoryInstructionRecipe( *Store, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)), 
Plan->getOrAddVPValue(Store->getValueOperand()), nullptr /*Mask*/, - false /*Consecutive*/, false /*Reverse*/); + nullptr /*EVL*/, false /*Consecutive*/, false /*Reverse*/); } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) { NewRecipe = new VPWidenGEPRecipe( GEP, Plan->mapToVPValues(GEP->operands()), OrigLoop); diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 62ec65cbfe5dd6241c0c3d1f26cd2da86d0cf531..994a677a5dba663a3531bcd9e93eb0ccb7b30276 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -354,6 +354,7 @@ public: VPCanonicalIVPHISC, VPActiveLaneMaskPHISC, VPFirstOrderRecurrencePHISC, + VPWidenEVLSC, VPWidenPHISC, VPWidenIntOrFpInductionSC, VPWidenPointerInductionSC, diff --git a/llvm/lib/Transforms/Vectorize/VectorPredication.cpp b/llvm/lib/Transforms/Vectorize/VectorPredication.cpp new file mode 100644 index 0000000000000000000000000000000000000000..cc6137a134d5ca291e31196ab0438d09c205a05e --- /dev/null +++ b/llvm/lib/Transforms/Vectorize/VectorPredication.cpp @@ -0,0 +1,277 @@ +#include "llvm/Transforms/Vectorize/VectorPredication.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/VectorBuilder.h" +#include "llvm/Transforms/Utils/Local.h" + +#define DEBUG_TYPE "vector-predication" +STATISTIC(Transforms, "Number of full-length -> evl vector transformations."); + +using namespace llvm; + +// Map each instruction to its uses and save all memory writing vector +// predicated instructions found in the basic block. +void VectorPredicationPass::analyseBasicBlock(BasicBlock &BB, + BlockData &BBInfo) { + for (Instruction &I : BB) { + if (isa<PHINode>(I) || I.isTerminator()) + continue; + + SmallPtrSet IUsers; + for (User *IU : I.users()) { + assert(isa<Instruction>(IU) && "Unexpected behaviour."); + auto *IUInst = cast<Instruction>(IU); + if (IUInst->getParent() != I.getParent()) + continue; + if (isa<PHINode>(IUInst) || IUInst->isTerminator()) + continue; + + IUsers.insert(IUInst); + } + BBInfo.TopologicalGraph.insert({&I, IUsers}); + + if (auto *CI = dyn_cast<CallInst>(&I)) { + if (auto *CF = CI->getCalledFunction()) { + Intrinsic::ID ID = CF->getIntrinsicID(); + if (ID == Intrinsic::vp_store || ID == Intrinsic::vp_scatter) { + BBInfo.MemoryWritingVPInstructions.push_back(&I); + } + } + } + } +} + +static void findCandidateVectorOperation(BasicBlock &BB, Value *Op, Value *Mask, + Value *EVL, + InstToMaskEVLMap &VecOpsToTransform) { + auto *OpInst = dyn_cast<Instruction>(Op); + if (!OpInst) + return; + + if (OpInst->getParent() != &BB) + return; + + Intrinsic::ID VPID = VPIntrinsic::getForOpcode(OpInst->getOpcode()); + if (VPID == Intrinsic::not_intrinsic) + return; + + // If the instruction is already present in the map, it means it was already + // visited starting from a previous memory writing vp operation.
+ if (!VecOpsToTransform + .insert(std::make_pair(OpInst, std::make_pair(Mask, EVL))) + .second) { + // We need to check if the new mask and evl values differ from the old ones: + // - if they are the same, then there is nothing to do; + // - if only the mask differs, we use an all-ones mask; + // - otherwise, we remove the instruction from the map (i.e., no + // transformation should happen). + auto It = VecOpsToTransform.find(OpInst); + assert(It != VecOpsToTransform.end()); + Value *OldMask, *OldEVL; + std::tie(OldMask, OldEVL) = It->second; + + if (Mask == OldMask && EVL == OldEVL) + return; + + VecOpsToTransform.erase(OpInst); + if (EVL == OldEVL) { + VecOpsToTransform.insert( + std::make_pair(OpInst, std::make_pair(nullptr, EVL))); + } + } + + // Recursively visit OpInst operands. + switch (VPID) { + default: + for (auto *OpVal : OpInst->operand_values()) + findCandidateVectorOperation(BB, OpVal, Mask, EVL, VecOpsToTransform); + break; + case Intrinsic::vp_select: { + Value *Cond = OpInst->getOperand(0); + if (Cond->getType()->isVectorTy()) + findCandidateVectorOperation(BB, Cond, nullptr, EVL, VecOpsToTransform); + + // TODO: if the condition argument is a vector, we could backpropagate it + // as mask for the true branch and its negation as mask for the false one. + // WARNING: when creating the negation of the condition, we must ensure it + // dominates all uses. + findCandidateVectorOperation(BB, OpInst->getOperand(1), nullptr, EVL, + VecOpsToTransform); + findCandidateVectorOperation(BB, OpInst->getOperand(2), nullptr, EVL, + VecOpsToTransform); + break; + } + } +} + +// For each vector predicated memory writing operation of the basic block, go +// back to the stored vector defining instruction and verify it is a vector +// operation. Add it to the list of instructions to be transformed into vector +// predicated ones, then recursively repeat the process for its vector +// arguments. +void VectorPredicationPass::findCandidateVectorOperations(BasicBlock &BB, + BlockData &BBInfo) { + if (BBInfo.MemoryWritingVPInstructions.empty()) + return; + + for (Instruction *I : BBInfo.MemoryWritingVPInstructions) { + assert(I->getParent() == &BB && "This is not the right basic block"); + auto *VPI = cast<VPIntrinsic>(I); + Value *StoredOperand = VPI->getMemoryDataParam(); + Value *MaskOperand = VPI->getMaskParam(); + Value *EVLOperand = VPI->getVectorLengthParam(); + // First, visit the mask operand (assigning an all-ones mask to this branch) + // and only then visit the stored operand. + findCandidateVectorOperation(BB, MaskOperand, nullptr, EVLOperand, + BBInfo.VecOpsToTransform); + findCandidateVectorOperation(BB, StoredOperand, MaskOperand, EVLOperand, + BBInfo.VecOpsToTransform); + } +} + +// Add the candidates as users of the mask and evl linked to each of them. +void VectorPredicationPass::addNewUsersToMasksAndEVLs(BasicBlock &BB, + BlockData &BBInfo) { + if (BBInfo.VecOpsToTransform.empty()) + return; + + for (auto [K, V] : BBInfo.VecOpsToTransform) { + if (auto *MaskInst = dyn_cast_if_present<Instruction>(V.first)) + BBInfo.TopologicalGraph[MaskInst].insert(K); + if (auto *EVLInst = dyn_cast<Instruction>(V.second)) + BBInfo.TopologicalGraph[EVLInst].insert(K); + } +} + +// Topologically sort the graph, preserving the original order as much as possible.
+void VectorPredicationPass::buildNewBasicBlockSchedule(BasicBlock &BB, + BlockData &BBInfo) { + if (BBInfo.VecOpsToTransform.empty()) + return; + + while (!BBInfo.TopologicalGraph.empty()) { + Instruction *Inst = nullptr; + for (auto B = BBInfo.TopologicalGraph.rbegin(), + E = BBInfo.TopologicalGraph.rend(); + B != E; B++) { + if (B->second.empty()) { + Inst = B->first; + break; + } + } + assert(Inst && "Failed to empty topological graph!"); + + BBInfo.NewBBReverseOrder.push_back(Inst); + BBInfo.TopologicalGraph.erase(Inst); + + for (auto B = BBInfo.TopologicalGraph.begin(), + E = BBInfo.TopologicalGraph.end(); + B != E; B++) { + B->second.erase(Inst); + } + } +} + +// Modify the basic block based on the topological order generated. +void VectorPredicationPass::emitNewBasicBlockSchedule(BasicBlock &BB, + BlockData &BBInfo) { + if (BBInfo.VecOpsToTransform.empty()) + return; + + Instruction *InsertPoint = BB.getTerminator(); + for (Instruction *I : BBInfo.NewBBReverseOrder) { + I->moveBefore(InsertPoint); + InsertPoint = I; + } +} + +// Transform candidates to vector predicated instructions. +void VectorPredicationPass::transformCandidateVectorOperations( + BasicBlock &BB, BlockData &BBInfo) { + if (BBInfo.VecOpsToTransform.empty()) + return; + + for (auto [I, P] : BBInfo.VecOpsToTransform) { + Value *Mask, *EVL; + std::tie(Mask, EVL) = P; + + IRBuilder<> Builder(I); + unsigned int Opcode = I->getOpcode(); + Type *RetTy = I->getType(); + SmallVector Operands(I->value_op_begin(), I->value_op_end()); + switch (Opcode) { + case Instruction::FCmp: + case Instruction::ICmp: { + Operands.clear(); + auto *CmpI = cast<CmpInst>(I); + Value *PredOp = MetadataAsValue::get( + Builder.getContext(), + MDString::get(Builder.getContext(), + CmpInst::getPredicateName(CmpI->getPredicate()))); + Operands = {CmpI->getOperand(0), CmpI->getOperand(1), PredOp}; + break; + } + case Instruction::Select: { + if (!I->getOperand(0)->getType()->isVectorTy()) { + Operands.clear(); + Value *Op1 = I->getOperand(1); + Value *Op2 = I->getOperand(2); + Value *Cond = Builder.CreateVectorSplat( + cast<VectorType>(Op1->getType())->getElementCount(), + I->getOperand(0), "select.cond.splat"); + Operands = {Cond, Op1, Op2}; + } + break; + } + default: + break; + } + + if (!Mask) + // nullptr means unmasked operation, hence we use an all-ones mask. + Mask = ConstantInt::getTrue(RetTy->getWithNewType(Builder.getInt1Ty())); + + VectorBuilder VecBuilder(Builder); + VecBuilder.setMask(Mask).setEVL(EVL); + Value *NewVPOp = + VecBuilder.createVectorInstruction(Opcode, RetTy, Operands, "vp.op"); + + Transforms++; // Stats + OldInstructionsToRemove.insert(std::make_pair(I, NewVPOp)); + } +} + +// Remove old instructions, if possible. +void VectorPredicationPass::removeOldInstructions() { + for (auto [I, NewVPOp] : OldInstructionsToRemove) { + I->replaceAllUsesWith(NewVPOp); + if (isInstructionTriviallyDead(I)) + I->eraseFromParent(); + } +} + +PreservedAnalyses VectorPredicationPass::run(Function &F, + FunctionAnalysisManager &AM) { + assert(OldInstructionsToRemove.empty() && + "Map should be cleared at the end of each run of the pass."); + + for (BasicBlock &BB : F) { + BlockData BBInfo; + + analyseBasicBlock(BB, BBInfo); + findCandidateVectorOperations(BB, BBInfo); + addNewUsersToMasksAndEVLs(BB, BBInfo); + buildNewBasicBlockSchedule(BB, BBInfo); + emitNewBasicBlockSchedule(BB, BBInfo); + transformCandidateVectorOperations(BB, BBInfo); + } + + removeOldInstructions(); + OldInstructionsToRemove.clear(); + + // TODO: think about which analyses are preserved.
+ return PreservedAnalyses::none(); +} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vp_intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vp_intrinsics.ll new file mode 100644 index 0000000000000000000000000000000000000000..ae636428f935698595b718e689cd34591d705df2 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vp_intrinsics.ll @@ -0,0 +1,134 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S --passes=loop-vectorize -use-vp-intrinsics -prefer-predicate-over-epilogue=predicate-dont-vectorize -o - < %s | FileCheck %s + +; ModuleID = 'custom/simple.c' +source_filename = "custom/simple.c" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-unknown" + +; Input C code: +; void addVec(long N, double *C, double *A, double *B) { +; long I; +; for (I = 0; I < N; I++) +; C[I] = A[I] + B[I]; +; } + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) +define dso_local void @addVec(i64 noundef %N, ptr nocapture noundef writeonly %C, ptr nocapture noundef readonly %A, ptr nocapture noundef readonly %B) local_unnamed_addr #0 { +; CHECK-LABEL: @addVec( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64 +; CHECK-NEXT: [[A2:%.*]] = ptrtoint ptr [[A:%.*]] to i64 +; CHECK-NEXT: [[C1:%.*]] = ptrtoint ptr [[C:%.*]] to i64 +; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 10, i64 [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[C1]], [[A2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[C1]], [[B3]] +; CHECK-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL_PHI:%.*]] = phi i64 [ [[N]], [[VECTOR_PH]] ], [ [[EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL_PHI_CAST:%.*]] = trunc i64 [[EVL_PHI]] to i32 +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[EVL_PHI_CAST]] to i64 +; CHECK-NEXT: [[VL:%.*]] = call i64 @llvm.riscv.vsetvli.i64(i64 [[TMP9]], i64 3, i64 0) +; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[VL]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv1i64() +; CHECK-NEXT: [[TMP13:%.*]] = add zeroinitializer, [[TMP12]] +; CHECK-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = 
getelementptr inbounds double, ptr [[A]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, ptr [[TMP14]], i32 0 +; CHECK-NEXT: [[VP_LOAD:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]), !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, ptr [[TMP16]], i32 0 +; CHECK-NEXT: [[VP_LOAD5:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]), !tbaa [[TBAA4]] +; CHECK-NEXT: [[TMP18:%.*]] = fadd [[VP_LOAD]], [[VP_LOAD5]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, ptr [[TMP19]], i32 0 +; CHECK-NEXT: call void @llvm.vp.store.nxv1f64.p0( [[TMP18]], ptr [[TMP20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]), !tbaa [[TBAA4]] +; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP10]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP21]] +; CHECK-NEXT: [[EVL_NEXT]] = sub i64 [[N]], [[INDEX_NEXT]] +; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_08]] +; CHECK-NEXT: [[TMP23:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_08]] +; CHECK-NEXT: [[TMP24:%.*]] = load double, ptr [[ARRAYIDX1]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP23]], [[TMP24]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[I_08]] +; CHECK-NEXT: store double [[ADD]], ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %cmp7 = icmp sgt i64 %N, 0 + br i1 %cmp7, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.body + %I.08 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds double, ptr %A, i64 %I.08 + %0 = load double, ptr %arrayidx, align 8, !tbaa !4 + %arrayidx1 = getelementptr inbounds double, ptr %B, i64 %I.08 + %1 = load double, ptr %arrayidx1, align 8, !tbaa !4 + %add = fadd double %0, %1 + %arrayidx2 = getelementptr inbounds double, ptr %C, i64 %I.08 + store double %add, ptr %arrayidx2, align 8, !tbaa !4 + %inc = add nuw nsw i64 %I.08, 1 + %exitcond.not = icmp 
eq i64 %inc, %N + br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !8 + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} + +attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-zawrs,-experimental-zca,-experimental-zcb,-experimental-zcd,-experimental-zcf,-experimental-zihintntl,-experimental-ztso,-experimental-zvfh,-h,-save-restore,-svinval,-svnapot,-svpbmt,-xtheadba,-xtheadvdot,-xventanacondops,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zdinx,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zicbom,-zicbop,-zicboz,-zihintpause,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zmmul,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"target-abi", !"lp64d"} +!2 = !{i32 8, !"SmallDataLimit", i32 8} +!3 = !{!"clang version 17.0.0"} +!4 = !{!5, !5, i64 0} +!5 = !{!"double", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C/C++ TBAA"} +!8 = distinct !{!8, !9} +!9 = !{!"llvm.loop.mustprogress"} diff --git a/llvm/test/Transforms/VectorPredication/if-elif-else.ll b/llvm/test/Transforms/VectorPredication/if-elif-else.ll new file mode 100644 index 0000000000000000000000000000000000000000..761d3bfe9d0bf0763715a427b092bdfc5bb56d30 --- /dev/null +++ b/llvm/test/Transforms/VectorPredication/if-elif-else.ll @@ -0,0 +1,270 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S --passes=vector-predication -o - < %s | FileCheck %s + +; ModuleID = 'custom/if-elif-else.c' +source_filename = "custom/if-elif-else.c" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-unknown" + +; Input C code: +; void addVec(long N, double *C, double *A, double *B) { +; long I; +; for (I = 0; I < N; I++) { +; if (N < 50) +; C[I] = A[I] + B[I]; +; else if (N > 75) +; C[I] = A[I] * B[I]; +; else +; C[I] = 2 * A[I]; +; } +; } + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) +define dso_local void @addVec(i64 noundef %N, ptr nocapture noundef writeonly %C, ptr nocapture noundef readonly %A, ptr nocapture noundef readonly %B) local_unnamed_addr #0 { +; CHECK-LABEL: @addVec( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP30:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP30]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i64 [[N]], 50 +; CHECK-NEXT: [[CMP4:%.*]] = icmp ugt i64 [[N]], 75 +; CHECK-NEXT: [[TMP0:%.*]] = xor i64 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 10) +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[N]], 3 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[UGLYGEP32:%.*]] = getelementptr i8, ptr 
[[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[UGLYGEP33:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[UGLYGEP32]], [[C]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP]], [[A]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND034:%.*]] = icmp ugt ptr [[UGLYGEP33]], [[C]] +; CHECK-NEXT: [[BOUND135:%.*]] = icmp ugt ptr [[UGLYGEP]], [[B]] +; CHECK-NEXT: [[FOUND_CONFLICT36:%.*]] = and i1 [[BOUND034]], [[BOUND135]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT36]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT37:%.*]] = insertelement poison, i1 [[CMP1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT38:%.*]] = shufflevector [[BROADCAST_SPLATINSERT37]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT39:%.*]] = insertelement poison, i1 [[CMP4]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT40:%.*]] = shufflevector [[BROADCAST_SPLATINSERT39]], poison, zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = xor [[BROADCAST_SPLAT38]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP6:%.*]] = select [[TMP5]], [[BROADCAST_SPLAT40]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = select [[BROADCAST_SPLAT38]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[BROADCAST_SPLAT40]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL_PHI:%.*]] = phi i64 [ [[N]], [[VECTOR_PH]] ], [ [[EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = and i64 [[EVL_PHI]], 4294967295 +; CHECK-NEXT: [[VL:%.*]] = call i64 @llvm.riscv.vsetvli.i64(i64 [[TMP8]], i64 3, i64 0) +; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[VL]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP10]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]), !tbaa [[TBAA4:![0-9]+]], !alias.scope !8 +; CHECK-NEXT: [[VP_OP1:%.*]] = call @llvm.vp.fmul.nxv1f64( [[VP_LOAD]], shufflevector ( insertelement ( poison, double 2.000000e+00, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD41:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP11]], [[TMP6]], i32 [[TMP9]]), !tbaa [[TBAA4]], !alias.scope !11 +; CHECK-NEXT: [[VP_OP3:%.*]] = call @llvm.vp.fmul.nxv1f64( [[VP_LOAD]], [[VP_LOAD41]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]) +; CHECK-NEXT: [[VP_LOAD42:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP11]], [[BROADCAST_SPLAT38]], i32 [[TMP9]]), !tbaa [[TBAA4]], !alias.scope !11 +; CHECK-NEXT: [[VP_OP2:%.*]] = call @llvm.vp.fadd.nxv1f64( [[VP_LOAD]], [[VP_LOAD42]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]) +; CHECK-NEXT: [[VP_OP:%.*]] = call @llvm.vp.select.nxv1f64( [[TMP7]], [[VP_OP2]], [[VP_OP1]], i32 [[TMP9]]) +; CHECK-NEXT: [[VP_OP4:%.*]] = call @llvm.vp.select.nxv1f64( [[TMP6]], [[VP_OP3]], [[VP_OP]], i32 [[TMP9]]) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr 
inbounds double, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.vp.store.nxv1f64.p0( [[VP_OP4]], ptr [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]), !tbaa [[TBAA4]], !alias.scope !13, !noalias !15 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VL]] +; CHECK-NEXT: [[EVL_NEXT]] = sub i64 [[N]], [[INDEX_NEXT]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP13]], label [[FOR_END_LOOPEXIT44:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK: for.body: +; CHECK-NEXT: [[I_031:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_031]] +; CHECK-NEXT: [[TMP14:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_031]] +; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP14]], [[TMP15]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: if.else: +; CHECK-NEXT: br i1 [[CMP4]], label [[IF_THEN5:%.*]], label [[IF_ELSE9:%.*]] +; CHECK: if.then5: +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_031]] +; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr [[ARRAYIDX7]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP14]], [[TMP16]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: if.else9: +; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP14]], 2.000000e+00 +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[ADD_SINK:%.*]] = phi double [ [[ADD]], [[IF_THEN]] ], [ [[MUL11]], [[IF_ELSE9]] ], [ [[MUL]], [[IF_THEN5]] ] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[I_031]] +; CHECK-NEXT: store double [[ADD_SINK]], ptr [[ARRAYIDX3]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_031]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end.loopexit44: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %cmp30 = icmp sgt i64 %N, 0 + br i1 %cmp30, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + %cmp1 = icmp ult i64 %N, 50 + %cmp4 = icmp ugt i64 %N, 75 + %0 = xor i64 %N, -1 + %1 = call i64 @llvm.vscale.i64() + %2 = call i64 @llvm.umax.i64(i64 %1, i64 10) + %3 = icmp ugt i64 %2, %0 + br i1 %3, label %for.body.preheader, label %vector.memcheck + +for.body.preheader: ; preds = %vector.memcheck, %for.body.lr.ph + br label %for.body + +vector.memcheck: ; preds = %for.body.lr.ph + %4 = shl i64 %N, 3 + %uglygep = getelementptr i8, ptr %C, i64 %4 + %uglygep32 = getelementptr i8, ptr %A, i64 %4 + %uglygep33 = getelementptr i8, ptr %B, i64 %4 + %bound0 = icmp ugt ptr %uglygep32, %C + %bound1 = icmp ugt ptr %uglygep, %A + %found.conflict = and i1 %bound0, %bound1 + %bound034 = icmp ugt ptr %uglygep33, %C + %bound135 = icmp ugt ptr %uglygep, %B + %found.conflict36 = and i1 %bound034, %bound135 + %conflict.rdx = or i1 %found.conflict, %found.conflict36 + br i1 %conflict.rdx, label %for.body.preheader, label %vector.ph 
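+; Note: the vector.ph block below splats the loop-invariant conditions %cmp1 (N < 50) and %cmp4 (N > 75) into scalable masks; %6 guards the else-if (A[I] * B[I]) lanes, while %7 picks between the A[I] + B[I] and 2 * A[I] results in vector.body.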
+ +vector.ph: ; preds = %vector.memcheck + %broadcast.splatinsert37 = insertelement poison, i1 %cmp1, i64 0 + %broadcast.splat38 = shufflevector %broadcast.splatinsert37, poison, zeroinitializer + %broadcast.splatinsert39 = insertelement poison, i1 %cmp4, i64 0 + %broadcast.splat40 = shufflevector %broadcast.splatinsert39, poison, zeroinitializer + %5 = xor %broadcast.splat38, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) + %6 = select %5, %broadcast.splat40, zeroinitializer + %7 = select %broadcast.splat38, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), %broadcast.splat40 + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %evl.phi = phi i64 [ %N, %vector.ph ], [ %evl.next, %vector.body ] + %8 = and i64 %evl.phi, 4294967295 + %vl = call i64 @llvm.riscv.vsetvli.i64(i64 %8, i64 3, i64 0) + %9 = trunc i64 %vl to i32 + %10 = getelementptr inbounds double, ptr %A, i64 %index + %vp.load = call @llvm.vp.load.nxv1f64.p0(ptr %10, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %9), !tbaa !4, !alias.scope !8 + %11 = fmul %vp.load, shufflevector ( insertelement ( poison, double 2.000000e+00, i64 0), poison, zeroinitializer) + %12 = getelementptr double, ptr %B, i64 %index + %vp.load41 = call @llvm.vp.load.nxv1f64.p0(ptr %12, %6, i32 %9), !tbaa !4, !alias.scope !11 + %13 = fmul %vp.load, %vp.load41 + %vp.load42 = call @llvm.vp.load.nxv1f64.p0(ptr %12, %broadcast.splat38, i32 %9), !tbaa !4, !alias.scope !11 + %14 = fadd %vp.load, %vp.load42 + %predphi = select %7, %14, %11 + %predphi43 = select %6, %13, %predphi + %15 = getelementptr inbounds double, ptr %C, i64 %index + call void @llvm.vp.store.nxv1f64.p0( %predphi43, ptr %15, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %9), !tbaa !4, !alias.scope !13, !noalias !15 + %index.next = add i64 %index, %vl + %evl.next = sub i64 %N, %index.next + %16 = icmp eq i64 %index.next, %N + br i1 %16, label %for.end.loopexit44, label %vector.body, !llvm.loop !16 + +for.body: ; preds = %for.body.preheader, %for.inc + %I.031 = phi i64 [ %inc, %for.inc ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds double, ptr %A, i64 %I.031 + %17 = load double, ptr %arrayidx, align 8, !tbaa !4 + br i1 %cmp1, label %if.then, label %if.else + +if.then: ; preds = %for.body + %arrayidx2 = getelementptr inbounds double, ptr %B, i64 %I.031 + %18 = load double, ptr %arrayidx2, align 8, !tbaa !4 + %add = fadd double %17, %18 + br label %for.inc + +if.else: ; preds = %for.body + br i1 %cmp4, label %if.then5, label %if.else9 + +if.then5: ; preds = %if.else + %arrayidx7 = getelementptr inbounds double, ptr %B, i64 %I.031 + %19 = load double, ptr %arrayidx7, align 8, !tbaa !4 + %mul = fmul double %17, %19 + br label %for.inc + +if.else9: ; preds = %if.else + %mul11 = fmul double %17, 2.000000e+00 + br label %for.inc + +for.inc: ; preds = %if.then, %if.else9, %if.then5 + %add.sink = phi double [ %add, %if.then ], [ %mul11, %if.else9 ], [ %mul, %if.then5 ] + %arrayidx3 = getelementptr inbounds double, ptr %C, i64 %I.031 + store double %add.sink, ptr %arrayidx3, align 8, !tbaa !4 + %inc = add nuw nsw i64 %I.031, 1 + %exitcond.not = icmp eq i64 %inc, %N + br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !20 + +for.end.loopexit: ; preds = %for.inc + br label %for.end + +for.end.loopexit44: ; preds = %vector.body + 
br label %for.end + +for.end: ; preds = %for.end.loopexit44, %for.end.loopexit, %entry + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare i64 @llvm.vscale.i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.umax.i64(i64, i64) #2 + +; Function Attrs: nounwind memory(none) +declare i64 @llvm.riscv.vsetvli.i64(i64, i64 immarg, i64 immarg) #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.experimental.stepvector.nxv1i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare @llvm.vp.load.nxv1f64.p0(ptr nocapture, , i32) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.vp.store.nxv1f64.p0(, ptr nocapture, , i32) #5 + +attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-zawrs,-experimental-zca,-experimental-zcb,-experimental-zcd,-experimental-zcf,-experimental-zihintntl,-experimental-ztso,-experimental-zvfh,-h,-save-restore,-svinval,-svnapot,-svpbmt,-xtheadba,-xtheadvdot,-xventanacondops,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zdinx,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zicbom,-zicbop,-zicboz,-zihintpause,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zmmul,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { nounwind memory(none) } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"target-abi", !"lp64d"} +!2 = !{i32 8, !"SmallDataLimit", i32 8} +!3 = !{!"clang version 17.0.0"} +!4 = !{!5, !5, i64 0} +!5 = !{!"double", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C/C++ TBAA"} +!8 = !{!9} +!9 = distinct !{!9, !10} +!10 = distinct !{!10, !"LVerDomain"} +!11 = !{!12} +!12 = distinct !{!12, !10} +!13 = !{!14} +!14 = distinct !{!14, !10} +!15 = !{!9, !12} +!16 = distinct !{!16, !17, !18, !19} +!17 = !{!"llvm.loop.mustprogress"} +!18 = !{!"llvm.loop.isvectorized", i32 1} +!19 = !{!"llvm.loop.unroll.runtime.disable"} +!20 = distinct !{!20, !17, !18} diff --git a/llvm/test/Transforms/VectorPredication/if-else_scalar-cond.ll b/llvm/test/Transforms/VectorPredication/if-else_scalar-cond.ll new file mode 100644 index 0000000000000000000000000000000000000000..ed8f28feeffc5d53d069ef125bc19f67b4651b1a --- /dev/null +++ b/llvm/test/Transforms/VectorPredication/if-else_scalar-cond.ll @@ -0,0 +1,209 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S --passes=vector-predication -o - < %s | FileCheck %s + +; ModuleID = 'custom/if-else2.c' +source_filename = "custom/if-else2.c" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-unknown" + +; Input C code: +; void addVec(long N, double *C, double *A, double 
*B) { +; long I; +; for (I = 0; I < N; I++) { +; if (N < 50) +; C[I] = A[I] + B[I]; +; else +; C[I] = A[I] * B[I]; +; } +; } + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) +define dso_local void @addVec(i64 noundef %N, ptr nocapture noundef writeonly %C, ptr nocapture noundef readonly %A, ptr nocapture noundef readonly %B) local_unnamed_addr #0 { +; CHECK-LABEL: @addVec( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[B22:%.*]] = ptrtoint ptr [[B:%.*]] to i64 +; CHECK-NEXT: [[A21:%.*]] = ptrtoint ptr [[A:%.*]] to i64 +; CHECK-NEXT: [[C20:%.*]] = ptrtoint ptr [[C:%.*]] to i64 +; CHECK-NEXT: [[CMP18:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP18]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i64 [[N]], 50 +; CHECK-NEXT: [[TMP0:%.*]] = xor i64 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 8) +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[C20]], [[A21]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[C20]], [[B22]] +; CHECK-NEXT: [[DIFF_CHECK23:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK23]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY_PREHEADER:%.*]] +; CHECK: vector.body.preheader: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[VECTOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[EVL_PHI:%.*]] = phi i64 [ [[EVL_NEXT:%.*]], [[VECTOR_BODY]] ], [ [[N]], [[VECTOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[TMP9:%.*]] = and i64 [[EVL_PHI]], 4294967295 +; CHECK-NEXT: [[VL:%.*]] = call i64 @llvm.riscv.vsetvli.i64(i64 [[TMP9]], i64 3, i64 0) +; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[VL]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]), !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD24:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]), !tbaa [[TBAA4]] +; CHECK-NEXT: [[VP_OP:%.*]] = call @llvm.vp.fadd.nxv1f64( [[VP_LOAD]], [[VP_LOAD24]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; CHECK-NEXT: [[VP_OP2:%.*]] = call @llvm.vp.fmul.nxv1f64( [[VP_LOAD]], [[VP_LOAD24]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; CHECK-NEXT: [[SELECT_COND_SPLAT_SPLATINSERT:%.*]] = insertelement poison, i1 [[CMP1]], i64 0 +; CHECK-NEXT: [[SELECT_COND_SPLAT_SPLAT:%.*]] = shufflevector 
[[SELECT_COND_SPLAT_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[VP_OP1:%.*]] = call @llvm.vp.select.nxv1f64( [[SELECT_COND_SPLAT_SPLAT]], [[VP_OP]], [[VP_OP2]], i32 [[TMP10]]) +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.vp.store.nxv1f64.p0( [[VP_OP1]], ptr [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VL]] +; CHECK-NEXT: [[EVL_NEXT]] = sub i64 [[N]], [[INDEX_NEXT]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP14]], label [[FOR_END_LOOPEXIT25:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: for.body: +; CHECK-NEXT: [[I_019:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_019]] +; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_019]] +; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[MUL_SINK:%.*]] = select i1 [[CMP1]], double [[ADD]], double [[MUL]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[I_019]] +; CHECK-NEXT: store double [[MUL_SINK]], ptr [[TMP17]], align 8 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_019]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end.loopexit25: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %B22 = ptrtoint ptr %B to i64 + %A21 = ptrtoint ptr %A to i64 + %C20 = ptrtoint ptr %C to i64 + %cmp18 = icmp sgt i64 %N, 0 + br i1 %cmp18, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + %cmp1 = icmp ult i64 %N, 50 + %0 = xor i64 %N, -1 + %1 = call i64 @llvm.vscale.i64() + %2 = call i64 @llvm.umax.i64(i64 %1, i64 8) + %3 = icmp ugt i64 %2, %0 + br i1 %3, label %for.body.preheader, label %vector.memcheck + +for.body.preheader: ; preds = %vector.memcheck, %for.body.lr.ph + br label %for.body + +vector.memcheck: ; preds = %for.body.lr.ph + %4 = call i64 @llvm.vscale.i64() + %5 = shl nuw nsw i64 %4, 3 + %6 = sub i64 %C20, %A21 + %diff.check = icmp ult i64 %6, %5 + %7 = shl nuw nsw i64 %4, 3 + %8 = sub i64 %C20, %B22 + %diff.check23 = icmp ult i64 %8, %7 + %conflict.rdx = or i1 %diff.check, %diff.check23 + br i1 %conflict.rdx, label %for.body.preheader, label %vector.body.preheader + +vector.body.preheader: ; preds = %vector.memcheck + br label %vector.body + +vector.body: ; preds = %vector.body.preheader, %vector.body + %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.body.preheader ] + %evl.phi = phi i64 [ %evl.next, %vector.body ], [ %N, %vector.body.preheader ] + %9 = and i64 %evl.phi, 4294967295 + %vl = call i64 @llvm.riscv.vsetvli.i64(i64 %9, i64 3, i64 0) + %10 = trunc i64 %vl to i32 + %11 = getelementptr inbounds double, ptr %A, i64 %index + %vp.load = call @llvm.vp.load.nxv1f64.p0(ptr %11, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, 
zeroinitializer), i32 %10), !tbaa !4 + %12 = getelementptr inbounds double, ptr %B, i64 %index + %vp.load24 = call @llvm.vp.load.nxv1f64.p0(ptr %12, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %10), !tbaa !4 + %13 = fadd %vp.load, %vp.load24 + %14 = fmul %vp.load, %vp.load24 + %15 = select i1 %cmp1, %13, %14 + %16 = getelementptr inbounds double, ptr %C, i64 %index + call void @llvm.vp.store.nxv1f64.p0( %15, ptr %16, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %10) + %index.next = add i64 %index, %vl + %evl.next = sub i64 %N, %index.next + %17 = icmp eq i64 %index.next, %N + br i1 %17, label %for.end.loopexit25, label %vector.body, !llvm.loop !8 + +for.body: ; preds = %for.body.preheader, %for.body + %I.019 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds double, ptr %A, i64 %I.019 + %18 = load double, ptr %arrayidx, align 8, !tbaa !4 + %arrayidx2 = getelementptr inbounds double, ptr %B, i64 %I.019 + %19 = load double, ptr %arrayidx2, align 8, !tbaa !4 + %add = fadd double %18, %19 + %mul = fmul double %18, %19 + %mul.sink = select i1 %cmp1, double %add, double %mul + %20 = getelementptr inbounds double, ptr %C, i64 %I.019 + store double %mul.sink, ptr %20, align 8 + %inc = add nuw nsw i64 %I.019, 1 + %exitcond.not = icmp eq i64 %inc, %N + br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !12 + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end.loopexit25: ; preds = %vector.body + br label %for.end + +for.end: ; preds = %for.end.loopexit25, %for.end.loopexit, %entry + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare i64 @llvm.vscale.i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.umax.i64(i64, i64) #2 + +; Function Attrs: nounwind memory(none) +declare i64 @llvm.riscv.vsetvli.i64(i64, i64 immarg, i64 immarg) #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.experimental.stepvector.nxv1i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare @llvm.vp.load.nxv1f64.p0(ptr nocapture, , i32) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.vp.store.nxv1f64.p0(, ptr nocapture, , i32) #5 + +attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-zawrs,-experimental-zca,-experimental-zcb,-experimental-zcd,-experimental-zcf,-experimental-zihintntl,-experimental-ztso,-experimental-zvfh,-h,-save-restore,-svinval,-svnapot,-svpbmt,-xtheadba,-xtheadvdot,-xventanacondops,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zdinx,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zicbom,-zicbop,-zicboz,-zihintpause,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zmmul,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { nounwind memory(none) } +attributes #4 = { nocallback nofree nosync 
nounwind willreturn memory(argmem: read) } +attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"target-abi", !"lp64d"} +!2 = !{i32 8, !"SmallDataLimit", i32 8} +!3 = !{!"clang version 17.0.0"} +!4 = !{!5, !5, i64 0} +!5 = !{!"double", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C/C++ TBAA"} +!8 = distinct !{!8, !9, !10, !11} +!9 = !{!"llvm.loop.mustprogress"} +!10 = !{!"llvm.loop.isvectorized", i32 1} +!11 = !{!"llvm.loop.unroll.runtime.disable"} +!12 = distinct !{!12, !9, !10} diff --git a/llvm/test/Transforms/VectorPredication/if-else_vec-cond.ll b/llvm/test/Transforms/VectorPredication/if-else_vec-cond.ll new file mode 100644 index 0000000000000000000000000000000000000000..9c25aec38fdb86e7b389b4783af81af1c66437ab --- /dev/null +++ b/llvm/test/Transforms/VectorPredication/if-else_vec-cond.ll @@ -0,0 +1,219 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S --passes=vector-predication -o - < %s | FileCheck %s + +; ModuleID = 'custom/if-else1.c' +source_filename = "custom/if-else1.c" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-unknown" + +; Input C code: +; void addVec(long N, double *C, double *A, double *B) { +; long I; +; for (I = 0; I < N; I++) { +; if (I < 50) +; C[I] = A[I] + B[I]; +; else +; C[I] = A[I] * B[I]; +; } +; } + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) +define dso_local void @addVec(i64 noundef %N, ptr nocapture noundef writeonly %C, ptr nocapture noundef readonly %A, ptr nocapture noundef readonly %B) local_unnamed_addr #0 { +; CHECK-LABEL: @addVec( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[B22:%.*]] = ptrtoint ptr [[B:%.*]] to i64 +; CHECK-NEXT: [[A21:%.*]] = ptrtoint ptr [[A:%.*]] to i64 +; CHECK-NEXT: [[C20:%.*]] = ptrtoint ptr [[C:%.*]] to i64 +; CHECK-NEXT: [[CMP18:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP18]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = xor i64 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 8) +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY_PREHEADER25:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: for.body.preheader25: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[C20]], [[A21]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[C20]], [[B22]] +; CHECK-NEXT: [[DIFF_CHECK23:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK23]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER25]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.experimental.stepvector.nxv1i64() +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL_PHI:%.*]] = phi i64 [ [[N]], 
[[VECTOR_PH]] ], [ [[EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = and i64 [[EVL_PHI]], 4294967295 +; CHECK-NEXT: [[VL:%.*]] = call i64 @llvm.riscv.vsetvli.i64(i64 [[TMP10]], i64 3, i64 0) +; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[VL]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[VL]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[VP_OP1:%.*]] = call @llvm.vp.icmp.nxv1i64( [[VEC_IND]], shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer), metadata !"ult", shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]), !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD24:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]), !tbaa [[TBAA4]] +; CHECK-NEXT: [[VP_OP:%.*]] = call @llvm.vp.fadd.nxv1f64( [[VP_LOAD]], [[VP_LOAD24]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; CHECK-NEXT: [[VP_OP3:%.*]] = call @llvm.vp.fmul.nxv1f64( [[VP_LOAD]], [[VP_LOAD24]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; CHECK-NEXT: [[VP_OP2:%.*]] = call @llvm.vp.select.nxv1f64( [[VP_OP1]], [[VP_OP]], [[VP_OP3]], i32 [[TMP11]]) +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.vp.store.nxv1f64.p0( [[VP_OP2]], ptr [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VL]] +; CHECK-NEXT: [[EVL_NEXT]] = sub i64 [[N]], [[INDEX_NEXT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP15]], label [[FOR_END_LOOPEXIT26:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: for.body: +; CHECK-NEXT: [[I_019:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER25]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i64 [[I_019]], 50 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_019]] +; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_019]] +; CHECK-NEXT: [[TMP17:%.*]] = load double, ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[MUL_SINK:%.*]] = select i1 [[CMP1]], double [[ADD]], double [[MUL]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[I_019]] +; CHECK-NEXT: store double [[MUL_SINK]], ptr [[TMP18]], align 8 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_019]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label 
[[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end.loopexit26: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %B22 = ptrtoint ptr %B to i64 + %A21 = ptrtoint ptr %A to i64 + %C20 = ptrtoint ptr %C to i64 + %cmp18 = icmp sgt i64 %N, 0 + br i1 %cmp18, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %0 = xor i64 %N, -1 + %1 = call i64 @llvm.vscale.i64() + %2 = call i64 @llvm.umax.i64(i64 %1, i64 8) + %3 = icmp ugt i64 %2, %0 + br i1 %3, label %for.body.preheader25, label %vector.memcheck + +for.body.preheader25: ; preds = %vector.memcheck, %for.body.preheader + br label %for.body + +vector.memcheck: ; preds = %for.body.preheader + %4 = call i64 @llvm.vscale.i64() + %5 = shl nuw nsw i64 %4, 3 + %6 = sub i64 %C20, %A21 + %diff.check = icmp ult i64 %6, %5 + %7 = shl nuw nsw i64 %4, 3 + %8 = sub i64 %C20, %B22 + %diff.check23 = icmp ult i64 %8, %7 + %conflict.rdx = or i1 %diff.check, %diff.check23 + br i1 %conflict.rdx, label %for.body.preheader25, label %vector.ph + +vector.ph: ; preds = %vector.memcheck + %9 = call @llvm.experimental.stepvector.nxv1i64() + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %evl.phi = phi i64 [ %N, %vector.ph ], [ %evl.next, %vector.body ] + %vec.ind = phi [ %9, %vector.ph ], [ %vec.ind.next, %vector.body ] + %10 = and i64 %evl.phi, 4294967295 + %vl = call i64 @llvm.riscv.vsetvli.i64(i64 %10, i64 3, i64 0) + %11 = trunc i64 %vl to i32 + %.splatinsert = insertelement poison, i64 %vl, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %12 = icmp ult %vec.ind, shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer) + %13 = getelementptr inbounds double, ptr %A, i64 %index + %vp.load = call @llvm.vp.load.nxv1f64.p0(ptr %13, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %11), !tbaa !4 + %14 = getelementptr inbounds double, ptr %B, i64 %index + %vp.load24 = call @llvm.vp.load.nxv1f64.p0(ptr %14, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %11), !tbaa !4 + %15 = fadd %vp.load, %vp.load24 + %16 = fmul %vp.load, %vp.load24 + %17 = select %12, %15, %16 + %18 = getelementptr inbounds double, ptr %C, i64 %index + call void @llvm.vp.store.nxv1f64.p0( %17, ptr %18, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %11) + %index.next = add i64 %index, %vl + %evl.next = sub i64 %N, %index.next + %vec.ind.next = add %vec.ind, %.splat + %19 = icmp eq i64 %index.next, %N + br i1 %19, label %for.end.loopexit26, label %vector.body, !llvm.loop !8 + +for.body: ; preds = %for.body.preheader25, %for.body + %I.019 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader25 ] + %cmp1 = icmp ult i64 %I.019, 50 + %arrayidx = getelementptr inbounds double, ptr %A, i64 %I.019 + %20 = load double, ptr %arrayidx, align 8, !tbaa !4 + %arrayidx2 = getelementptr inbounds double, ptr %B, i64 %I.019 + %21 = load double, ptr %arrayidx2, align 8, !tbaa !4 + %add = fadd double %20, %21 + %mul = fmul double %20, %21 + %mul.sink = select i1 %cmp1, double %add, double %mul + %22 = getelementptr inbounds double, ptr %C, i64 %I.019 + store double %mul.sink, ptr %22, align 8 + %inc = add nuw nsw i64 %I.019, 1 + %exitcond.not = icmp eq i64 %inc, %N + br i1 %exitcond.not, 
label %for.end.loopexit, label %for.body, !llvm.loop !12 + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end.loopexit26: ; preds = %vector.body + br label %for.end + +for.end: ; preds = %for.end.loopexit26, %for.end.loopexit, %entry + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare i64 @llvm.vscale.i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.umax.i64(i64, i64) #2 + +; Function Attrs: nounwind memory(none) +declare i64 @llvm.riscv.vsetvli.i64(i64, i64 immarg, i64 immarg) #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.experimental.stepvector.nxv1i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare @llvm.vp.load.nxv1f64.p0(ptr nocapture, , i32) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.vp.store.nxv1f64.p0(, ptr nocapture, , i32) #5 + +attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-zawrs,-experimental-zca,-experimental-zcb,-experimental-zcd,-experimental-zcf,-experimental-zihintntl,-experimental-ztso,-experimental-zvfh,-h,-save-restore,-svinval,-svnapot,-svpbmt,-xtheadba,-xtheadvdot,-xventanacondops,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zdinx,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zicbom,-zicbop,-zicboz,-zihintpause,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zmmul,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { nounwind memory(none) } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"target-abi", !"lp64d"} +!2 = !{i32 8, !"SmallDataLimit", i32 8} +!3 = !{!"clang version 17.0.0"} +!4 = !{!5, !5, i64 0} +!5 = !{!"double", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C/C++ TBAA"} +!8 = distinct !{!8, !9, !10, !11} +!9 = !{!"llvm.loop.mustprogress"} +!10 = !{!"llvm.loop.isvectorized", i32 1} +!11 = !{!"llvm.loop.unroll.runtime.disable"} +!12 = distinct !{!12, !9, !10} diff --git a/llvm/test/Transforms/VectorPredication/simple_vector_sum.ll b/llvm/test/Transforms/VectorPredication/simple_vector_sum.ll new file mode 100644 index 0000000000000000000000000000000000000000..116d883572eeb2f08b1956a7f9f80e34e61c3e60 --- /dev/null +++ b/llvm/test/Transforms/VectorPredication/simple_vector_sum.ll @@ -0,0 +1,193 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S --passes=vector-predication -o - < %s | FileCheck %s + +; ModuleID = 'custom/simple.c' +source_filename = "custom/simple.c" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-unknown" + +; Input C code: +; void addVec(long N, double *C, double *A, double *B) { +; long I; +; for (I 
= 0; I < N; I++) +; C[I] = A[I] + B[I]; +; } + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) +define dso_local void @addVec(i64 noundef %N, ptr nocapture noundef writeonly %C, ptr nocapture noundef readonly %A, ptr nocapture noundef readonly %B) local_unnamed_addr #0 { +; CHECK-LABEL: @addVec( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[B11:%.*]] = ptrtoint ptr [[B:%.*]] to i64 +; CHECK-NEXT: [[A10:%.*]] = ptrtoint ptr [[A:%.*]] to i64 +; CHECK-NEXT: [[C9:%.*]] = ptrtoint ptr [[C:%.*]] to i64 +; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = xor i64 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 10) +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY_PREHEADER14:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: for.body.preheader14: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[C9]], [[A10]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[C9]], [[B11]] +; CHECK-NEXT: [[DIFF_CHECK12:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK12]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER14]], label [[VECTOR_BODY_PREHEADER:%.*]] +; CHECK: vector.body.preheader: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[VECTOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[EVL_PHI:%.*]] = phi i64 [ [[EVL_NEXT:%.*]], [[VECTOR_BODY]] ], [ [[N]], [[VECTOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[TMP9:%.*]] = and i64 [[EVL_PHI]], 4294967295 +; CHECK-NEXT: [[VL:%.*]] = call i64 @llvm.riscv.vsetvli.i64(i64 [[TMP9]], i64 3, i64 0) +; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[VL]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]), !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD13:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]), !tbaa [[TBAA4]] +; CHECK-NEXT: [[VP_OP:%.*]] = call @llvm.vp.fadd.nxv1f64( [[VP_LOAD]], [[VP_LOAD13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.vp.store.nxv1f64.p0( [[VP_OP]], ptr [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]), !tbaa [[TBAA4]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VL]] +; CHECK-NEXT: [[EVL_NEXT]] = sub i64 [[N]], [[INDEX_NEXT]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP14]], 
label [[FOR_END_LOOPEXIT15:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER14]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_08]] +; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_08]] +; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr [[ARRAYIDX1]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[I_08]] +; CHECK-NEXT: store double [[ADD]], ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end.loopexit15: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %B11 = ptrtoint ptr %B to i64 + %A10 = ptrtoint ptr %A to i64 + %C9 = ptrtoint ptr %C to i64 + %cmp7 = icmp sgt i64 %N, 0 + br i1 %cmp7, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %0 = xor i64 %N, -1 + %1 = call i64 @llvm.vscale.i64() + %2 = call i64 @llvm.umax.i64(i64 %1, i64 10) + %3 = icmp ugt i64 %2, %0 + br i1 %3, label %for.body.preheader14, label %vector.memcheck + +for.body.preheader14: ; preds = %vector.memcheck, %for.body.preheader + br label %for.body + +vector.memcheck: ; preds = %for.body.preheader + %4 = call i64 @llvm.vscale.i64() + %5 = shl nuw nsw i64 %4, 3 + %6 = sub i64 %C9, %A10 + %diff.check = icmp ult i64 %6, %5 + %7 = shl nuw nsw i64 %4, 3 + %8 = sub i64 %C9, %B11 + %diff.check12 = icmp ult i64 %8, %7 + %conflict.rdx = or i1 %diff.check, %diff.check12 + br i1 %conflict.rdx, label %for.body.preheader14, label %vector.body.preheader + +vector.body.preheader: ; preds = %vector.memcheck + br label %vector.body + +vector.body: ; preds = %vector.body.preheader, %vector.body + %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.body.preheader ] + %evl.phi = phi i64 [ %evl.next, %vector.body ], [ %N, %vector.body.preheader ] + %9 = and i64 %evl.phi, 4294967295 + %vl = call i64 @llvm.riscv.vsetvli.i64(i64 %9, i64 3, i64 0) + %10 = trunc i64 %vl to i32 + %11 = getelementptr inbounds double, ptr %A, i64 %index + %vp.load = call @llvm.vp.load.nxv1f64.p0(ptr %11, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %10), !tbaa !4 + %12 = getelementptr inbounds double, ptr %B, i64 %index + %vp.load13 = call @llvm.vp.load.nxv1f64.p0(ptr %12, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %10), !tbaa !4 + %13 = fadd %vp.load, %vp.load13 + %14 = getelementptr inbounds double, ptr %C, i64 %index + call void @llvm.vp.store.nxv1f64.p0( %13, ptr %14, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %10), !tbaa !4 + %index.next = add i64 %index, %vl + %evl.next = sub i64 %N, %index.next + %15 = icmp eq i64 %index.next, %N + br i1 %15, label %for.end.loopexit15, label %vector.body, !llvm.loop !8 + +for.body: ; preds = %for.body.preheader14, %for.body + %I.08 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader14 ] + %arrayidx = 
getelementptr inbounds double, ptr %A, i64 %I.08 + %16 = load double, ptr %arrayidx, align 8, !tbaa !4 + %arrayidx1 = getelementptr inbounds double, ptr %B, i64 %I.08 + %17 = load double, ptr %arrayidx1, align 8, !tbaa !4 + %add = fadd double %16, %17 + %arrayidx2 = getelementptr inbounds double, ptr %C, i64 %I.08 + store double %add, ptr %arrayidx2, align 8, !tbaa !4 + %inc = add nuw nsw i64 %I.08, 1 + %exitcond.not = icmp eq i64 %inc, %N + br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !12 + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end.loopexit15: ; preds = %vector.body + br label %for.end + +for.end: ; preds = %for.end.loopexit15, %for.end.loopexit, %entry + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare i64 @llvm.vscale.i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.umax.i64(i64, i64) #2 + +; Function Attrs: nounwind memory(none) +declare i64 @llvm.riscv.vsetvli.i64(i64, i64 immarg, i64 immarg) #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.experimental.stepvector.nxv1i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare @llvm.vp.load.nxv1f64.p0(ptr nocapture, , i32) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.vp.store.nxv1f64.p0(, ptr nocapture, , i32) #5 + +attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-zawrs,-experimental-zca,-experimental-zcb,-experimental-zcd,-experimental-zcf,-experimental-zihintntl,-experimental-ztso,-experimental-zvfh,-h,-save-restore,-svinval,-svnapot,-svpbmt,-xtheadba,-xtheadvdot,-xventanacondops,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zdinx,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zicbom,-zicbop,-zicboz,-zihintpause,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zmmul,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { nounwind memory(none) } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"target-abi", !"lp64d"} +!2 = !{i32 8, !"SmallDataLimit", i32 8} +!3 = !{!"clang version 17.0.0"} +!4 = !{!5, !5, i64 0} +!5 = !{!"double", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C/C++ TBAA"} +!8 = distinct !{!8, !9, !10, !11} +!9 = !{!"llvm.loop.mustprogress"} +!10 = !{!"llvm.loop.isvectorized", i32 1} +!11 = !{!"llvm.loop.unroll.runtime.disable"} +!12 = distinct !{!12, !9, !10} diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index ff7ee53bfbcf037c3135c1aaf8a921179938853e..14462f0ef6b24e99bed9254569fe8e4a66c36341 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ 
b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -1003,7 +1003,8 @@ TEST(VPRecipeTest, CastVPWidenMemoryInstructionRecipeToVPUserAndVPDef) { new LoadInst(Int32, UndefValue::get(Int32Ptr), "", false, Align(1)); VPValue Addr; VPValue Mask; - VPWidenMemoryInstructionRecipe Recipe(*Load, &Addr, &Mask, true, false); + VPWidenMemoryInstructionRecipe Recipe(*Load, &Addr, &Mask, nullptr, true, + false); EXPECT_TRUE(isa(&Recipe)); VPRecipeBase *BaseR = &Recipe; EXPECT_TRUE(isa(BaseR)); @@ -1099,7 +1100,8 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { new LoadInst(Int32, UndefValue::get(Int32Ptr), "", false, Align(1)); VPValue Addr; VPValue Mask; - VPWidenMemoryInstructionRecipe Recipe(*Load, &Addr, &Mask, true, false); + VPWidenMemoryInstructionRecipe Recipe(*Load, &Addr, &Mask, nullptr, true, + false); EXPECT_FALSE(Recipe.mayHaveSideEffects()); EXPECT_TRUE(Recipe.mayReadFromMemory()); EXPECT_FALSE(Recipe.mayWriteToMemory()); @@ -1113,8 +1115,8 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { VPValue Addr; VPValue Mask; VPValue StoredV; - VPWidenMemoryInstructionRecipe Recipe(*Store, &Addr, &StoredV, &Mask, false, - false); + VPWidenMemoryInstructionRecipe Recipe(*Store, &Addr, &StoredV, &Mask, + nullptr, false, false); EXPECT_TRUE(Recipe.mayHaveSideEffects()); EXPECT_FALSE(Recipe.mayReadFromMemory()); EXPECT_TRUE(Recipe.mayWriteToMemory());