From 9f452a8ad6a6941725e3d852af23223d8ee07adf Mon Sep 17 00:00:00 2001 From: Lorenzo Albano Date: Wed, 15 Feb 2023 10:25:01 +0000 Subject: [PATCH 1/2] Enable the use of Vector Predication intrinsics in the loop vectorizer. Add new VP Recipes for the Explicit Vector Length (EVL) and add support for VP memory intrinsics (vp.load, vp.store, vp.gather, vp.scatter). --- .../llvm/Analysis/TargetTransformInfo.h | 11 + .../llvm/Analysis/TargetTransformInfoImpl.h | 16 ++ llvm/include/llvm/CodeGen/BasicTTIImpl.h | 16 ++ llvm/lib/Analysis/TargetTransformInfo.cpp | 6 + .../Target/RISCV/RISCVTargetTransformInfo.cpp | 27 +++ .../Target/RISCV/RISCVTargetTransformInfo.h | 3 + .../Transforms/Vectorize/LoopVectorize.cpp | 197 ++++++++++++++---- llvm/lib/Transforms/Vectorize/VPlan.cpp | 17 ++ llvm/lib/Transforms/Vectorize/VPlan.h | 107 ++++++++-- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 74 ++++++- .../Transforms/Vectorize/VPlanTransforms.cpp | 9 +- llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 + .../LoopVectorize/RISCV/vp_intrinsics.ll | 134 ++++++++++++ .../Transforms/Vectorize/VPlanTest.cpp | 10 +- 14 files changed, 548 insertions(+), 80 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/vp_intrinsics.ll diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 68889bb78233..6ed5ace2ae7d 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -23,6 +23,7 @@ #include "llvm/ADT/SmallBitVector.h" #include "llvm/IR/FMF.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" @@ -1570,6 +1571,9 @@ public: VPLegalization getVPLegalizationStrategy(const VPIntrinsic &PI) const; /// @} + Value *computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const; + /// @} private: @@ -1927,6 +1931,8 @@ public: Align Alignment) const = 0; virtual VPLegalization getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0; + virtual Value *computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const = 0; }; template @@ -2606,6 +2612,11 @@ public: getVPLegalizationStrategy(const VPIntrinsic &PI) const override { return Impl.getVPLegalizationStrategy(PI); } + + Value *computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const override { + return Impl.computeVectorLength(Builder, AVL, VF); + } }; template diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 480be9f723f2..91f2ea473a8a 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -19,6 +19,7 @@ #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Operator.h" #include "llvm/IR/PatternMatch.h" @@ -862,6 +863,21 @@ public: /* OperatorStrategy */ TargetTransformInfo::VPLegalization::Convert); } + Value *computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const { + if (!VF.isScalable()) { + return ConstantInt::get(Builder.getInt32Ty(), VF.getFixedValue()); + } + + Constant *EC = + ConstantInt::get(Builder.getInt32Ty(), VF.getKnownMinValue()); + Value *VLMax = Builder.CreateVScale(EC, "vlmax"); + Value *VL = Builder.CreateZExtOrTrunc(AVL, Builder.getInt32Ty(), "vl"); + + return 
Builder.CreateIntrinsic(Builder.getInt32Ty(), Intrinsic::umin, + {VLMax, VL}, nullptr, "evl"); + } + protected: // Obtain the minimum required size to hold the value (without the sign) // In case of a vector it returns the min required size for one element. diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index ce1caafb92fb..243d01e12f14 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -34,6 +34,7 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" @@ -2442,6 +2443,21 @@ public: InstructionCost getVectorSplitCost() { return 1; } + Value *computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const { + if (!VF.isScalable()) { + return ConstantInt::get(Builder.getInt32Ty(), VF.getFixedValue()); + } + + Constant *EC = + ConstantInt::get(Builder.getInt32Ty(), VF.getKnownMinValue()); + Value *VLMax = Builder.CreateVScale(EC, "vlmax"); + Value *VL = Builder.CreateZExtOrTrunc(AVL, Builder.getInt32Ty(), "vl"); + + return Builder.CreateIntrinsic(Builder.getInt32Ty(), Intrinsic::umin, + {VLMax, VL}, nullptr, "evl"); + } + /// @} }; diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index e9c01e68fde2..6928f2bc0b12 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1195,6 +1195,12 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType, return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment); } +Value *TargetTransformInfo::computeVectorLength(IRBuilderBase &Builder, + Value *AVL, + ElementCount VF) const { + return TTIImpl->computeVectorLength(Builder, AVL, VF); +} + TargetTransformInfo::Concept::~Concept() = default; TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {} diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index fcc88d6d4682..222bc663c62e 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -12,6 +12,7 @@ #include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/CodeGen/CostTable.h" #include "llvm/CodeGen/TargetLowering.h" +#include "llvm/IR/IntrinsicsRISCV.h" #include #include using namespace llvm; @@ -1484,3 +1485,29 @@ bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1, C2.NumIVMuls, C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost); } + +Value *RISCVTTIImpl::computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const { + if (!VF.isScalable()) { + return ConstantInt::get(Builder.getInt32Ty(), VF.getFixedValue()); + } + + const unsigned SEW = 3; // SEW = 64, TODO: we should use ELEN here. 
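+  // The riscv.vsetvli intrinsic takes its SEW and LMUL operands in the vtype
+  // encoding: SEW = 3 selects 64-bit elements, and LMULArgMap below maps the
+  // requested register-group size (m1/m2/m4/m8) to the encoded values 0-3.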
+ const std::map LMULArgMap = { + {1, 0}, {2, 1}, {4, 2}, {8, 3}}; + + assert(AVL->getType()->isIntegerTy() && + "Requested vector length should be an integer."); + assert(LMULArgMap.find(VF.getKnownMinValue()) != LMULArgMap.end() && + "Invalid value for LMUL argument."); + Value *AVLArg = Builder.CreateZExtOrTrunc(AVL, Builder.getInt64Ty()); + Constant *SEWArg = ConstantInt::get(Builder.getInt64Ty(), SEW); + Constant *LMULArg = ConstantInt::get(Builder.getInt64Ty(), + LMULArgMap.at(VF.getKnownMinValue())); + Value *EVLRes = + Builder.CreateIntrinsic(Intrinsic::riscv_vsetvli, {AVLArg->getType()}, + {AVLArg, SEWArg, LMULArg}, nullptr, "vl"); + + // NOTE: evl type is required to be i32. + return Builder.CreateZExtOrTrunc(EVLRes, Builder.getInt32Ty()); +} diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 143079c470fb..3904b6913170 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -327,6 +327,9 @@ public: bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2); + + Value *computeVectorLength(IRBuilderBase &Builder, Value *AVL, + ElementCount VF) const; }; } // end namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 92fb82eea714..cc723e712928 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -370,6 +370,10 @@ static cl::opt ForceSafeDivisor( cl::desc( "Override cost based safe divisor widening for div/rem instructions")); +cl::opt UseVectorPredicationIntrinsics( + "use-vp-intrinsics", cl::init(false), cl::Hidden, + cl::desc("Use Vector Predication intrinsics during vectorization.")); + /// A helper function that returns true if the given type is irregular. The /// type is irregular if its allocated size doesn't equal the store size of an /// element of the corresponding vector type. @@ -2890,6 +2894,11 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { if (VectorTripCount) return VectorTripCount; + // With VP intrinsics, we require tail-folding by masking; this way, we + // operate on a number of elements equal to the original loop trip count. + if (UseVectorPredicationIntrinsics) + return VectorTripCount = getOrCreateTripCount(InsertBlock); + Value *TC = getOrCreateTripCount(InsertBlock); IRBuilder<> Builder(InsertBlock->getTerminator()); @@ -2926,6 +2935,7 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { // the step does not evenly divide the trip count, no adjustment is necessary // since there will already be scalar iterations. Note that the minimum // iterations check ensures that N >= Step. + // TODO: we should probably honor the cost model also with VP intrinsics. 
if (Cost->requiresScalarEpilogue(VF)) { auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); R = Builder.CreateSelect(IsZero, Step, R); @@ -8189,12 +8199,13 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, Reverse || Decision == LoopVectorizationCostModel::CM_Widen; if (LoadInst *Load = dyn_cast(I)) - return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask, - Consecutive, Reverse); + return new VPWidenMemoryInstructionRecipe( + *Load, Operands[0], Mask, Plan->getEVLPhi(), Consecutive, Reverse); StoreInst *Store = cast(I); return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0], - Mask, Consecutive, Reverse); + Mask, Plan->getEVLPhi(), + Consecutive, Reverse); } /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also @@ -8224,11 +8235,12 @@ static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes( vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE); if (auto *TruncI = dyn_cast(PhiOrTrunc)) { return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI, - !NeedsScalarIVOnly); + !NeedsScalarIVOnly, + Plan.getEVLPhi()); } assert(isa(PhiOrTrunc) && "must be a phi node here"); - return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, - !NeedsScalarIVOnly); + return new VPWidenIntOrFpInductionRecipe( + Phi, Start, Step, IndDesc, !NeedsScalarIVOnly, Plan.getEVLPhi()); } VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI( @@ -8698,28 +8710,51 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, // Add the necessary canonical IV and branch recipes required to control the // loop. static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, - TailFoldingStyle Style) { - Value *StartIdx = ConstantInt::get(IdxTy, 0); - auto *StartV = Plan.getOrAddVPValue(StartIdx); + TailFoldingStyle Style, + const TargetTransformInfo *TTI) { + VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); + VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); + + // Add the EVL recipe, used to calculate the correct IV increment. + VPEVLPHIRecipe *EVLRecipe = nullptr; + // TODO: TTI should be able to indicate if a target prefers vector predication + // intrinsics. + if (UseVectorPredicationIntrinsics) { + EVLRecipe = new VPEVLPHIRecipe(Plan.getOrCreateTripCount(), TTI); + Header->insert(EVLRecipe, Header->begin()); + } // Add a VPCanonicalIVPHIRecipe starting at 0 to the header. + Value *StartIdx = ConstantInt::get(IdxTy, 0); + auto *StartV = Plan.getOrAddVPValue(StartIdx); auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); - VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); - VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); Header->insert(CanonicalIVPHI, Header->begin()); // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar - // IV by VF * UF. + // IV either by VF * UF or by the EVL values. bool HasNUW = Style == TailFoldingStyle::None; + SmallVector IVOps = {CanonicalIVPHI}; + if (EVLRecipe) + IVOps.push_back(EVLRecipe); auto *CanonicalIVIncrement = new VPInstruction(HasNUW ? 
VPInstruction::CanonicalIVIncrementNUW : VPInstruction::CanonicalIVIncrement, - {CanonicalIVPHI}, DL, "index.next"); + IVOps, DL, "index.next"); CanonicalIVPHI->addOperand(CanonicalIVIncrement); VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); EB->appendRecipe(CanonicalIVIncrement); + // If we are working with vector predication instrinsics, add a NextEVL + // VPInstruction to calculate the remaining elements number. + if (EVLRecipe) { + auto *NextEVL = + new VPInstruction(VPInstruction::NextEVL, + {EVLRecipe, CanonicalIVIncrement}, DL, "evl.next"); + EVLRecipe->addOperand(NextEVL); + EB->appendRecipe(NextEVL); + } + if (Style == TailFoldingStyle::DataAndControlFlow) { // Create the active lane mask instruction in the vplan preheader. VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock(); @@ -8866,7 +8901,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DLInst ? DLInst->getDebugLoc() : DebugLoc(), - CM.getTailFoldingStyle()); + CM.getTailFoldingStyle(), TTI); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. @@ -9072,7 +9107,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { Term->eraseFromParent(); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), - CM.getTailFoldingStyle()); + CM.getTailFoldingStyle(), TTI); return Plan; } @@ -9272,24 +9307,27 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { MulOp = Instruction::FMul; } - // Multiply the vectorization factor by the step using integer or - // floating-point arithmetic as appropriate. - Type *StepType = Step->getType(); - Value *RuntimeVF; - if (Step->getType()->isFloatingPointTy()) - RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); - else - RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); - Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); + Value *SplatVF = nullptr; + if (!getEVL()) { + // Multiply the vectorization factor by the step using integer or + // floating-point arithmetic as appropriate. + Type *StepType = Step->getType(); + Value *RuntimeVF; + if (Step->getType()->isFloatingPointTy()) + RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF); + else + RuntimeVF = getRuntimeVF(Builder, StepType, State.VF); + Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF); - // Create a vector splat to use in the induction update. - // - // FIXME: If the step is non-constant, we create the vector splat with - // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't - // handle a constant vector splat. - Value *SplatVF = isa(Mul) - ? ConstantVector::getSplat(State.VF, cast(Mul)) - : Builder.CreateVectorSplat(State.VF, Mul); + // Create a vector splat to use in the induction update. + // + // FIXME: If the step is non-constant, we create the vector splat with + // IRBuilder. IRBuilder can constant-fold the multiply, but it + // doesn't handle a constant vector splat. + SplatVF = isa(Mul) + ? 
ConstantVector::getSplat(State.VF, cast(Mul)) + : Builder.CreateVectorSplat(State.VF, Mul); + } Builder.restoreIP(CurrIP); // We may need to add the step a number of times, depending on the unroll @@ -9304,8 +9342,26 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { if (isa(EntryVal)) State.addMetadata(LastInduction, EntryVal); - LastInduction = cast( - Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); + if (auto *EVLRecipe = getEVL()) { + // Ensure the types match. + Type *DestTy = LastInduction->getType()->getScalarType(); + Value *EVL = State.get(EVLRecipe, Part); + if (DestTy->isIntegerTy()) { + EVL = Builder.CreateZExtOrTrunc(EVL, DestTy); + } else { + assert(DestTy->isFloatingPointTy()); + EVL = Builder.CreateUIToFP(EVL, DestTy); + } + // Multiply the EVL by the step using integer or floating-point + // arithmetic as appropriate. + Value *Mul = Builder.CreateBinOp(MulOp, Step, EVL); + Value *SplatEVL = Builder.CreateVectorSplat(State.VF, Mul); + LastInduction = cast( + Builder.CreateBinOp(AddOp, LastInduction, SplatEVL, "step.add.vl")); + } else { + LastInduction = cast( + Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")); + } LastInduction->setDebugLoc(EntryVal->getDebugLoc()); } @@ -9593,9 +9649,15 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { auto &Builder = State.Builder; InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); bool isMaskRequired = getMask(); - if (isMaskRequired) + VPValue *VPEVL = getEVL(); + if (isMaskRequired) { for (unsigned Part = 0; Part < State.UF; ++Part) BlockInMaskParts[Part] = State.get(getMask(), Part); + } else if (VPEVL) { + auto *MaskTy = VectorType::get(Builder.getInt1Ty(), State.VF); + for (unsigned Part = 0; Part < State.UF; ++Part) + BlockInMaskParts[Part] = ConstantInt::getTrue(MaskTy); + } const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * { // Calculate the pointer for the specific unroll-part. @@ -9633,7 +9695,14 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { BlockInMaskParts[Part] = Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse"); } else { - Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part); + Value *Increment = nullptr; + if (VPEVL) { + Increment = Builder.getInt32(0); // EVL is always an i32. + for (unsigned int P = 0; P < Part; P++) + Increment = Builder.CreateAdd(Increment, State.get(VPEVL, P)); + } else { + Increment = createStepForVF(Builder, IndexTy, State.VF, Part); + } PartPtr = cast( Builder.CreateGEP(ScalarDataTy, Ptr, Increment)); PartPtr->setIsInBounds(InBounds); @@ -9651,10 +9720,19 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { Instruction *NewSI = nullptr; Value *StoredVal = State.get(StoredValue, Part); if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; + Value *MaskPart = + (isMaskRequired || VPEVL) ? 
BlockInMaskParts[Part] : nullptr; Value *VectorGep = State.get(getAddr(), Part); - NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, - MaskPart); + if (VPEVL) { + auto *PtrsTy = cast(VectorGep->getType()); + Value *Operands[] = {StoredVal, VectorGep, MaskPart, + State.get(VPEVL, Part)}; + NewSI = Builder.CreateIntrinsic(Intrinsic::vp_scatter, + {DataTy, PtrsTy}, Operands); + } else { + NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, + MaskPart); + } } else { if (Reverse) { // If we store to reverse consecutive memory locations, then we need @@ -9665,11 +9743,17 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { } auto *VecPtr = CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); - if (isMaskRequired) + if (VPEVL) { + Value *Operands[] = {StoredVal, VecPtr, BlockInMaskParts[Part], + State.get(VPEVL, Part)}; + NewSI = Builder.CreateIntrinsic( + Intrinsic::vp_store, {DataTy, VecPtr->getType()}, Operands); + } else if (isMaskRequired) { NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, BlockInMaskParts[Part]); - else + } else { NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); + } } State.addMetadata(NewSI, SI); } @@ -9682,21 +9766,37 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { for (unsigned Part = 0; Part < State.UF; ++Part) { Value *NewLI; if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; + Value *MaskPart = + (isMaskRequired || VPEVL) ? BlockInMaskParts[Part] : nullptr; Value *VectorGep = State.get(getAddr(), Part); - NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, - nullptr, "wide.masked.gather"); + if (VPEVL) { + auto *PtrsTy = cast(VectorGep->getType()); + Value *Operands[] = {VectorGep, MaskPart, State.get(VPEVL, Part)}; + NewLI = Builder.CreateIntrinsic(Intrinsic::vp_gather, {DataTy, PtrsTy}, + Operands, nullptr, "vp.gather"); + } else { + NewLI = + Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, + nullptr, "wide.masked.gather"); + } State.addMetadata(NewLI, LI); } else { auto *VecPtr = CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0))); - if (isMaskRequired) + if (VPEVL) { + Value *Operands[] = {VecPtr, BlockInMaskParts[Part], + State.get(VPEVL, Part)}; + NewLI = Builder.CreateIntrinsic(Intrinsic::vp_load, + {DataTy, VecPtr->getType()}, Operands, + nullptr, "vp.load"); + } else if (isMaskRequired) { NewLI = Builder.CreateMaskedLoad( DataTy, VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), "wide.masked.load"); - else + } else { NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); + } // Add metadata to the load, but setVectorValue to the reverse shuffle. State.addMetadata(NewLI, LI); @@ -10530,6 +10630,11 @@ LoopVectorizeResult LoopVectorizePass::runImpl( PreservedAnalyses LoopVectorizePass::run(Function &F, FunctionAnalysisManager &AM) { + assert((!UseVectorPredicationIntrinsics || + PreferPredicateOverEpilogue == + PreferPredicateTy::PredicateOrDontVectorize) && + "Tail folding required when using VP intrinsics."); + auto &LI = AM.getResult(F); // There are no loops in the function. Return before computing other expensive // analyses. 
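The recipe changes above all reduce to one pattern: ask TTI how many elements the current iteration may process and hand that value, together with an all-true mask, to the VP memory intrinsics. The sketch below shows that pattern in isolation, assuming the new TTI hook added by this patch; the helper name emitPredicatedLoad is illustrative only and not part of the change.

// Minimal sketch (not part of the patch): emit a vp.load whose active lanes
// are limited by the element count returned by TTI::computeVectorLength().
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"

using namespace llvm;

static Value *emitPredicatedLoad(IRBuilderBase &Builder,
                                 const TargetTransformInfo &TTI, Type *DataTy,
                                 Value *Ptr, Value *RemainingElems,
                                 ElementCount VF) {
  // Clamp the remaining trip count to what one vector iteration can handle:
  // the fixed VF, or umin(vscale * VFmin, remaining) / vsetvli on RISC-V.
  Value *EVL = TTI.computeVectorLength(Builder, RemainingElems, VF);

  // VP intrinsics still take a mask operand; since the tail is handled purely
  // through the EVL, an all-true mask is sufficient (as in the recipes above).
  auto *MaskTy = VectorType::get(Builder.getInt1Ty(), VF);
  Value *Mask = ConstantInt::getTrue(MaskTy);

  // Overload types and operand order match VPWidenMemoryInstructionRecipe:
  // vp.load(ptr, mask, evl) with {DataTy, PtrTy} as the overloaded types.
  return Builder.CreateIntrinsic(Intrinsic::vp_load, {DataTy, Ptr->getType()},
                                 {Ptr, Mask, EVL}, nullptr, "vp.load");
}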
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index d554f438c804..81e8b52ebb1f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -596,6 +596,16 @@ VPlan::~VPlan() { delete P.second; } +VPEVLPHIRecipe *VPlan::getEVLPhi() { + VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock(); + for (VPRecipeBase &R : Header->phis()) { + if (isa(&R)) + return cast(&R); + } + + return nullptr; +} + VPActiveLaneMaskPHIRecipe *VPlan::getActiveLaneMaskPhi() { VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock(); for (VPRecipeBase &R : Header->phis()) { @@ -711,6 +721,13 @@ void VPlan::execute(VPTransformState *State) { } auto *PhiR = cast(&R); + if (auto *EVLPhi = dyn_cast(PhiR)) { + PHINode *Phi = EVLPhi->getPhi(); + Phi->addIncoming(State->get(EVLPhi->getBackedgeValue(), State->UF - 1), + VectorLatchBB); + continue; + } + // For canonical IV, first-order recurrences and in-order reduction phis, // only a single part is generated, which provides the last part from the // previous iteration. For non-ordered reductions all UF parts are diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 986faaf99664..99091246dcda 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -719,10 +719,10 @@ public: /// Returns the underlying instruction, if the recipe is a VPValue or nullptr /// otherwise. Instruction *getUnderlyingInstr() { - return cast(getVPSingleValue()->getUnderlyingValue()); + return cast_or_null(getVPSingleValue()->getUnderlyingValue()); } const Instruction *getUnderlyingInstr() const { - return cast(getVPSingleValue()->getUnderlyingValue()); + return cast_or_null(getVPSingleValue()->getUnderlyingValue()); } /// Method to support type inquiry through isa, cast, and dyn_cast. @@ -797,7 +797,8 @@ public: CanonicalIVIncrementForPart, CanonicalIVIncrementForPartNUW, BranchOnCount, - BranchOnCond + BranchOnCond, + NextEVL }; private: @@ -1022,20 +1023,30 @@ class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPValue { const InductionDescriptor &IndDesc; bool NeedsVectorIV; + void addEVL(VPValue *EVLRecipe) { + if (EVLRecipe) + addOperand(EVLRecipe); + } + public: VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step, const InductionDescriptor &IndDesc, - bool NeedsVectorIV) + bool NeedsVectorIV, VPValue *EVLRecipe) : VPRecipeBase(VPDef::VPWidenIntOrFpInductionSC, {Start, Step}), VPValue(this, IV), IV(IV), IndDesc(IndDesc), - NeedsVectorIV(NeedsVectorIV) {} + NeedsVectorIV(NeedsVectorIV) { + addEVL(EVLRecipe); + } VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step, const InductionDescriptor &IndDesc, - TruncInst *Trunc, bool NeedsVectorIV) + TruncInst *Trunc, bool NeedsVectorIV, + VPValue *EVLRecipe) : VPRecipeBase(VPDef::VPWidenIntOrFpInductionSC, {Start, Step}), VPValue(this, Trunc), IV(IV), IndDesc(IndDesc), - NeedsVectorIV(NeedsVectorIV) {} + NeedsVectorIV(NeedsVectorIV) { + addEVL(EVLRecipe); + } ~VPWidenIntOrFpInductionRecipe() override = default; @@ -1059,6 +1070,12 @@ public: VPValue *getStepValue() { return getOperand(1); } const VPValue *getStepValue() const { return getOperand(1); } + /// Return the EVL value of the current loop iteration. + VPValue *getEVL() { return getNumOperands() == 3 ? getOperand(2) : nullptr; } + const VPValue *getEVL() const { + return getNumOperands() == 3 ? 
getOperand(2) : nullptr; + } + /// Returns the first defined value as TruncInst, if it is one or nullptr /// otherwise. TruncInst *getTruncInst() { @@ -1629,8 +1646,8 @@ public: /// A Recipe for widening load/store operations. /// The recipe uses the following VPValues: -/// - For load: Address, optional mask -/// - For store: Address, stored value, optional mask +/// - For load: Address, optional mask, optional evl +/// - For store: Address, stored value, optional mask, optional evl /// TODO: We currently execute only per-part unless a specific instance is /// provided. class VPWidenMemoryInstructionRecipe : public VPRecipeBase { @@ -1642,33 +1659,41 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase { // Whether the consecutive loaded/stored addresses are in reverse order. bool Reverse; - void setMask(VPValue *Mask) { - if (!Mask) - return; - addOperand(Mask); - } + // Whether the instruction has a not all-ones mask. + bool Masked = false; + + // Whether a vector length is available to the instruction. + bool HasVL = false; + + void setMaskAndEVL(VPValue *Mask, VPValue *VPEVL) { + if (Mask) { + this->Masked = true; + addOperand(Mask); + } - bool isMasked() const { - return isStore() ? getNumOperands() == 3 : getNumOperands() == 2; + if (VPEVL) { + this->HasVL = true; + addOperand(VPEVL); + } } public: VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask, - bool Consecutive, bool Reverse) + VPValue *EVL, bool Consecutive, bool Reverse) : VPRecipeBase(VPDef::VPWidenMemoryInstructionSC, {Addr}), Ingredient(Load), Consecutive(Consecutive), Reverse(Reverse) { assert((Consecutive || !Reverse) && "Reverse implies consecutive"); new VPValue(this, &Load); - setMask(Mask); + setMaskAndEVL(Mask, EVL); } VPWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredValue, VPValue *Mask, - bool Consecutive, bool Reverse) + VPValue *EVL, bool Consecutive, bool Reverse) : VPRecipeBase(VPDef::VPWidenMemoryInstructionSC, {Addr, StoredValue}), Ingredient(Store), Consecutive(Consecutive), Reverse(Reverse) { assert((Consecutive || !Reverse) && "Reverse implies consecutive"); - setMask(Mask); + setMaskAndEVL(Mask, EVL); } VP_CLASSOF_IMPL(VPDef::VPWidenMemoryInstructionSC) @@ -1681,8 +1706,15 @@ public: /// Return the mask used by this recipe. Note that a full mask is represented /// by a nullptr. VPValue *getMask() const { - // Mask is optional and therefore the last operand. - return isMasked() ? getOperand(getNumOperands() - 1) : nullptr; + return Masked ? (HasVL ? getOperand(getNumOperands() - 2) + : getOperand(getNumOperands() - 1)) + : nullptr; + } + + /// Return the evl used by this recipe. If we are working with full-length + /// vectors, return nullptr. + VPValue *getEVL() const { + return HasVL ? getOperand(getNumOperands() - 1) : nullptr; } /// Returns true if this recipe is a store. @@ -1826,6 +1858,33 @@ public: #endif }; +class VPEVLPHIRecipe : public VPHeaderPHIRecipe { + const TargetTransformInfo *TTI; + PHINode *Phi = nullptr; + +public: + VPEVLPHIRecipe(VPValue *StartEVL, const TargetTransformInfo *TTI) + : VPHeaderPHIRecipe(VPDef::VPWidenEVLSC, nullptr, StartEVL), TTI(TTI) {} + + ~VPEVLPHIRecipe() override = default; + + VP_CLASSOF_IMPL(VPDef::VPWidenEVLSC) + + PHINode *getPhi() const { return Phi; } + + static inline bool classof(const VPHeaderPHIRecipe *D) { + return D->getVPDefID() == VPDef::VPWidenEVLSC; + } + + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. 
+ void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + /// A Recipe for widening the canonical induction variable of the vector loop. class VPWidenCanonicalIVRecipe : public VPRecipeBase, public VPValue { public: @@ -2367,6 +2426,10 @@ public: return cast(&*EntryVPBB->begin()); } + /// Find and return the VPEVLPHIRecipe from the header - there should be only + /// one at most. If there isn't one, then return nullptr. + VPEVLPHIRecipe *getEVLPhi(); + /// Find and return the VPActiveLaneMaskPHIRecipe from the header - there /// be only one at most. If there isn't one, then return nullptr. VPActiveLaneMaskPHIRecipe *getActiveLaneMaskPhi(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index ff0b1df57ce4..1b69ac5d3d71 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -35,6 +35,7 @@ using namespace llvm; using VectorParts = SmallVector; extern cl::opt EnableVPlanNativePath; +extern cl::opt UseVectorPredicationIntrinsics; #define LV_NAME "loop-vectorize" #define DEBUG_TYPE LV_NAME @@ -235,6 +236,15 @@ void VPInstruction::generateInstruction(VPTransformState &State, break; } case VPInstruction::ActiveLaneMask: { + if (UseVectorPredicationIntrinsics) { + State.set(this, + ConstantInt::getTrue( + VectorType::get(State.Builder.getInt1Ty(), State.VF)), + Part); + + break; + } + // Get first lane of vector induction variable. Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0)); // Get the original loop tripcount. @@ -279,10 +289,21 @@ void VPInstruction::generateInstruction(VPTransformState &State, if (Part == 0) { bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW; auto *Phi = State.get(getOperand(0), 0); - // The loop step is equal to the vectorization factor (num of SIMD - // elements) times the unroll factor (num of SIMD instructions). - Value *Step = - createStepForVF(Builder, Phi->getType(), State.VF, State.UF); + Value *Step = nullptr; + if (getNumOperands() == 2) { + // We have the EVL value available to use. + VPValue *VPEVL = getOperand(1); + Step = State.get(VPEVL, 0); + for (unsigned P = 1; P < State.UF; P++) + Step = Builder.CreateAdd(Step, State.get(VPEVL, P)); + + Step = Builder.CreateZExtOrTrunc(Step, Phi->getType()); + } else { + // The loop step is equal to the vectorization factor (num of SIMD + // elements) times the unroll factor (num of SIMD instructions). 
+ Step = createStepForVF(Builder, Phi->getType(), State.VF, State.UF); + } + Next = Builder.CreateAdd(Phi, Step, Name, IsNUW, false); } else { Next = State.get(this, 0); @@ -353,6 +374,21 @@ void VPInstruction::generateInstruction(VPTransformState &State, Builder.GetInsertBlock()->getTerminator()->eraseFromParent(); break; } + case VPInstruction::NextEVL: { + Value *Next = nullptr; + if (Part == 0) { + auto *EVLRecipe = cast(getOperand(0)); + Value *StartEVL = State.get(EVLRecipe->getOperand(0), 0); + Value *IVIncrement = State.get(getOperand(1), 0); + + Next = Builder.CreateSub(StartEVL, IVIncrement, "evl.next"); + } else { + Next = State.get(this, 0); + } + + State.set(this, Next, Part); + break; + } default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -719,6 +755,9 @@ void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, #endif bool VPWidenIntOrFpInductionRecipe::isCanonical() const { + if (getEVL()) + return false; + auto *StartC = dyn_cast(getStartValue()->getLiveInIRValue()); auto *StepC = dyn_cast(getInductionDescriptor().getStep()); return StartC && StartC->isZero() && StepC && StepC->isOne(); @@ -1329,3 +1368,30 @@ void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent, printOperands(O, SlotTracker); } #endif + +void VPEVLPHIRecipe::execute(VPTransformState &State) { + Value *StartEVL = State.get(getOperand(0), 0); + BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this); + this->Phi = State.Builder.CreatePHI(StartEVL->getType(), 2, "evl.phi"); + this->Phi->addIncoming(StartEVL, VectorPH); + + Value *PrevEVL = State.Builder.CreateZExtOrTrunc( + cast(this->Phi), State.Builder.getInt32Ty(), "evl.phi.cast"); + Value *EVL = nullptr; + for (unsigned Part = 0; Part < State.UF; Part++) { + if (EVL) + PrevEVL = State.Builder.CreateSub(PrevEVL, EVL); + EVL = TTI->computeVectorLength(State.Builder, PrevEVL, State.VF); + State.set(this, EVL, Part); + } +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPEVLPHIRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "EVL-PHI "; + printAsOperand(O, SlotTracker); + O << " = phi "; + printOperands(O, SlotTracker); +} +#endif diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 1cfba64f1fbe..5070aa9a8dff 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -55,8 +55,8 @@ void VPlanTransforms::VPInstructionsToVPRecipes( VPValue *Start = Plan->getOrAddVPValue(II->getStartValue()); VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(*Plan, II->getStep(), SE); - NewRecipe = - new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, *II, true); + NewRecipe = new VPWidenIntOrFpInductionRecipe( + Phi, Start, Step, *II, true, Plan->getEVLPhi()); } else { Plan->addVPValue(Phi, VPPhi); continue; @@ -69,12 +69,13 @@ void VPlanTransforms::VPInstructionsToVPRecipes( if (LoadInst *Load = dyn_cast(Inst)) { NewRecipe = new VPWidenMemoryInstructionRecipe( *Load, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)), - nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/); + nullptr /*Mask*/, nullptr /*EVL*/, false /*Consecutive*/, + false /*Reverse*/); } else if (StoreInst *Store = dyn_cast(Inst)) { NewRecipe = new VPWidenMemoryInstructionRecipe( *Store, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)), Plan->getOrAddVPValue(Store->getValueOperand()), nullptr /*Mask*/, - false /*Consecutive*/, 
false /*Reverse*/); + nullptr /*EVL*/, false /*Consecutive*/, false /*Reverse*/); } else if (GetElementPtrInst *GEP = dyn_cast(Inst)) { NewRecipe = new VPWidenGEPRecipe( GEP, Plan->mapToVPValues(GEP->operands()), OrigLoop); diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 62ec65cbfe5d..994a677a5dba 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -354,6 +354,7 @@ public: VPCanonicalIVPHISC, VPActiveLaneMaskPHISC, VPFirstOrderRecurrencePHISC, + VPWidenEVLSC, VPWidenPHISC, VPWidenIntOrFpInductionSC, VPWidenPointerInductionSC, diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vp_intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vp_intrinsics.ll new file mode 100644 index 000000000000..ae636428f935 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vp_intrinsics.ll @@ -0,0 +1,134 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S --passes=loop-vectorize -use-vp-intrinsics -prefer-predicate-over-epilogue=predicate-dont-vectorize -o - < %s | FileCheck %s + +; ModuleID = 'custom/simple.c' +source_filename = "custom/simple.c" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-unknown" + +; Input C code: +; void addVec(long N, double *C, double *A, double *B) { +; long I; +; for (I = 0; I < N; I++) +; C[I] = A[I] + B[I]; +; } + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) +define dso_local void @addVec(i64 noundef %N, ptr nocapture noundef writeonly %C, ptr nocapture noundef readonly %A, ptr nocapture noundef readonly %B) local_unnamed_addr #0 { +; CHECK-LABEL: @addVec( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[B3:%.*]] = ptrtoint ptr [[B:%.*]] to i64 +; CHECK-NEXT: [[A2:%.*]] = ptrtoint ptr [[A:%.*]] to i64 +; CHECK-NEXT: [[C1:%.*]] = ptrtoint ptr [[C:%.*]] to i64 +; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 10, i64 [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] +; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[C1]], [[A2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP4]], 8 +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[C1]], [[B3]] +; CHECK-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL_PHI:%.*]] = phi i64 [ [[N]], [[VECTOR_PH]] ], [ [[EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL_PHI_CAST:%.*]] = trunc i64 [[EVL_PHI]] to i32 +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[EVL_PHI_CAST]] to i64 +; CHECK-NEXT: [[VL:%.*]] = call i64 @llvm.riscv.vsetvli.i64(i64 [[TMP9]], 
i64 3, i64 0) +; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[VL]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = call @llvm.experimental.stepvector.nxv1i64() +; CHECK-NEXT: [[TMP13:%.*]] = add zeroinitializer, [[TMP12]] +; CHECK-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT]], [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, ptr [[TMP14]], i32 0 +; CHECK-NEXT: [[VP_LOAD:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP15]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]), !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, ptr [[TMP16]], i32 0 +; CHECK-NEXT: [[VP_LOAD5:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]), !tbaa [[TBAA4]] +; CHECK-NEXT: [[TMP18:%.*]] = fadd [[VP_LOAD]], [[VP_LOAD5]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, ptr [[TMP19]], i32 0 +; CHECK-NEXT: call void @llvm.vp.store.nxv1f64.p0( [[TMP18]], ptr [[TMP20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]), !tbaa [[TBAA4]] +; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP10]] to i64 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP21]] +; CHECK-NEXT: [[EVL_NEXT]] = sub i64 [[N]], [[INDEX_NEXT]] +; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_08]] +; CHECK-NEXT: [[TMP23:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_08]] +; CHECK-NEXT: [[TMP24:%.*]] = load double, ptr [[ARRAYIDX1]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP23]], [[TMP24]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[I_08]] +; CHECK-NEXT: store double [[ADD]], ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %cmp7 = icmp sgt i64 %N, 0 + br i1 %cmp7, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + br label %for.body + 
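+; The CHECK lines above expect the vector body to recompute the EVL through
+; llvm.riscv.vsetvli on every iteration and to use it both in the vp.load /
+; vp.store calls and in the index.next increment of the canonical IV.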
+for.body: ; preds = %for.body.preheader, %for.body + %I.08 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds double, ptr %A, i64 %I.08 + %0 = load double, ptr %arrayidx, align 8, !tbaa !4 + %arrayidx1 = getelementptr inbounds double, ptr %B, i64 %I.08 + %1 = load double, ptr %arrayidx1, align 8, !tbaa !4 + %add = fadd double %0, %1 + %arrayidx2 = getelementptr inbounds double, ptr %C, i64 %I.08 + store double %add, ptr %arrayidx2, align 8, !tbaa !4 + %inc = add nuw nsw i64 %I.08, 1 + %exitcond.not = icmp eq i64 %inc, %N + br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !8 + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end: ; preds = %for.end.loopexit, %entry + ret void +} + +attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-zawrs,-experimental-zca,-experimental-zcb,-experimental-zcd,-experimental-zcf,-experimental-zihintntl,-experimental-ztso,-experimental-zvfh,-h,-save-restore,-svinval,-svnapot,-svpbmt,-xtheadba,-xtheadvdot,-xventanacondops,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zdinx,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zicbom,-zicbop,-zicboz,-zihintpause,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zmmul,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"target-abi", !"lp64d"} +!2 = !{i32 8, !"SmallDataLimit", i32 8} +!3 = !{!"clang version 17.0.0"} +!4 = !{!5, !5, i64 0} +!5 = !{!"double", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C/C++ TBAA"} +!8 = distinct !{!8, !9} +!9 = !{!"llvm.loop.mustprogress"} diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index ff7ee53bfbcf..14462f0ef6b2 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -1003,7 +1003,8 @@ TEST(VPRecipeTest, CastVPWidenMemoryInstructionRecipeToVPUserAndVPDef) { new LoadInst(Int32, UndefValue::get(Int32Ptr), "", false, Align(1)); VPValue Addr; VPValue Mask; - VPWidenMemoryInstructionRecipe Recipe(*Load, &Addr, &Mask, true, false); + VPWidenMemoryInstructionRecipe Recipe(*Load, &Addr, &Mask, nullptr, true, + false); EXPECT_TRUE(isa(&Recipe)); VPRecipeBase *BaseR = &Recipe; EXPECT_TRUE(isa(BaseR)); @@ -1099,7 +1100,8 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { new LoadInst(Int32, UndefValue::get(Int32Ptr), "", false, Align(1)); VPValue Addr; VPValue Mask; - VPWidenMemoryInstructionRecipe Recipe(*Load, &Addr, &Mask, true, false); + VPWidenMemoryInstructionRecipe Recipe(*Load, &Addr, &Mask, nullptr, true, + false); EXPECT_FALSE(Recipe.mayHaveSideEffects()); EXPECT_TRUE(Recipe.mayReadFromMemory()); EXPECT_FALSE(Recipe.mayWriteToMemory()); @@ -1113,8 +1115,8 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { VPValue Addr; VPValue Mask; VPValue StoredV; - VPWidenMemoryInstructionRecipe Recipe(*Store, &Addr, &StoredV, &Mask, false, - false); + VPWidenMemoryInstructionRecipe Recipe(*Store, &Addr, &StoredV, &Mask, + nullptr, false, false); EXPECT_TRUE(Recipe.mayHaveSideEffects()); 
EXPECT_FALSE(Recipe.mayReadFromMemory()); EXPECT_TRUE(Recipe.mayWriteToMemory()); -- GitLab From feb66a09e6362c73678a2c5beb558ce641304285 Mon Sep 17 00:00:00 2001 From: Lorenzo Albano Date: Thu, 23 Feb 2023 14:29:51 +0000 Subject: [PATCH 2/2] Add VectorPredication pass. This pass transforms full-length vector instructions to VP ones by recovering the (mask,evl) information from one of the memory writing VP operations and backpropagating it. --- .../Transforms/Vectorize/VectorPredication.h | 55 ++++ llvm/lib/Passes/PassBuilder.cpp | 1 + llvm/lib/Passes/PassBuilderPipelines.cpp | 10 + llvm/lib/Passes/PassRegistry.def | 1 + llvm/lib/Transforms/Vectorize/CMakeLists.txt | 1 + .../Vectorize/VectorPredication.cpp | 277 ++++++++++++++++++ .../VectorPredication/if-elif-else.ll | 270 +++++++++++++++++ .../VectorPredication/if-else_scalar-cond.ll | 209 +++++++++++++ .../VectorPredication/if-else_vec-cond.ll | 219 ++++++++++++++ .../VectorPredication/simple_vector_sum.ll | 193 ++++++++++++ 10 files changed, 1236 insertions(+) create mode 100644 llvm/include/llvm/Transforms/Vectorize/VectorPredication.h create mode 100644 llvm/lib/Transforms/Vectorize/VectorPredication.cpp create mode 100644 llvm/test/Transforms/VectorPredication/if-elif-else.ll create mode 100644 llvm/test/Transforms/VectorPredication/if-else_scalar-cond.ll create mode 100644 llvm/test/Transforms/VectorPredication/if-else_vec-cond.ll create mode 100644 llvm/test/Transforms/VectorPredication/simple_vector_sum.ll diff --git a/llvm/include/llvm/Transforms/Vectorize/VectorPredication.h b/llvm/include/llvm/Transforms/Vectorize/VectorPredication.h new file mode 100644 index 000000000000..ce59854dbb95 --- /dev/null +++ b/llvm/include/llvm/Transforms/Vectorize/VectorPredication.h @@ -0,0 +1,55 @@ +#ifndef LLVM_TRANSFORMS_VECTORPREDICATION_H +#define LLVM_TRANSFORMS_VECTORPREDICATION_H + +#include "llvm/ADT/MapVector.h" +#include "llvm/IR/PassManager.h" + +namespace llvm { + +using InstToMaskEVLMap = DenseMap>; + +struct BlockData { + // Vector that stores all vector predicated memory writing operations found in + // the basic block. If after phase 1 is empty, then the basic block can be + // skipped by following phases. + SmallVector MemoryWritingVPInstructions; + + // Store all instructions of the basic block (in the same order as they are + // found), assigning to each the list of users. Skip PHIs and terminators. + MapVector> TopologicalGraph; + + // Map each full-length vector operation eligible to be transformed to a + // vector predication one with the (mask,evl) pair of its first vector + // predicated memory writing operation user. + InstToMaskEVLMap VecOpsToTransform; + + // Ordered list representing the reverse order of how the basic block has to + // be transformed due to the new vector predicated instructions. + SmallVector NewBBReverseOrder; + + BlockData() = default; +}; + +class VectorPredicationPass : public PassInfoMixin { +private: + // List of instructions to be replaced by the new VP operations and that later + // should be removed, if possible. 
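+  // Each entry maps an original full-length instruction to the VP operation
+  // that replaces it; uses are only rewritten (and dead instructions erased)
+  // once every basic block of the function has been processed.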
+ DenseMap OldInstructionsToRemove; + + void analyseBasicBlock(BasicBlock &BB, BlockData &BBInfo); + void findCandidateVectorOperations(BasicBlock &BB, BlockData &BBInfo); + void addNewUsersToMasksAndEVLs(BasicBlock &BB, BlockData &BBInfo); + void buildNewBasicBlockSchedule(BasicBlock &BB, BlockData &BBInfo); + void emitNewBasicBlockSchedule(BasicBlock &BB, BlockData &BBInfo); + void transformCandidateVectorOperations(BasicBlock &BB, BlockData &BBInfo); + + void removeOldInstructions(); + +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + static StringRef name() { return "VectorPredicationPass"; } +}; + +} // namespace llvm + +#endif // LLVM_TRANSFORMS_VECTORPREDICATION_H diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 4b8754df7fb6..5b9f2da07873 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -256,6 +256,7 @@ #include "llvm/Transforms/Vectorize/LoopVectorize.h" #include "llvm/Transforms/Vectorize/SLPVectorizer.h" #include "llvm/Transforms/Vectorize/VectorCombine.h" +#include "llvm/Transforms/Vectorize/VectorPredication.h" #include using namespace llvm; diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 6cc66a0cb132..4423c2c87072 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -129,6 +129,7 @@ #include "llvm/Transforms/Vectorize/LoopVectorize.h" #include "llvm/Transforms/Vectorize/SLPVectorizer.h" #include "llvm/Transforms/Vectorize/VectorCombine.h" +#include "llvm/Transforms/Vectorize/VectorPredication.h" using namespace llvm; @@ -286,6 +287,11 @@ static cl::opt AttributorRun( clEnumValN(AttributorRunOption::NONE, "none", "disable attributor runs"))); +static cl::opt + EnableVectorPredication("enable-vector-predication", cl::init(false), + cl::Hidden, + cl::desc("Enable VectorPredicationPass.")); + PipelineTuningOptions::PipelineTuningOptions() { LoopInterleaving = true; LoopVectorization = true; @@ -1230,6 +1236,10 @@ void PassBuilder::addVectorPasses(OptimizationLevel Level, /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); } + // Try to vector predicate vectorized functions. + if (EnableVectorPredication) + FPM.addPass(VectorPredicationPass()); + // Now that we've vectorized and unrolled loops, we may have more refined // alignment information, try to re-derive it here. 
FPM.addPass(AlignmentFromAssumptionsPass()); diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 73ab87dd8823..f40aedacaa89 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -391,6 +391,7 @@ FUNCTION_PASS("tailcallelim", TailCallElimPass()) FUNCTION_PASS("typepromotion", TypePromotionPass(TM)) FUNCTION_PASS("unify-loop-exits", UnifyLoopExitsPass()) FUNCTION_PASS("vector-combine", VectorCombinePass()) +FUNCTION_PASS("vector-predication", VectorPredicationPass()) FUNCTION_PASS("verify", VerifierPass()) FUNCTION_PASS("verify", DominatorTreeVerifierPass()) FUNCTION_PASS("verify", LoopVerifierPass()) diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt index 998dfd956575..bc9e4d281638 100644 --- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt +++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt @@ -5,6 +5,7 @@ add_llvm_component_library(LLVMVectorize SLPVectorizer.cpp Vectorize.cpp VectorCombine.cpp + VectorPredication.cpp VPlan.cpp VPlanHCFGBuilder.cpp VPlanRecipes.cpp diff --git a/llvm/lib/Transforms/Vectorize/VectorPredication.cpp b/llvm/lib/Transforms/Vectorize/VectorPredication.cpp new file mode 100644 index 000000000000..cc6137a134d5 --- /dev/null +++ b/llvm/lib/Transforms/Vectorize/VectorPredication.cpp @@ -0,0 +1,277 @@ +#include "llvm/Transforms/Vectorize/VectorPredication.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/VectorBuilder.h" +#include "llvm/Transforms/Utils/Local.h" + +#define DEBUG_TYPE "vector-predication" +STATISTIC(Transforms, "Number of full-length -> evl vector transformation."); + +using namespace llvm; + +// Map each instruction to its uses and save all memory writing vector +// predicated instructions found in the basic block. +void VectorPredicationPass::analyseBasicBlock(BasicBlock &BB, + BlockData &BBInfo) { + for (Instruction &I : BB) { + if (isa(I) || I.isTerminator()) + continue; + + SmallPtrSet IUsers; + for (User *IU : I.users()) { + assert(isa(IU) && "Unexpected behaviour."); + auto *IUInst = cast(IU); + if (IUInst->getParent() != I.getParent()) + continue; + if (isa(IUInst) || IUInst->isTerminator()) + continue; + + IUsers.insert(IUInst); + } + BBInfo.TopologicalGraph.insert({&I, IUsers}); + + if (auto *CI = dyn_cast(&I)) { + if (auto *CF = CI->getCalledFunction()) { + Intrinsic::ID ID = CF->getIntrinsicID(); + if (ID == Intrinsic::vp_store || ID == Intrinsic::vp_scatter) { + BBInfo.MemoryWritingVPInstructions.push_back(&I); + } + } + } + } +} + +static void findCandidateVectorOperation(BasicBlock &BB, Value *Op, Value *Mask, + Value *EVL, + InstToMaskEVLMap &VecOpsToTransform) { + auto *OpInst = dyn_cast(Op); + if (!OpInst) + return; + + if (OpInst->getParent() != &BB) + return; + + Intrinsic::ID VPID = VPIntrinsic::getForOpcode(OpInst->getOpcode()); + if (VPID == Intrinsic::not_intrinsic) + return; + + // If the instruction is already present in the map, it means it was already + // visited starting from a previous memory wrtiting vp operation. 
+ if (!VecOpsToTransform + .insert(std::make_pair(OpInst, std::make_pair(Mask, EVL))) + .second) { + // We need to check if new mask and evl values differ from the old ones: + // - if they are the same, then there is nothing to do; + // - if only the mask differ, we use an allones mask; + // - otherwise, we remove the instruction from the map (i.e., no + // transformation should happen) + auto It = VecOpsToTransform.find(OpInst); + assert(It != VecOpsToTransform.end()); + Value *OldMask, *OldEVL; + std::tie(OldMask, OldEVL) = It->second; + + if (Mask == OldMask && EVL == OldEVL) + return; + + VecOpsToTransform.erase(OpInst); + if (EVL == OldEVL) { + VecOpsToTransform.insert( + std::make_pair(OpInst, std::make_pair(nullptr, EVL))); + } + } + + // Recursively visit OpInst operands. + switch (VPID) { + default: + for (auto *OpVal : OpInst->operand_values()) + findCandidateVectorOperation(BB, OpVal, Mask, EVL, VecOpsToTransform); + break; + case Intrinsic::vp_select: { + Value *Cond = OpInst->getOperand(0); + if (Cond->getType()->isVectorTy()) + findCandidateVectorOperation(BB, Cond, nullptr, EVL, VecOpsToTransform); + + // TODO: if the condition argument is a vector, we could backpropagate it + // as mask for the true branch and its negation as mask for the false one. + // WARNING: when creating the negation of the condition, we must ensure it + // dominates all uses. + findCandidateVectorOperation(BB, OpInst->getOperand(1), nullptr, EVL, + VecOpsToTransform); + findCandidateVectorOperation(BB, OpInst->getOperand(2), nullptr, EVL, + VecOpsToTransform); + break; + } + } +} + +// For each vector predicated memory writing operation of the basic block, go +// back to the stored vector defining instruction and verify it is a vector +// operation. Add it to the list of instructions to be transformed into vector +// predicated ones, then recursively repeat the process for its vector +// arguments. +void VectorPredicationPass::findCandidateVectorOperations(BasicBlock &BB, + BlockData &BBInfo) { + if (BBInfo.MemoryWritingVPInstructions.empty()) + return; + + for (Instruction *I : BBInfo.MemoryWritingVPInstructions) { + assert(I->getParent() == &BB && "This is not the right basic block"); + auto *VPI = cast(I); + Value *StoredOperand = VPI->getMemoryDataParam(); + Value *MaskOperand = VPI->getMaskParam(); + Value *EVLOperand = VPI->getVectorLengthParam(); + // First, visit the mask operand (assigning an allones mask to this branch) + // and only then visit the stored operand. + findCandidateVectorOperation(BB, MaskOperand, nullptr, EVLOperand, + BBInfo.VecOpsToTransform); + findCandidateVectorOperation(BB, StoredOperand, MaskOperand, EVLOperand, + BBInfo.VecOpsToTransform); + } +} + +// Add the candidates as users of the mask and evl linked to each of them. +void VectorPredicationPass::addNewUsersToMasksAndEVLs(BasicBlock &BB, + BlockData &BBInfo) { + if (BBInfo.VecOpsToTransform.empty()) + return; + + for (auto [K, V] : BBInfo.VecOpsToTransform) { + if (auto *MaskInst = dyn_cast_if_present(V.first)) + BBInfo.TopologicalGraph[MaskInst].insert(K); + if (auto *EVLInst = dyn_cast(V.second)) + BBInfo.TopologicalGraph[EVLInst].insert(K); + } +} + +// Topologically sort, preserving as much as possible the original order. 
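+// The candidates were registered above as extra users of their mask and EVL
+// values, so the schedule produced here guarantees those definitions end up
+// before the instructions that will be rewritten to take them as operands.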
+void VectorPredicationPass::buildNewBasicBlockSchedule(BasicBlock &BB,
+                                                       BlockData &BBInfo) {
+  if (BBInfo.VecOpsToTransform.empty())
+    return;
+
+  while (!BBInfo.TopologicalGraph.empty()) {
+    Instruction *Inst = nullptr;
+    for (auto B = BBInfo.TopologicalGraph.rbegin(),
+              E = BBInfo.TopologicalGraph.rend();
+         B != E; B++) {
+      if (B->second.empty()) {
+        Inst = B->first;
+        break;
+      }
+    }
+    assert(Inst && "Failed to empty topological graph!");
+
+    BBInfo.NewBBReverseOrder.push_back(Inst);
+    BBInfo.TopologicalGraph.erase(Inst);
+
+    for (auto B = BBInfo.TopologicalGraph.begin(),
+              E = BBInfo.TopologicalGraph.end();
+         B != E; B++) {
+      B->second.erase(Inst);
+    }
+  }
+}
+
+// Modify the basic block based on the generated topological order.
+void VectorPredicationPass::emitNewBasicBlockSchedule(BasicBlock &BB,
+                                                      BlockData &BBInfo) {
+  if (BBInfo.VecOpsToTransform.empty())
+    return;
+
+  Instruction *InsertPoint = BB.getTerminator();
+  for (Instruction *I : BBInfo.NewBBReverseOrder) {
+    I->moveBefore(InsertPoint);
+    InsertPoint = I;
+  }
+}
+
+// Transform the candidates into vector predicated instructions: for example,
+// a plain fadd on a scalable vector becomes a call to llvm.vp.fadd that takes
+// the associated mask and EVL as extra operands.
+void VectorPredicationPass::transformCandidateVectorOperations(
+    BasicBlock &BB, BlockData &BBInfo) {
+  if (BBInfo.VecOpsToTransform.empty())
+    return;
+
+  for (auto [I, P] : BBInfo.VecOpsToTransform) {
+    Value *Mask, *EVL;
+    std::tie(Mask, EVL) = P;
+
+    IRBuilder<> Builder(I);
+    unsigned Opcode = I->getOpcode();
+    Type *RetTy = I->getType();
+    SmallVector<Value *> Operands(I->value_op_begin(), I->value_op_end());
+    switch (Opcode) {
+    case Instruction::FCmp:
+    case Instruction::ICmp: {
+      Operands.clear();
+      auto *CmpI = cast<CmpInst>(I);
+      Value *PredOp = MetadataAsValue::get(
+          Builder.getContext(),
+          MDString::get(Builder.getContext(),
+                        CmpInst::getPredicateName(CmpI->getPredicate())));
+      Operands = {CmpI->getOperand(0), CmpI->getOperand(1), PredOp};
+      break;
+    }
+    case Instruction::Select: {
+      if (!I->getOperand(0)->getType()->isVectorTy()) {
+        Operands.clear();
+        Value *Op1 = I->getOperand(1);
+        Value *Op2 = I->getOperand(2);
+        Value *Cond = Builder.CreateVectorSplat(
+            cast<VectorType>(Op1->getType())->getElementCount(),
+            I->getOperand(0), "select.cond.splat");
+        Operands = {Cond, Op1, Op2};
+      }
+      break;
+    }
+    default:
+      break;
+    }
+
+    if (!Mask)
+      // nullptr means an unmasked operation, hence we use an all-ones mask.
+      Mask = ConstantInt::getTrue(RetTy->getWithNewType(Builder.getInt1Ty()));
+
+    VectorBuilder VecBuilder(Builder);
+    VecBuilder.setMask(Mask).setEVL(EVL);
+    Value *NewVPOp =
+        VecBuilder.createVectorInstruction(Opcode, RetTy, Operands, "vp.op");
+
+    Transforms++; // Stats
+    OldInstructionsToRemove.insert(std::make_pair(I, NewVPOp));
+  }
+}
+
+// Remove the old instructions, if possible.
+void VectorPredicationPass::removeOldInstructions() {
+  for (auto [I, NewVPOp] : OldInstructionsToRemove) {
+    I->replaceAllUsesWith(NewVPOp);
+    if (isInstructionTriviallyDead(I))
+      I->eraseFromParent();
+  }
+}
+
+PreservedAnalyses VectorPredicationPass::run(Function &F,
+                                             FunctionAnalysisManager &AM) {
+  assert(OldInstructionsToRemove.empty() &&
+         "Map should be cleared at the end of each run of the pass.");
+
+  for (BasicBlock &BB : F) {
+    BlockData BBInfo;
+
+    analyseBasicBlock(BB, BBInfo);
+    findCandidateVectorOperations(BB, BBInfo);
+    addNewUsersToMasksAndEVLs(BB, BBInfo);
+    buildNewBasicBlockSchedule(BB, BBInfo);
+    emitNewBasicBlockSchedule(BB, BBInfo);
+    transformCandidateVectorOperations(BB, BBInfo);
+  }
+
+  removeOldInstructions();
+  OldInstructionsToRemove.clear();
+
+  // TODO: think about which analyses are preserved.
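+  // Instructions have been reordered and replaced with VP intrinsics, so be
+  // conservative and invalidate all analyses for now.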
+ return PreservedAnalyses::none(); +} diff --git a/llvm/test/Transforms/VectorPredication/if-elif-else.ll b/llvm/test/Transforms/VectorPredication/if-elif-else.ll new file mode 100644 index 000000000000..761d3bfe9d0b --- /dev/null +++ b/llvm/test/Transforms/VectorPredication/if-elif-else.ll @@ -0,0 +1,270 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S --passes=vector-predication -o - < %s | FileCheck %s + +; ModuleID = 'custom/if-elif-else.c' +source_filename = "custom/if-elif-else.c" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-unknown" + +; Input C code: +; void addVec(long N, double *C, double *A, double *B) { +; long I; +; for (I = 0; I < N; I++) { +; if (N < 50) +; C[I] = A[I] + B[I]; +; else if (N > 75) +; C[I] = A[I] * B[I]; +; else +; C[I] = 2 * A[I]; +; } +; } + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) +define dso_local void @addVec(i64 noundef %N, ptr nocapture noundef writeonly %C, ptr nocapture noundef readonly %A, ptr nocapture noundef readonly %B) local_unnamed_addr #0 { +; CHECK-LABEL: @addVec( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP30:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP30]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i64 [[N]], 50 +; CHECK-NEXT: [[CMP4:%.*]] = icmp ugt i64 [[N]], 75 +; CHECK-NEXT: [[TMP0:%.*]] = xor i64 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 10) +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[N]], 3 +; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[UGLYGEP32:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[UGLYGEP33:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt ptr [[UGLYGEP32]], [[C]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt ptr [[UGLYGEP]], [[A]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND034:%.*]] = icmp ugt ptr [[UGLYGEP33]], [[C]] +; CHECK-NEXT: [[BOUND135:%.*]] = icmp ugt ptr [[UGLYGEP]], [[B]] +; CHECK-NEXT: [[FOUND_CONFLICT36:%.*]] = and i1 [[BOUND034]], [[BOUND135]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT36]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT37:%.*]] = insertelement poison, i1 [[CMP1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT38:%.*]] = shufflevector [[BROADCAST_SPLATINSERT37]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT39:%.*]] = insertelement poison, i1 [[CMP4]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT40:%.*]] = shufflevector [[BROADCAST_SPLATINSERT39]], poison, zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = xor [[BROADCAST_SPLAT38]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP6:%.*]] = select [[TMP5]], [[BROADCAST_SPLAT40]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = select [[BROADCAST_SPLAT38]], shufflevector ( insertelement ( poison, i1 true, i64 0), 
poison, zeroinitializer), [[BROADCAST_SPLAT40]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL_PHI:%.*]] = phi i64 [ [[N]], [[VECTOR_PH]] ], [ [[EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = and i64 [[EVL_PHI]], 4294967295 +; CHECK-NEXT: [[VL:%.*]] = call i64 @llvm.riscv.vsetvli.i64(i64 [[TMP8]], i64 3, i64 0) +; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[VL]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP10]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]), !tbaa [[TBAA4:![0-9]+]], !alias.scope !8 +; CHECK-NEXT: [[VP_OP1:%.*]] = call @llvm.vp.fmul.nxv1f64( [[VP_LOAD]], shufflevector ( insertelement ( poison, double 2.000000e+00, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD41:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP11]], [[TMP6]], i32 [[TMP9]]), !tbaa [[TBAA4]], !alias.scope !11 +; CHECK-NEXT: [[VP_OP3:%.*]] = call @llvm.vp.fmul.nxv1f64( [[VP_LOAD]], [[VP_LOAD41]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]) +; CHECK-NEXT: [[VP_LOAD42:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP11]], [[BROADCAST_SPLAT38]], i32 [[TMP9]]), !tbaa [[TBAA4]], !alias.scope !11 +; CHECK-NEXT: [[VP_OP2:%.*]] = call @llvm.vp.fadd.nxv1f64( [[VP_LOAD]], [[VP_LOAD42]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]) +; CHECK-NEXT: [[VP_OP:%.*]] = call @llvm.vp.select.nxv1f64( [[TMP7]], [[VP_OP2]], [[VP_OP1]], i32 [[TMP9]]) +; CHECK-NEXT: [[VP_OP4:%.*]] = call @llvm.vp.select.nxv1f64( [[TMP6]], [[VP_OP3]], [[VP_OP]], i32 [[TMP9]]) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.vp.store.nxv1f64.p0( [[VP_OP4]], ptr [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP9]]), !tbaa [[TBAA4]], !alias.scope !13, !noalias !15 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VL]] +; CHECK-NEXT: [[EVL_NEXT]] = sub i64 [[N]], [[INDEX_NEXT]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP13]], label [[FOR_END_LOOPEXIT44:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK: for.body: +; CHECK-NEXT: [[I_031:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_031]] +; CHECK-NEXT: [[TMP14:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_031]] +; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP14]], [[TMP15]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: if.else: +; CHECK-NEXT: br i1 [[CMP4]], label [[IF_THEN5:%.*]], label [[IF_ELSE9:%.*]] +; CHECK: if.then5: +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_031]] +; 
CHECK-NEXT: [[TMP16:%.*]] = load double, ptr [[ARRAYIDX7]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP14]], [[TMP16]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: if.else9: +; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP14]], 2.000000e+00 +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[ADD_SINK:%.*]] = phi double [ [[ADD]], [[IF_THEN]] ], [ [[MUL11]], [[IF_ELSE9]] ], [ [[MUL]], [[IF_THEN5]] ] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[I_031]] +; CHECK-NEXT: store double [[ADD_SINK]], ptr [[ARRAYIDX3]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_031]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end.loopexit44: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %cmp30 = icmp sgt i64 %N, 0 + br i1 %cmp30, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + %cmp1 = icmp ult i64 %N, 50 + %cmp4 = icmp ugt i64 %N, 75 + %0 = xor i64 %N, -1 + %1 = call i64 @llvm.vscale.i64() + %2 = call i64 @llvm.umax.i64(i64 %1, i64 10) + %3 = icmp ugt i64 %2, %0 + br i1 %3, label %for.body.preheader, label %vector.memcheck + +for.body.preheader: ; preds = %vector.memcheck, %for.body.lr.ph + br label %for.body + +vector.memcheck: ; preds = %for.body.lr.ph + %4 = shl i64 %N, 3 + %uglygep = getelementptr i8, ptr %C, i64 %4 + %uglygep32 = getelementptr i8, ptr %A, i64 %4 + %uglygep33 = getelementptr i8, ptr %B, i64 %4 + %bound0 = icmp ugt ptr %uglygep32, %C + %bound1 = icmp ugt ptr %uglygep, %A + %found.conflict = and i1 %bound0, %bound1 + %bound034 = icmp ugt ptr %uglygep33, %C + %bound135 = icmp ugt ptr %uglygep, %B + %found.conflict36 = and i1 %bound034, %bound135 + %conflict.rdx = or i1 %found.conflict, %found.conflict36 + br i1 %conflict.rdx, label %for.body.preheader, label %vector.ph + +vector.ph: ; preds = %vector.memcheck + %broadcast.splatinsert37 = insertelement poison, i1 %cmp1, i64 0 + %broadcast.splat38 = shufflevector %broadcast.splatinsert37, poison, zeroinitializer + %broadcast.splatinsert39 = insertelement poison, i1 %cmp4, i64 0 + %broadcast.splat40 = shufflevector %broadcast.splatinsert39, poison, zeroinitializer + %5 = xor %broadcast.splat38, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) + %6 = select %5, %broadcast.splat40, zeroinitializer + %7 = select %broadcast.splat38, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), %broadcast.splat40 + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %evl.phi = phi i64 [ %N, %vector.ph ], [ %evl.next, %vector.body ] + %8 = and i64 %evl.phi, 4294967295 + %vl = call i64 @llvm.riscv.vsetvli.i64(i64 %8, i64 3, i64 0) + %9 = trunc i64 %vl to i32 + %10 = getelementptr inbounds double, ptr %A, i64 %index + %vp.load = call @llvm.vp.load.nxv1f64.p0(ptr %10, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %9), !tbaa !4, !alias.scope !8 + %11 = fmul %vp.load, shufflevector ( insertelement ( poison, double 2.000000e+00, i64 0), poison, zeroinitializer) + %12 = getelementptr double, ptr %B, i64 %index + %vp.load41 = call @llvm.vp.load.nxv1f64.p0(ptr %12, %6, i32 %9), 
!tbaa !4, !alias.scope !11 + %13 = fmul %vp.load, %vp.load41 + %vp.load42 = call @llvm.vp.load.nxv1f64.p0(ptr %12, %broadcast.splat38, i32 %9), !tbaa !4, !alias.scope !11 + %14 = fadd %vp.load, %vp.load42 + %predphi = select %7, %14, %11 + %predphi43 = select %6, %13, %predphi + %15 = getelementptr inbounds double, ptr %C, i64 %index + call void @llvm.vp.store.nxv1f64.p0( %predphi43, ptr %15, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %9), !tbaa !4, !alias.scope !13, !noalias !15 + %index.next = add i64 %index, %vl + %evl.next = sub i64 %N, %index.next + %16 = icmp eq i64 %index.next, %N + br i1 %16, label %for.end.loopexit44, label %vector.body, !llvm.loop !16 + +for.body: ; preds = %for.body.preheader, %for.inc + %I.031 = phi i64 [ %inc, %for.inc ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds double, ptr %A, i64 %I.031 + %17 = load double, ptr %arrayidx, align 8, !tbaa !4 + br i1 %cmp1, label %if.then, label %if.else + +if.then: ; preds = %for.body + %arrayidx2 = getelementptr inbounds double, ptr %B, i64 %I.031 + %18 = load double, ptr %arrayidx2, align 8, !tbaa !4 + %add = fadd double %17, %18 + br label %for.inc + +if.else: ; preds = %for.body + br i1 %cmp4, label %if.then5, label %if.else9 + +if.then5: ; preds = %if.else + %arrayidx7 = getelementptr inbounds double, ptr %B, i64 %I.031 + %19 = load double, ptr %arrayidx7, align 8, !tbaa !4 + %mul = fmul double %17, %19 + br label %for.inc + +if.else9: ; preds = %if.else + %mul11 = fmul double %17, 2.000000e+00 + br label %for.inc + +for.inc: ; preds = %if.then, %if.else9, %if.then5 + %add.sink = phi double [ %add, %if.then ], [ %mul11, %if.else9 ], [ %mul, %if.then5 ] + %arrayidx3 = getelementptr inbounds double, ptr %C, i64 %I.031 + store double %add.sink, ptr %arrayidx3, align 8, !tbaa !4 + %inc = add nuw nsw i64 %I.031, 1 + %exitcond.not = icmp eq i64 %inc, %N + br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !20 + +for.end.loopexit: ; preds = %for.inc + br label %for.end + +for.end.loopexit44: ; preds = %vector.body + br label %for.end + +for.end: ; preds = %for.end.loopexit44, %for.end.loopexit, %entry + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare i64 @llvm.vscale.i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.umax.i64(i64, i64) #2 + +; Function Attrs: nounwind memory(none) +declare i64 @llvm.riscv.vsetvli.i64(i64, i64 immarg, i64 immarg) #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.experimental.stepvector.nxv1i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare @llvm.vp.load.nxv1f64.p0(ptr nocapture, , i32) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.vp.store.nxv1f64.p0(, ptr nocapture, , i32) #5 + +attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" 
"target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-zawrs,-experimental-zca,-experimental-zcb,-experimental-zcd,-experimental-zcf,-experimental-zihintntl,-experimental-ztso,-experimental-zvfh,-h,-save-restore,-svinval,-svnapot,-svpbmt,-xtheadba,-xtheadvdot,-xventanacondops,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zdinx,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zicbom,-zicbop,-zicboz,-zihintpause,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zmmul,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { nounwind memory(none) } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"target-abi", !"lp64d"} +!2 = !{i32 8, !"SmallDataLimit", i32 8} +!3 = !{!"clang version 17.0.0"} +!4 = !{!5, !5, i64 0} +!5 = !{!"double", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C/C++ TBAA"} +!8 = !{!9} +!9 = distinct !{!9, !10} +!10 = distinct !{!10, !"LVerDomain"} +!11 = !{!12} +!12 = distinct !{!12, !10} +!13 = !{!14} +!14 = distinct !{!14, !10} +!15 = !{!9, !12} +!16 = distinct !{!16, !17, !18, !19} +!17 = !{!"llvm.loop.mustprogress"} +!18 = !{!"llvm.loop.isvectorized", i32 1} +!19 = !{!"llvm.loop.unroll.runtime.disable"} +!20 = distinct !{!20, !17, !18} diff --git a/llvm/test/Transforms/VectorPredication/if-else_scalar-cond.ll b/llvm/test/Transforms/VectorPredication/if-else_scalar-cond.ll new file mode 100644 index 000000000000..ed8f28feeffc --- /dev/null +++ b/llvm/test/Transforms/VectorPredication/if-else_scalar-cond.ll @@ -0,0 +1,209 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S --passes=vector-predication -o - < %s | FileCheck %s + +; ModuleID = 'custom/if-else2.c' +source_filename = "custom/if-else2.c" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-unknown" + +; Input C code: +; void addVec(long N, double *C, double *A, double *B) { +; long I; +; for (I = 0; I < N; I++) { +; if (N < 50) +; C[I] = A[I] + B[I]; +; else +; C[I] = A[I] * B[I]; +; } +; } + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) +define dso_local void @addVec(i64 noundef %N, ptr nocapture noundef writeonly %C, ptr nocapture noundef readonly %A, ptr nocapture noundef readonly %B) local_unnamed_addr #0 { +; CHECK-LABEL: @addVec( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[B22:%.*]] = ptrtoint ptr [[B:%.*]] to i64 +; CHECK-NEXT: [[A21:%.*]] = ptrtoint ptr [[A:%.*]] to i64 +; CHECK-NEXT: [[C20:%.*]] = ptrtoint ptr [[C:%.*]] to i64 +; CHECK-NEXT: [[CMP18:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP18]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i64 [[N]], 50 +; CHECK-NEXT: [[TMP0:%.*]] = xor i64 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 8) +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-NEXT: br i1 [[TMP3]], label 
[[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[C20]], [[A21]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[C20]], [[B22]] +; CHECK-NEXT: [[DIFF_CHECK23:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK23]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY_PREHEADER:%.*]] +; CHECK: vector.body.preheader: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[VECTOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[EVL_PHI:%.*]] = phi i64 [ [[EVL_NEXT:%.*]], [[VECTOR_BODY]] ], [ [[N]], [[VECTOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[TMP9:%.*]] = and i64 [[EVL_PHI]], 4294967295 +; CHECK-NEXT: [[VL:%.*]] = call i64 @llvm.riscv.vsetvli.i64(i64 [[TMP9]], i64 3, i64 0) +; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[VL]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]), !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD24:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]), !tbaa [[TBAA4]] +; CHECK-NEXT: [[VP_OP:%.*]] = call @llvm.vp.fadd.nxv1f64( [[VP_LOAD]], [[VP_LOAD24]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; CHECK-NEXT: [[VP_OP2:%.*]] = call @llvm.vp.fmul.nxv1f64( [[VP_LOAD]], [[VP_LOAD24]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; CHECK-NEXT: [[SELECT_COND_SPLAT_SPLATINSERT:%.*]] = insertelement poison, i1 [[CMP1]], i64 0 +; CHECK-NEXT: [[SELECT_COND_SPLAT_SPLAT:%.*]] = shufflevector [[SELECT_COND_SPLAT_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[VP_OP1:%.*]] = call @llvm.vp.select.nxv1f64( [[SELECT_COND_SPLAT_SPLAT]], [[VP_OP]], [[VP_OP2]], i32 [[TMP10]]) +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.vp.store.nxv1f64.p0( [[VP_OP1]], ptr [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VL]] +; CHECK-NEXT: [[EVL_NEXT]] = sub i64 [[N]], [[INDEX_NEXT]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP14]], label [[FOR_END_LOOPEXIT25:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: for.body: +; CHECK-NEXT: [[I_019:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_019]] +; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_019]] +; CHECK-NEXT: 
[[TMP16:%.*]] = load double, ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[MUL_SINK:%.*]] = select i1 [[CMP1]], double [[ADD]], double [[MUL]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[I_019]] +; CHECK-NEXT: store double [[MUL_SINK]], ptr [[TMP17]], align 8 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_019]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end.loopexit25: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %B22 = ptrtoint ptr %B to i64 + %A21 = ptrtoint ptr %A to i64 + %C20 = ptrtoint ptr %C to i64 + %cmp18 = icmp sgt i64 %N, 0 + br i1 %cmp18, label %for.body.lr.ph, label %for.end + +for.body.lr.ph: ; preds = %entry + %cmp1 = icmp ult i64 %N, 50 + %0 = xor i64 %N, -1 + %1 = call i64 @llvm.vscale.i64() + %2 = call i64 @llvm.umax.i64(i64 %1, i64 8) + %3 = icmp ugt i64 %2, %0 + br i1 %3, label %for.body.preheader, label %vector.memcheck + +for.body.preheader: ; preds = %vector.memcheck, %for.body.lr.ph + br label %for.body + +vector.memcheck: ; preds = %for.body.lr.ph + %4 = call i64 @llvm.vscale.i64() + %5 = shl nuw nsw i64 %4, 3 + %6 = sub i64 %C20, %A21 + %diff.check = icmp ult i64 %6, %5 + %7 = shl nuw nsw i64 %4, 3 + %8 = sub i64 %C20, %B22 + %diff.check23 = icmp ult i64 %8, %7 + %conflict.rdx = or i1 %diff.check, %diff.check23 + br i1 %conflict.rdx, label %for.body.preheader, label %vector.body.preheader + +vector.body.preheader: ; preds = %vector.memcheck + br label %vector.body + +vector.body: ; preds = %vector.body.preheader, %vector.body + %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.body.preheader ] + %evl.phi = phi i64 [ %evl.next, %vector.body ], [ %N, %vector.body.preheader ] + %9 = and i64 %evl.phi, 4294967295 + %vl = call i64 @llvm.riscv.vsetvli.i64(i64 %9, i64 3, i64 0) + %10 = trunc i64 %vl to i32 + %11 = getelementptr inbounds double, ptr %A, i64 %index + %vp.load = call @llvm.vp.load.nxv1f64.p0(ptr %11, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %10), !tbaa !4 + %12 = getelementptr inbounds double, ptr %B, i64 %index + %vp.load24 = call @llvm.vp.load.nxv1f64.p0(ptr %12, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %10), !tbaa !4 + %13 = fadd %vp.load, %vp.load24 + %14 = fmul %vp.load, %vp.load24 + %15 = select i1 %cmp1, %13, %14 + %16 = getelementptr inbounds double, ptr %C, i64 %index + call void @llvm.vp.store.nxv1f64.p0( %15, ptr %16, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %10) + %index.next = add i64 %index, %vl + %evl.next = sub i64 %N, %index.next + %17 = icmp eq i64 %index.next, %N + br i1 %17, label %for.end.loopexit25, label %vector.body, !llvm.loop !8 + +for.body: ; preds = %for.body.preheader, %for.body + %I.019 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %arrayidx = getelementptr inbounds double, ptr %A, i64 %I.019 + %18 = load double, ptr %arrayidx, align 8, !tbaa !4 + %arrayidx2 = getelementptr inbounds double, ptr %B, i64 %I.019 + %19 = load double, ptr %arrayidx2, align 8, !tbaa !4 + %add = fadd double %18, %19 + %mul = fmul double %18, %19 + %mul.sink 
= select i1 %cmp1, double %add, double %mul + %20 = getelementptr inbounds double, ptr %C, i64 %I.019 + store double %mul.sink, ptr %20, align 8 + %inc = add nuw nsw i64 %I.019, 1 + %exitcond.not = icmp eq i64 %inc, %N + br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !12 + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end.loopexit25: ; preds = %vector.body + br label %for.end + +for.end: ; preds = %for.end.loopexit25, %for.end.loopexit, %entry + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare i64 @llvm.vscale.i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.umax.i64(i64, i64) #2 + +; Function Attrs: nounwind memory(none) +declare i64 @llvm.riscv.vsetvli.i64(i64, i64 immarg, i64 immarg) #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.experimental.stepvector.nxv1i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare @llvm.vp.load.nxv1f64.p0(ptr nocapture, , i32) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.vp.store.nxv1f64.p0(, ptr nocapture, , i32) #5 + +attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-zawrs,-experimental-zca,-experimental-zcb,-experimental-zcd,-experimental-zcf,-experimental-zihintntl,-experimental-ztso,-experimental-zvfh,-h,-save-restore,-svinval,-svnapot,-svpbmt,-xtheadba,-xtheadvdot,-xventanacondops,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zdinx,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zicbom,-zicbop,-zicboz,-zihintpause,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zmmul,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { nounwind memory(none) } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"target-abi", !"lp64d"} +!2 = !{i32 8, !"SmallDataLimit", i32 8} +!3 = !{!"clang version 17.0.0"} +!4 = !{!5, !5, i64 0} +!5 = !{!"double", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C/C++ TBAA"} +!8 = distinct !{!8, !9, !10, !11} +!9 = !{!"llvm.loop.mustprogress"} +!10 = !{!"llvm.loop.isvectorized", i32 1} +!11 = !{!"llvm.loop.unroll.runtime.disable"} +!12 = distinct !{!12, !9, !10} diff --git a/llvm/test/Transforms/VectorPredication/if-else_vec-cond.ll b/llvm/test/Transforms/VectorPredication/if-else_vec-cond.ll new file mode 100644 index 000000000000..9c25aec38fdb --- /dev/null +++ b/llvm/test/Transforms/VectorPredication/if-else_vec-cond.ll @@ -0,0 +1,219 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S --passes=vector-predication -o - < %s | FileCheck %s + +; ModuleID = 'custom/if-else1.c' +source_filename = "custom/if-else1.c" +target datalayout 
= "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-unknown" + +; Input C code: +; void addVec(long N, double *C, double *A, double *B) { +; long I; +; for (I = 0; I < N; I++) { +; if (I < 50) +; C[I] = A[I] + B[I]; +; else +; C[I] = A[I] * B[I]; +; } +; } + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) +define dso_local void @addVec(i64 noundef %N, ptr nocapture noundef writeonly %C, ptr nocapture noundef readonly %A, ptr nocapture noundef readonly %B) local_unnamed_addr #0 { +; CHECK-LABEL: @addVec( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[B22:%.*]] = ptrtoint ptr [[B:%.*]] to i64 +; CHECK-NEXT: [[A21:%.*]] = ptrtoint ptr [[A:%.*]] to i64 +; CHECK-NEXT: [[C20:%.*]] = ptrtoint ptr [[C:%.*]] to i64 +; CHECK-NEXT: [[CMP18:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP18]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = xor i64 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 8) +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY_PREHEADER25:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: for.body.preheader25: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[C20]], [[A21]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[C20]], [[B22]] +; CHECK-NEXT: [[DIFF_CHECK23:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK23]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER25]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.experimental.stepvector.nxv1i64() +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[EVL_PHI:%.*]] = phi i64 [ [[N]], [[VECTOR_PH]] ], [ [[EVL_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = and i64 [[EVL_PHI]], 4294967295 +; CHECK-NEXT: [[VL:%.*]] = call i64 @llvm.riscv.vsetvli.i64(i64 [[TMP10]], i64 3, i64 0) +; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[VL]] to i32 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[VL]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[VP_OP1:%.*]] = call @llvm.vp.icmp.nxv1i64( [[VEC_IND]], shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer), metadata !"ult", shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]), !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD24:%.*]] = call 
@llvm.vp.load.nxv1f64.p0(ptr [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]), !tbaa [[TBAA4]] +; CHECK-NEXT: [[VP_OP:%.*]] = call @llvm.vp.fadd.nxv1f64( [[VP_LOAD]], [[VP_LOAD24]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; CHECK-NEXT: [[VP_OP3:%.*]] = call @llvm.vp.fmul.nxv1f64( [[VP_LOAD]], [[VP_LOAD24]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; CHECK-NEXT: [[VP_OP2:%.*]] = call @llvm.vp.select.nxv1f64( [[VP_OP1]], [[VP_OP]], [[VP_OP3]], i32 [[TMP11]]) +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.vp.store.nxv1f64.p0( [[VP_OP2]], ptr [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP11]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VL]] +; CHECK-NEXT: [[EVL_NEXT]] = sub i64 [[N]], [[INDEX_NEXT]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP15]], label [[FOR_END_LOOPEXIT26:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: for.body: +; CHECK-NEXT: [[I_019:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER25]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i64 [[I_019]], 50 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_019]] +; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_019]] +; CHECK-NEXT: [[TMP17:%.*]] = load double, ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[MUL_SINK:%.*]] = select i1 [[CMP1]], double [[ADD]], double [[MUL]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[I_019]] +; CHECK-NEXT: store double [[MUL_SINK]], ptr [[TMP18]], align 8 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_019]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end.loopexit26: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %B22 = ptrtoint ptr %B to i64 + %A21 = ptrtoint ptr %A to i64 + %C20 = ptrtoint ptr %C to i64 + %cmp18 = icmp sgt i64 %N, 0 + br i1 %cmp18, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %0 = xor i64 %N, -1 + %1 = call i64 @llvm.vscale.i64() + %2 = call i64 @llvm.umax.i64(i64 %1, i64 8) + %3 = icmp ugt i64 %2, %0 + br i1 %3, label %for.body.preheader25, label %vector.memcheck + +for.body.preheader25: ; preds = %vector.memcheck, %for.body.preheader + br label %for.body + +vector.memcheck: ; preds = %for.body.preheader + %4 = call i64 @llvm.vscale.i64() + %5 = shl nuw nsw i64 %4, 3 + %6 = sub i64 %C20, %A21 + %diff.check = icmp ult i64 %6, %5 + %7 = shl nuw nsw i64 %4, 3 + %8 = sub i64 %C20, %B22 + %diff.check23 = icmp ult i64 %8, %7 + %conflict.rdx = or i1 %diff.check, %diff.check23 + br i1 %conflict.rdx, label %for.body.preheader25, label %vector.ph + +vector.ph: ; preds = %vector.memcheck + %9 = call 
@llvm.experimental.stepvector.nxv1i64() + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %evl.phi = phi i64 [ %N, %vector.ph ], [ %evl.next, %vector.body ] + %vec.ind = phi [ %9, %vector.ph ], [ %vec.ind.next, %vector.body ] + %10 = and i64 %evl.phi, 4294967295 + %vl = call i64 @llvm.riscv.vsetvli.i64(i64 %10, i64 3, i64 0) + %11 = trunc i64 %vl to i32 + %.splatinsert = insertelement poison, i64 %vl, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %12 = icmp ult %vec.ind, shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer) + %13 = getelementptr inbounds double, ptr %A, i64 %index + %vp.load = call @llvm.vp.load.nxv1f64.p0(ptr %13, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %11), !tbaa !4 + %14 = getelementptr inbounds double, ptr %B, i64 %index + %vp.load24 = call @llvm.vp.load.nxv1f64.p0(ptr %14, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %11), !tbaa !4 + %15 = fadd %vp.load, %vp.load24 + %16 = fmul %vp.load, %vp.load24 + %17 = select %12, %15, %16 + %18 = getelementptr inbounds double, ptr %C, i64 %index + call void @llvm.vp.store.nxv1f64.p0( %17, ptr %18, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %11) + %index.next = add i64 %index, %vl + %evl.next = sub i64 %N, %index.next + %vec.ind.next = add %vec.ind, %.splat + %19 = icmp eq i64 %index.next, %N + br i1 %19, label %for.end.loopexit26, label %vector.body, !llvm.loop !8 + +for.body: ; preds = %for.body.preheader25, %for.body + %I.019 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader25 ] + %cmp1 = icmp ult i64 %I.019, 50 + %arrayidx = getelementptr inbounds double, ptr %A, i64 %I.019 + %20 = load double, ptr %arrayidx, align 8, !tbaa !4 + %arrayidx2 = getelementptr inbounds double, ptr %B, i64 %I.019 + %21 = load double, ptr %arrayidx2, align 8, !tbaa !4 + %add = fadd double %20, %21 + %mul = fmul double %20, %21 + %mul.sink = select i1 %cmp1, double %add, double %mul + %22 = getelementptr inbounds double, ptr %C, i64 %I.019 + store double %mul.sink, ptr %22, align 8 + %inc = add nuw nsw i64 %I.019, 1 + %exitcond.not = icmp eq i64 %inc, %N + br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !12 + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end.loopexit26: ; preds = %vector.body + br label %for.end + +for.end: ; preds = %for.end.loopexit26, %for.end.loopexit, %entry + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare i64 @llvm.vscale.i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.umax.i64(i64, i64) #2 + +; Function Attrs: nounwind memory(none) +declare i64 @llvm.riscv.vsetvli.i64(i64, i64 immarg, i64 immarg) #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.experimental.stepvector.nxv1i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare @llvm.vp.load.nxv1f64.p0(ptr nocapture, , i32) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.vp.store.nxv1f64.p0(, ptr nocapture, , i32) #5 + +attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-zawrs,-experimental-zca,-experimental-zcb,-experimental-zcd,-experimental-zcf,-experimental-zihintntl,-experimental-ztso,-experimental-zvfh,-h,-save-restore,-svinval,-svnapot,-svpbmt,-xtheadba,-xtheadvdot,-xventanacondops,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zdinx,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zicbom,-zicbop,-zicboz,-zihintpause,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zmmul,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { nounwind memory(none) } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"target-abi", !"lp64d"} +!2 = !{i32 8, !"SmallDataLimit", i32 8} +!3 = !{!"clang version 17.0.0"} +!4 = !{!5, !5, i64 0} +!5 = !{!"double", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C/C++ TBAA"} +!8 = distinct !{!8, !9, !10, !11} +!9 = !{!"llvm.loop.mustprogress"} +!10 = !{!"llvm.loop.isvectorized", i32 1} +!11 = !{!"llvm.loop.unroll.runtime.disable"} +!12 = distinct !{!12, !9, !10} diff --git a/llvm/test/Transforms/VectorPredication/simple_vector_sum.ll b/llvm/test/Transforms/VectorPredication/simple_vector_sum.ll new file mode 100644 index 000000000000..116d883572ee --- /dev/null +++ b/llvm/test/Transforms/VectorPredication/simple_vector_sum.ll @@ -0,0 +1,193 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S --passes=vector-predication -o - < %s | FileCheck %s + +; ModuleID = 'custom/simple.c' +source_filename = "custom/simple.c" +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-unknown" + +; Input C code: +; void addVec(long N, double *C, double *A, double *B) { +; long I; +; for (I = 0; I < N; I++) +; C[I] = A[I] + B[I]; +; } + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) +define dso_local void @addVec(i64 noundef %N, ptr nocapture noundef writeonly %C, ptr nocapture noundef readonly %A, ptr nocapture noundef readonly %B) local_unnamed_addr #0 { +; CHECK-LABEL: @addVec( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[B11:%.*]] = ptrtoint ptr [[B:%.*]] to i64 +; CHECK-NEXT: [[A10:%.*]] = ptrtoint ptr [[A:%.*]] to i64 +; CHECK-NEXT: [[C9:%.*]] = ptrtoint ptr [[C:%.*]] to i64 +; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = xor i64 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 10) +; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[TMP2]], [[TMP0]] +; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_BODY_PREHEADER14:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; CHECK: for.body.preheader14: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: 
[[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[C9]], [[A10]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[C9]], [[B11]] +; CHECK-NEXT: [[DIFF_CHECK12:%.*]] = icmp ult i64 [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK12]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER14]], label [[VECTOR_BODY_PREHEADER:%.*]] +; CHECK: vector.body.preheader: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[VECTOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[EVL_PHI:%.*]] = phi i64 [ [[EVL_NEXT:%.*]], [[VECTOR_BODY]] ], [ [[N]], [[VECTOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[TMP9:%.*]] = and i64 [[EVL_PHI]], 4294967295 +; CHECK-NEXT: [[VL:%.*]] = call i64 @llvm.riscv.vsetvli.i64(i64 [[TMP9]], i64 3, i64 0) +; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[VL]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]), !tbaa [[TBAA4:![0-9]+]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[VP_LOAD13:%.*]] = call @llvm.vp.load.nxv1f64.p0(ptr [[TMP12]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]), !tbaa [[TBAA4]] +; CHECK-NEXT: [[VP_OP:%.*]] = call @llvm.vp.fadd.nxv1f64( [[VP_LOAD]], [[VP_LOAD13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]) +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.vp.store.nxv1f64.p0( [[VP_OP]], ptr [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP10]]), !tbaa [[TBAA4]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[VL]] +; CHECK-NEXT: [[EVL_NEXT]] = sub i64 [[N]], [[INDEX_NEXT]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[TMP14]], label [[FOR_END_LOOPEXIT15:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER14]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_08]] +; CHECK-NEXT: [[TMP15:%.*]] = load double, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[I_08]] +; CHECK-NEXT: [[TMP16:%.*]] = load double, ptr [[ARRAYIDX1]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[I_08]] +; CHECK-NEXT: store double [[ADD]], ptr [[ARRAYIDX2]], align 8, !tbaa [[TBAA4]] +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end.loopexit15: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %B11 = ptrtoint 
ptr %B to i64 + %A10 = ptrtoint ptr %A to i64 + %C9 = ptrtoint ptr %C to i64 + %cmp7 = icmp sgt i64 %N, 0 + br i1 %cmp7, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %0 = xor i64 %N, -1 + %1 = call i64 @llvm.vscale.i64() + %2 = call i64 @llvm.umax.i64(i64 %1, i64 10) + %3 = icmp ugt i64 %2, %0 + br i1 %3, label %for.body.preheader14, label %vector.memcheck + +for.body.preheader14: ; preds = %vector.memcheck, %for.body.preheader + br label %for.body + +vector.memcheck: ; preds = %for.body.preheader + %4 = call i64 @llvm.vscale.i64() + %5 = shl nuw nsw i64 %4, 3 + %6 = sub i64 %C9, %A10 + %diff.check = icmp ult i64 %6, %5 + %7 = shl nuw nsw i64 %4, 3 + %8 = sub i64 %C9, %B11 + %diff.check12 = icmp ult i64 %8, %7 + %conflict.rdx = or i1 %diff.check, %diff.check12 + br i1 %conflict.rdx, label %for.body.preheader14, label %vector.body.preheader + +vector.body.preheader: ; preds = %vector.memcheck + br label %vector.body + +vector.body: ; preds = %vector.body.preheader, %vector.body + %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.body.preheader ] + %evl.phi = phi i64 [ %evl.next, %vector.body ], [ %N, %vector.body.preheader ] + %9 = and i64 %evl.phi, 4294967295 + %vl = call i64 @llvm.riscv.vsetvli.i64(i64 %9, i64 3, i64 0) + %10 = trunc i64 %vl to i32 + %11 = getelementptr inbounds double, ptr %A, i64 %index + %vp.load = call @llvm.vp.load.nxv1f64.p0(ptr %11, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %10), !tbaa !4 + %12 = getelementptr inbounds double, ptr %B, i64 %index + %vp.load13 = call @llvm.vp.load.nxv1f64.p0(ptr %12, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %10), !tbaa !4 + %13 = fadd %vp.load, %vp.load13 + %14 = getelementptr inbounds double, ptr %C, i64 %index + call void @llvm.vp.store.nxv1f64.p0( %13, ptr %14, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 %10), !tbaa !4 + %index.next = add i64 %index, %vl + %evl.next = sub i64 %N, %index.next + %15 = icmp eq i64 %index.next, %N + br i1 %15, label %for.end.loopexit15, label %vector.body, !llvm.loop !8 + +for.body: ; preds = %for.body.preheader14, %for.body + %I.08 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader14 ] + %arrayidx = getelementptr inbounds double, ptr %A, i64 %I.08 + %16 = load double, ptr %arrayidx, align 8, !tbaa !4 + %arrayidx1 = getelementptr inbounds double, ptr %B, i64 %I.08 + %17 = load double, ptr %arrayidx1, align 8, !tbaa !4 + %add = fadd double %16, %17 + %arrayidx2 = getelementptr inbounds double, ptr %C, i64 %I.08 + store double %add, ptr %arrayidx2, align 8, !tbaa !4 + %inc = add nuw nsw i64 %I.08, 1 + %exitcond.not = icmp eq i64 %inc, %N + br i1 %exitcond.not, label %for.end.loopexit, label %for.body, !llvm.loop !12 + +for.end.loopexit: ; preds = %for.body + br label %for.end + +for.end.loopexit15: ; preds = %vector.body + br label %for.end + +for.end: ; preds = %for.end.loopexit15, %for.end.loopexit, %entry + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare i64 @llvm.vscale.i64() #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare i64 @llvm.umax.i64(i64, i64) #2 + +; Function Attrs: nounwind memory(none) +declare i64 @llvm.riscv.vsetvli.i64(i64, i64 immarg, i64 immarg) #3 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none) +declare @llvm.experimental.stepvector.nxv1i64() #1 + +; Function 
Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read) +declare @llvm.vp.load.nxv1f64.p0(ptr nocapture, , i32) #4 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write) +declare void @llvm.vp.store.nxv1f64.p0(, ptr nocapture, , i32) #5 + +attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) vscale_range(2,1024) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-zawrs,-experimental-zca,-experimental-zcb,-experimental-zcd,-experimental-zcf,-experimental-zihintntl,-experimental-ztso,-experimental-zvfh,-h,-save-restore,-svinval,-svnapot,-svpbmt,-xtheadba,-xtheadvdot,-xventanacondops,-zba,-zbb,-zbc,-zbkb,-zbkc,-zbkx,-zbs,-zdinx,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zicbom,-zicbop,-zicboz,-zihintpause,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-zmmul,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +attributes #3 = { nounwind memory(none) } +attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } +attributes #5 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } + +!llvm.module.flags = !{!0, !1, !2} +!llvm.ident = !{!3} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, !"target-abi", !"lp64d"} +!2 = !{i32 8, !"SmallDataLimit", i32 8} +!3 = !{!"clang version 17.0.0"} +!4 = !{!5, !5, i64 0} +!5 = !{!"double", !6, i64 0} +!6 = !{!"omnipotent char", !7, i64 0} +!7 = !{!"Simple C/C++ TBAA"} +!8 = distinct !{!8, !9, !10, !11} +!9 = !{!"llvm.loop.mustprogress"} +!10 = !{!"llvm.loop.isvectorized", i32 1} +!11 = !{!"llvm.loop.unroll.runtime.disable"} +!12 = distinct !{!12, !9, !10} -- GitLab