diff --git a/llvm/lib/Transforms/Vectorize/BBVectorize.cpp b/llvm/lib/Transforms/Vectorize/BBVectorize.cpp index 4a2c8781957c6db84f5457791c9d52c6df93e463..f9e9b3fbc4e783490c186087fd7ea31b9fcfc3aa 100644 --- a/llvm/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/BBVectorize.cpp @@ -642,7 +642,7 @@ namespace { Function *F = I->getCalledFunction(); if (!F) return false; - unsigned IID = F->getIntrinsicID(); + Intrinsic::ID IID = (Intrinsic::ID) F->getIntrinsicID(); if (!IID) return false; switch(IID) { @@ -1020,14 +1020,67 @@ namespace { // vectorized, the second arguments must be equal. CallInst *CI = dyn_cast(I); Function *FI; - if (CI && (FI = CI->getCalledFunction()) && - FI->getIntrinsicID() == Intrinsic::powi) { - - Value *A1I = CI->getArgOperand(1), - *A1J = cast(J)->getArgOperand(1); - const SCEV *A1ISCEV = SE->getSCEV(A1I), - *A1JSCEV = SE->getSCEV(A1J); - return (A1ISCEV == A1JSCEV); + if (CI && (FI = CI->getCalledFunction())) { + Intrinsic::ID IID = (Intrinsic::ID) FI->getIntrinsicID(); + if (IID == Intrinsic::powi) { + Value *A1I = CI->getArgOperand(1), + *A1J = cast(J)->getArgOperand(1); + const SCEV *A1ISCEV = SE->getSCEV(A1I), + *A1JSCEV = SE->getSCEV(A1J); + return (A1ISCEV == A1JSCEV); + } + + if (IID && VTTI) { + SmallVector Tys; + for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) + Tys.push_back(CI->getArgOperand(i)->getType()); + unsigned ICost = VTTI->getIntrinsicInstrCost(IID, IT1, Tys); + + Tys.clear(); + CallInst *CJ = cast(J); + for (unsigned i = 0, ie = CJ->getNumArgOperands(); i != ie; ++i) + Tys.push_back(CJ->getArgOperand(i)->getType()); + unsigned JCost = VTTI->getIntrinsicInstrCost(IID, JT1, Tys); + + Tys.clear(); + assert(CI->getNumArgOperands() == CJ->getNumArgOperands() && + "Intrinsic argument counts differ"); + for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { + if (IID == Intrinsic::powi && i == 1) + Tys.push_back(CI->getArgOperand(i)->getType()); + else + Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(), + CJ->getArgOperand(i)->getType())); + } + + Type *RetTy = getVecTypeForPair(IT1, JT1); + unsigned VCost = VTTI->getIntrinsicInstrCost(IID, RetTy, Tys); + + if (VCost > ICost + JCost) + return false; + + // We don't want to fuse to a type that will be split, even + // if the two input types will also be split and there is no other + // associated cost. + unsigned RetParts = VTTI->getNumberOfParts(RetTy); + if (RetParts > 1) + return false; + else if (!RetParts && VCost == ICost + JCost) + return false; + + for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { + if (!Tys[i]->isVectorTy()) + continue; + + unsigned NumParts = VTTI->getNumberOfParts(Tys[i]); + if (NumParts > 1) + return false; + else if (!NumParts && VCost == ICost + JCost) + return false; + } + + CostSavings = ICost + JCost - VCost; + } } return true; @@ -2551,7 +2604,7 @@ namespace { continue; } else if (isa(I)) { Function *F = cast(I)->getCalledFunction(); - unsigned IID = F->getIntrinsicID(); + Intrinsic::ID IID = (Intrinsic::ID) F->getIntrinsicID(); if (o == NumOperands-1) { BasicBlock &BB = *I->getParent(); @@ -2560,8 +2613,7 @@ namespace { Type *ArgTypeJ = J->getType(); Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); - ReplacedOperands[o] = Intrinsic::getDeclaration(M, - (Intrinsic::ID) IID, VArgType); + ReplacedOperands[o] = Intrinsic::getDeclaration(M, IID, VArgType); continue; } else if (IID == Intrinsic::powi && o == 1) { // The second argument of powi is a single integer and we've already diff --git a/llvm/test/Transforms/BBVectorize/X86/simple-int.ll b/llvm/test/Transforms/BBVectorize/X86/simple-int.ll new file mode 100644 index 0000000000000000000000000000000000000000..f5dbe46b1480b71c52be7678f56118faff7da1b1 --- /dev/null +++ b/llvm/test/Transforms/BBVectorize/X86/simple-int.ll @@ -0,0 +1,79 @@ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s + +declare double @llvm.fma.f64(double, double, double) +declare double @llvm.fmuladd.f64(double, double, double) +declare double @llvm.cos.f64(double) +declare double @llvm.powi.f64(double, i32) + +; Basic depth-3 chain with fma +define double @test1(double %A1, double %A2, double %B1, double %B2, double %C1, double %C2) { + %X1 = fsub double %A1, %B1 + %X2 = fsub double %A2, %B2 + %Y1 = call double @llvm.fma.f64(double %X1, double %A1, double %C1) + %Y2 = call double @llvm.fma.f64(double %X2, double %A2, double %C2) + %Z1 = fadd double %Y1, %B1 + %Z2 = fadd double %Y2, %B2 + %R = fmul double %Z1, %Z2 + ret double %R +; CHECK: @test1 +; CHECK: ret double %R +} + +; Basic depth-3 chain with fmuladd +define double @test1a(double %A1, double %A2, double %B1, double %B2, double %C1, double %C2) { + %X1 = fsub double %A1, %B1 + %X2 = fsub double %A2, %B2 + %Y1 = call double @llvm.fmuladd.f64(double %X1, double %A1, double %C1) + %Y2 = call double @llvm.fmuladd.f64(double %X2, double %A2, double %C2) + %Z1 = fadd double %Y1, %B1 + %Z2 = fadd double %Y2, %B2 + %R = fmul double %Z1, %Z2 + ret double %R +; CHECK: @test1a +; CHECK: ret double %R +} + +; Basic depth-3 chain with cos +define double @test2(double %A1, double %A2, double %B1, double %B2) { + %X1 = fsub double %A1, %B1 + %X2 = fsub double %A2, %B2 + %Y1 = call double @llvm.cos.f64(double %X1) + %Y2 = call double @llvm.cos.f64(double %X2) + %Z1 = fadd double %Y1, %B1 + %Z2 = fadd double %Y2, %B2 + %R = fmul double %Z1, %Z2 + ret double %R +; CHECK: @test2 +; CHECK: ret double %R +} + +; Basic depth-3 chain with powi +define double @test3(double %A1, double %A2, double %B1, double %B2, i32 %P) { + %X1 = fsub double %A1, %B1 + %X2 = fsub double %A2, %B2 + %Y1 = call double @llvm.powi.f64(double %X1, i32 %P) + %Y2 = call double @llvm.powi.f64(double %X2, i32 %P) + %Z1 = fadd double %Y1, %B1 + %Z2 = fadd double %Y2, %B2 + %R = fmul double %Z1, %Z2 + ret double %R +; CHECK: @test3 +; CHECK: ret double %R +} + +; Basic depth-3 chain with powi (different powers: should not vectorize) +define double @test4(double %A1, double %A2, double %B1, double %B2, i32 %P) { + %X1 = fsub double %A1, %B1 + %X2 = fsub double %A2, %B2 + %P2 = add i32 %P, 1 + %Y1 = call double @llvm.powi.f64(double %X1, i32 %P) + %Y2 = call double @llvm.powi.f64(double %X2, i32 %P2) + %Z1 = fadd double %Y1, %B1 + %Z2 = fadd double %Y2, %B2 + %R = fmul double %Z1, %Z2 + ret double %R +; CHECK: @test4 +; CHECK: ret double %R +} +