diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt index 4d3dedf0e5f1e1202271e08dcc34c63c3db3b9ae..22285f1932347666e2e8ec5b82ad5b2226e44004 100644 --- a/llvm/lib/Target/X86/CMakeLists.txt +++ b/llvm/lib/Target/X86/CMakeLists.txt @@ -15,6 +15,7 @@ tablegen(X86GenCallingConv.inc -gen-callingconv) tablegen(X86GenSubtarget.inc -gen-subtarget) set(sources + SSEDomainFix.cpp X86AsmBackend.cpp X86CodeEmitter.cpp X86COFFMachineModuleInfo.cpp diff --git a/llvm/lib/Target/X86/SSEDomainFix.cpp b/llvm/lib/Target/X86/SSEDomainFix.cpp new file mode 100644 index 0000000000000000000000000000000000000000..261b40c5abccbfd68a8ace4a45524bee013c1ccc --- /dev/null +++ b/llvm/lib/Target/X86/SSEDomainFix.cpp @@ -0,0 +1,98 @@ +//===- SSEDomainFix.cpp - Use proper int/float domain for SSE ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the SSEDomainFix pass. +// +// Some SSE instructions like mov, and, or, xor are available in different +// variants for different operand types. These variant instructions are +// equivalent, but on Nehalem and newer CPUs there is extra latency +// when transferring data between integer and floating point domains. +// +// This pass changes the variant instructions to minimize domain crossings. 
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "sse-domain-fix" +#include "X86InstrInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { +class SSEDomainFixPass : public MachineFunctionPass { + static char ID; // Pass identification, passed by address. + const X86InstrInfo *TII; // Set at the start of runOnMachineFunction. + + MachineFunction *MF; // Function currently being processed. + MachineBasicBlock *MBB; // Current block; see enterBasicBlock. +public: + SSEDomainFixPass() : MachineFunctionPass(&ID) {} + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + virtual bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const { + return "SSE execution domain fixup"; + } + +private: + void enterBasicBlock(MachineBasicBlock *MBB); // Make MBB the current block. +}; +} + +char SSEDomainFixPass::ID = 0; + +void SSEDomainFixPass::enterBasicBlock(MachineBasicBlock *mbb) { + MBB = mbb; + DEBUG(dbgs() << "Entering MBB " << MBB->getName() << "\n"); +} + +bool SSEDomainFixPass::runOnMachineFunction(MachineFunction &mf) { + MF = &mf; + TII = static_cast<const X86InstrInfo*>(MF->getTarget().getInstrInfo()); + + // If no XMM registers are used in the function, we can skip it completely. 
+ bool XMMIsUsed = false; + for (TargetRegisterClass::const_iterator I = X86::VR128RegClass.begin(), + E = X86::VR128RegClass.end(); I != E; ++I) + if (MF->getRegInfo().isPhysRegUsed(*I)) { + XMMIsUsed = true; + break; + } + if (!XMMIsUsed) return false; + + MachineBasicBlock *Entry = MF->begin(); + SmallPtrSet<MachineBasicBlock*, 16> Visited; + for (df_ext_iterator<MachineBasicBlock*, SmallPtrSet<MachineBasicBlock*, 16> > + DFI = df_ext_begin(Entry, Visited), DFE = df_ext_end(Entry, Visited); + DFI != DFE; ++DFI) { + enterBasicBlock(*DFI); + for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; + ++I) { // Dump each MI tagged -/i/s/d (none/int/single/double). + MachineInstr *MI = I; + const unsigned *equiv = 0; // Null unless MI has variants ('*' below). + X86InstrInfo::SSEDomain domain = TII->GetSSEDomain(MI, equiv); + DEBUG(dbgs() << "-isd"[domain] << (equiv ? "* " : " ") << *MI); + } + } + return false; // The instructions are only inspected, never changed. +} + +FunctionPass *llvm::createSSEDomainFixPass() { + return new SSEDomainFixPass(); +} diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index c753cf2a530f98c0c3e91fc01301b7986375777f..9be38a4b56a988d98b1377106cb1c3d66a0fdfd7 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -41,6 +41,10 @@ FunctionPass *createX86ISelDag(X86TargetMachine &TM, /// FunctionPass *createX86FloatingPointStackifierPass(); +/// createSSEDomainFixPass - This pass twiddles SSE opcodes to prevent domain +/// crossings. +FunctionPass *createSSEDomainFixPass(); + /// createX87FPRegKillInserterPass - This function returns a pass which /// inserts FP_REG_KILL instructions where needed. 
/// diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 2be51e1a13669493423c31920fa7c6662d2807ed..5788e2a71f647e626682bf37baa682d87cad87b6 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -164,6 +164,7 @@ def X86InstrInfo : InstrInfo { "FPFormBits", "hasLockPrefix", "SegOvrBits", + "DomainBits", "Opcode"]; let TSFlagsShifts = [0, 6, @@ -174,6 +175,7 @@ def X86InstrInfo : InstrInfo { 16, 19, 20, + 22, 24]; } diff --git a/llvm/lib/Target/X86/X86InstrFormats.td b/llvm/lib/Target/X86/X86InstrFormats.td index c06b81b10a19f65941952ce5e4c6e3dd1a9050fe..a811638c39ef2f5d8f4a1844d6cec82302705655 100644 --- a/llvm/lib/Target/X86/X86InstrFormats.td +++ b/llvm/lib/Target/X86/X86InstrFormats.td @@ -68,6 +68,16 @@ def CompareFP : FPFormat<5>; def CondMovFP : FPFormat<6>; def SpecialFP : FPFormat<7>; +// Domain - SSE execution domain of an instruction, used by the SSEDomainFix +// pass. Its Value becomes DomainBits; GenericDomain (0) is the default. +class Domain val> { + bits<2> Value = val; +} +def GenericDomain : Domain<0>; +def SSEPackedInt : Domain<1>; +def SSEPackedSingle : Domain<2>; +def SSEPackedDouble : Domain<3>; + // Prefix byte classes which are used to indicate to the ad-hoc machine code // emitter that various prefix bytes are required. class OpSize { bit hasOpSizePrefix = 1; } @@ -93,7 +103,7 @@ class TA { bits<4> Prefix = 14; } class TF { bits<4> Prefix = 15; } class X86Inst opcod, Format f, ImmType i, dag outs, dag ins, - string AsmStr> + string AsmStr, Domain d = GenericDomain> : Instruction { let Namespace = "X86"; @@ -119,16 +129,19 @@ class X86Inst opcod, Format f, ImmType i, dag outs, dag ins, bits<3> FPFormBits = 0; bit hasLockPrefix = 0; // Does this inst have a 0xF0 prefix? bits<2> SegOvrBits = 0; // Segment override prefix. 
+ Domain Dom = d; + bits<2> DomainBits = Dom.Value; } -class I o, Format f, dag outs, dag ins, string asm, list pattern> - : X86Inst { +class I o, Format f, dag outs, dag ins, string asm, + list pattern, Domain d = GenericDomain> + : X86Inst { let Pattern = pattern; let CodeSize = 3; } class Ii8 o, Format f, dag outs, dag ins, string asm, - list pattern> - : X86Inst { + list pattern, Domain d = GenericDomain> + : X86Inst { let Pattern = pattern; let CodeSize = 3; } @@ -196,14 +209,16 @@ class Iseg32 o, Format f, dag outs, dag ins, string asm, class SSI o, Format F, dag outs, dag ins, string asm, list pattern> : I, XS, Requires<[HasSSE1]>; -class SSIi8 o, Format F, dag outs, dag ins, string asm, +class SSIi8 o, Format F, dag outs, dag ins, string asm, list pattern> : Ii8, XS, Requires<[HasSSE1]>; class PSI o, Format F, dag outs, dag ins, string asm, list pattern> - : I, TB, Requires<[HasSSE1]>; + : I, TB, + Requires<[HasSSE1]>; class PSIi8 o, Format F, dag outs, dag ins, string asm, list pattern> - : Ii8, TB, Requires<[HasSSE1]>; + : Ii8, TB, + Requires<[HasSSE1]>; // SSE2 Instruction Templates: // @@ -222,10 +237,12 @@ class SSDIi8 o, Format F, dag outs, dag ins, string asm, list pattern> : Ii8, XS, Requires<[HasSSE2]>; class PDI o, Format F, dag outs, dag ins, string asm, list pattern> - : I, TB, OpSize, Requires<[HasSSE2]>; + : I, TB, OpSize, + Requires<[HasSSE2]>; class PDIi8 o, Format F, dag outs, dag ins, string asm, list pattern> - : Ii8, TB, OpSize, Requires<[HasSSE2]>; + : Ii8, TB, OpSize, + Requires<[HasSSE2]>; // SSE3 Instruction Templates: // @@ -235,12 +252,15 @@ class PDIi8 o, Format F, dag outs, dag ins, string asm, class S3SI o, Format F, dag outs, dag ins, string asm, list pattern> - : I, XS, Requires<[HasSSE3]>; + : I, XS, + Requires<[HasSSE3]>; class S3DI o, Format F, dag outs, dag ins, string asm, list pattern> - : I, XD, Requires<[HasSSE3]>; + : I, XD, + Requires<[HasSSE3]>; class S3I o, Format F, dag outs, dag ins, string asm, list pattern> - : 
I, TB, OpSize, Requires<[HasSSE3]>; + : I, TB, OpSize, + Requires<[HasSSE3]>; // SSSE3 Instruction Templates: @@ -254,10 +274,12 @@ class S3I o, Format F, dag outs, dag ins, string asm, list pattern> class SS38I o, Format F, dag outs, dag ins, string asm, list pattern> - : Ii8, T8, Requires<[HasSSSE3]>; + : Ii8, T8, + Requires<[HasSSSE3]>; class SS3AI o, Format F, dag outs, dag ins, string asm, list pattern> - : Ii8, TA, Requires<[HasSSSE3]>; + : Ii8, TA, + Requires<[HasSSSE3]>; // SSE4.1 Instruction Templates: // @@ -266,17 +288,20 @@ class SS3AI o, Format F, dag outs, dag ins, string asm, // class SS48I o, Format F, dag outs, dag ins, string asm, list pattern> - : I, T8, Requires<[HasSSE41]>; + : I, T8, + Requires<[HasSSE41]>; class SS4AIi8 o, Format F, dag outs, dag ins, string asm, list pattern> - : Ii8, TA, Requires<[HasSSE41]>; + : Ii8, TA, + Requires<[HasSSE41]>; // SSE4.2 Instruction Templates: // // SS428I - SSE 4.2 instructions with T8 prefix. class SS428I o, Format F, dag outs, dag ins, string asm, list pattern> - : I, T8, Requires<[HasSSE42]>; + : I, T8, + Requires<[HasSSE42]>; // SS42FI - SSE 4.2 instructions with TF prefix. class SS42FI o, Format F, dag outs, dag ins, string asm, @@ -286,7 +311,8 @@ class SS42FI o, Format F, dag outs, dag ins, string asm, // SS42AI = SSE 4.2 instructions with TA prefix class SS42AI o, Format F, dag outs, dag ins, string asm, list pattern> - : Ii8, TA, Requires<[HasSSE42]>; + : Ii8, TA, + Requires<[HasSSE42]>; // X86-64 Instruction templates... 
// diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 2323f5790ab0f6d1534600b828e9a3bf4bcdeaaf..eeb020ba3b73ab89fffba3ccddc357b2171c90e2 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -3658,3 +3658,43 @@ unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const { X86FI->setGlobalBaseReg(GlobalBaseReg); return GlobalBaseReg; } + +X86InstrInfo::SSEDomain X86InstrInfo::GetSSEDomain(const MachineInstr *MI, + const unsigned *&equiv) const { + // These are the replaceable SSE instructions. Some of these have Int variants + // that we don't include here. We don't want to replace instructions selected + // by intrinsics. The 64-bit unpacks (PUNPCK[HL]QDQ, UNPCK[HL]PD) are left out + // because UNPCK[HL]PS interleaves 32-bit elements and is not equivalent. + static const unsigned ReplaceableInstrs[][3] = { + //PackedInt PackedSingle PackedDouble + { X86::MOVDQAmr, X86::MOVAPSmr, X86::MOVAPDmr }, + { X86::MOVDQArm, X86::MOVAPSrm, X86::MOVAPDrm }, + { X86::MOVDQArr, X86::MOVAPSrr, X86::MOVAPDrr }, + { X86::MOVDQUmr, X86::MOVUPSmr, X86::MOVUPDmr }, + { X86::MOVDQUrm, X86::MOVUPSrm, X86::MOVUPDrm }, + { X86::MOVNTDQmr, X86::MOVNTPSmr, X86::MOVNTPDmr }, + { X86::PANDNrm, X86::ANDNPSrm, X86::ANDNPDrm }, + { X86::PANDNrr, X86::ANDNPSrr, X86::ANDNPDrr }, + { X86::PANDrm, X86::ANDPSrm, X86::ANDPDrm }, + { X86::PANDrr, X86::ANDPSrr, X86::ANDPDrr }, + { X86::PORrm, X86::ORPSrm, X86::ORPDrm }, + { X86::PORrr, X86::ORPSrr, X86::ORPDrr }, + { X86::PXORrm, X86::XORPSrm, X86::XORPDrm }, + { X86::PXORrr, X86::XORPSrr, X86::XORPDrr }, + }; + + equiv = 0; // Defined on every path, including the non-SSE early return. + const SSEDomain domain = + SSEDomain((MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3); + if (domain == NotSSEDomain) + return domain; + + // Linear search; table columns follow SSEDomain order, hence domain-1. 
+ const unsigned opc = MI->getOpcode(); + for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i) + if (ReplaceableInstrs[i][domain-1] == opc) { + equiv = ReplaceableInstrs[i]; + return domain; + } + return domain; +} diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index 5111719a2094949934d874f2a34cf71b53dd57cc..965740dcaf0602906853199f476aed22870522e6 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -398,7 +398,10 @@ namespace X86II { FS = 1 << SegOvrShift, GS = 2 << SegOvrShift, - // Bits 22 -> 23 are unused + // Execution domain for SSE instructions in bits 22, 23. + // 0 in bits 22-23 means normal, non-SSE instruction. See SSEDomain below. + SSEDomainShift = 22, + OpcodeShift = 24, OpcodeMask = 0xFF << OpcodeShift }; @@ -486,7 +489,7 @@ class X86InstrInfo : public TargetInstrInfoImpl { /// MemOp2RegOpTable - Load / store unfolding opcode map. /// DenseMap > MemOp2RegOpTable; - + public: explicit X86InstrInfo(X86TargetMachine &tm); @@ -716,6 +719,15 @@ public: /// unsigned getGlobalBaseReg(MachineFunction *MF) const; + /// Some SSE instructions come in variants for three domains. + enum SSEDomain { NotSSEDomain, PackedInt, PackedSingle, PackedDouble }; + + /// GetSSEDomain - Return the SSE execution domain of MI, or NotSSEDomain for + /// unknown instructions. If the instruction has equivalents for other + /// domains, equiv points to a list of opcodes for [PackedInt, PackedSingle, + /// PackedDouble]; otherwise equiv is set to null. 
+ SSEDomain GetSSEDomain(const MachineInstr *MI, const unsigned *&equiv) const; + private: MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc, MachineFunction::iterator &MFI, diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index f13e6f35256466e2a1747a3f2079b0ddd038a883..06a481de258dd71dd0df3434a6ab70b56183e1b8 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -17,11 +17,17 @@ #include "llvm/PassManager.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Target/TargetRegistry.h" using namespace llvm; +static cl::opt<bool> +SSEDomainFix("sse-domain-fix", + cl::desc("Enable fixing of SSE execution domain"), + cl::init(false), cl::Hidden); + static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) { Triple TheTriple(TT); switch (TheTriple.getOS()) { @@ -169,6 +175,15 @@ bool X86TargetMachine::addPostRegAlloc(PassManagerBase &PM, return true; // -print-machineinstr should print after this. 
} +bool X86TargetMachine::addPreEmitPass(PassManagerBase &PM, + CodeGenOpt::Level OptLevel) { + if (SSEDomainFix && OptLevel != CodeGenOpt::None && Subtarget.hasSSE2()) { + PM.add(createSSEDomainFixPass()); + return true; // The SSE domain fix pass was added. + } + return false; // Flag off, no optimization, or no SSE2: nothing added. +} + bool X86TargetMachine::addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel, JITCodeEmitter &JCE) { diff --git a/llvm/lib/Target/X86/X86TargetMachine.h b/llvm/lib/Target/X86/X86TargetMachine.h index 2bb54544d408b8ee2d88aaa525c58f103fe35767..ae7b5b29af1456948882bc2cbe41e3bcb1590641 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.h +++ b/llvm/lib/Target/X86/X86TargetMachine.h @@ -66,6 +66,7 @@ public: virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel); virtual bool addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel); virtual bool addPostRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel); + virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel); virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel, JITCodeEmitter &JCE); };