Skip to content
  1. Oct 28, 2006
  2. Oct 26, 2006
  3. Oct 25, 2006
  4. Oct 24, 2006
  5. Oct 23, 2006
  6. Oct 22, 2006
  7. Oct 20, 2006
  8. Oct 19, 2006
  9. Oct 17, 2006
  10. Oct 16, 2006
  11. Oct 12, 2006
  12. Oct 09, 2006
  13. Oct 05, 2006
    • Chris Lattner's avatar
      add a new SimplifyDemandedVectorElts method, which works similarly to · 2deeaeac
      Chris Lattner authored
      SimplifyDemandedBits.  The idea is that some operations can be simplified if
      not all of the computed elements are needed.  Some targets (like x86) have a
      large number of intrinsics that operate on a single element, but pass other
      elts through unmodified.  If those other elements are not needed, the
      intrinsics can be simplified to scalar operations, and insertelement ops can
      be removed.
      
      This turns (e.g.):
      
      ushort %Convert_sse(float %f) {
              %tmp = insertelement <4 x float> undef, float %f, uint 0                ; <<4 x float>> [#uses=1]
              %tmp10 = insertelement <4 x float> %tmp, float 0.000000e+00, uint 1             ; <<4 x float>> [#uses=1]
              %tmp11 = insertelement <4 x float> %tmp10, float 0.000000e+00, uint 2           ; <<4 x float>> [#uses=1]
              %tmp12 = insertelement <4 x float> %tmp11, float 0.000000e+00, uint 3           ; <<4 x float>> [#uses=1]
              %tmp28 = tail call <4 x float> %llvm.x86.sse.sub.ss( <4 x float> %tmp12, <4 x float> < float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > )               ; <<4 x float>> [#uses=1]
              %tmp37 = tail call <4 x float> %llvm.x86.sse.mul.ss( <4 x float> %tmp28, <4 x float> < float 5.000000e-01, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > )               ; <<4 x float>> [#uses=1]
              %tmp48 = tail call <4 x float> %llvm.x86.sse.min.ss( <4 x float> %tmp37, <4 x float> < float 6.553500e+04, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00 > )               ; <<4 x float>> [#uses=1]
              %tmp59 = tail call <4 x float> %llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> zeroinitializer )          ; <<4 x float>> [#uses=1]
              %tmp = tail call int %llvm.x86.sse.cvttss2si( <4 x float> %tmp59 )              ; <int> [#uses=1]
              %tmp69 = cast int %tmp to ushort                ; <ushort> [#uses=1]
              ret ushort %tmp69
      }
      
      into:
      
      ushort %Convert_sse(float %f) {
      entry:
              %tmp28 = sub float %f, 1.000000e+00             ; <float> [#uses=1]
              %tmp37 = mul float %tmp28, 5.000000e-01         ; <float> [#uses=1]
              %tmp375 = insertelement <4 x float> undef, float %tmp37, uint 0         ; <<4 x float>> [#uses=1]
              %tmp48 = tail call <4 x float> %llvm.x86.sse.min.ss( <4 x float> %tmp375, <4 x float> < float 6.553500e+04, float undef, float undef, float undef > )           ; <<4 x float>> [#uses=1]
              %tmp59 = tail call <4 x float> %llvm.x86.sse.max.ss( <4 x float> %tmp48, <4 x float> < float 0.000000e+00, float undef, float undef, float undef > )            ; <<4 x float>> [#uses=1]
              %tmp = tail call int %llvm.x86.sse.cvttss2si( <4 x float> %tmp59 )              ; <int> [#uses=1]
              %tmp69 = cast int %tmp to ushort                ; <ushort> [#uses=1]
              ret ushort %tmp69
      }
      
      which improves codegen from:
      
      _Convert_sse:
              movss LCPI1_0, %xmm0
              movss 4(%esp), %xmm1
              subss %xmm0, %xmm1
              movss LCPI1_1, %xmm0
              mulss %xmm0, %xmm1
              movss LCPI1_2, %xmm0
              minss %xmm0, %xmm1
              xorps %xmm0, %xmm0
              maxss %xmm0, %xmm1
              cvttss2si %xmm1, %eax
              andl $65535, %eax
              ret
      
      to:
      
      _Convert_sse:
              movss 4(%esp), %xmm0
              subss LCPI1_0, %xmm0
              mulss LCPI1_1, %xmm0
              movss LCPI1_2, %xmm1
              minss %xmm1, %xmm0
              xorps %xmm1, %xmm1
              maxss %xmm1, %xmm0
              cvttss2si %xmm0, %eax
              andl $65535, %eax
              ret
      
      
      This is just a first step; it can be extended in many ways.  Testcase here:
      Transforms/InstCombine/vec_demanded_elts.ll
      
      llvm-svn: 30752
      2deeaeac
  14. Oct 04, 2006
  15. Oct 03, 2006
  16. Oct 01, 2006
Loading